 # Data cleaning and preprocessing:

Pandas is a powerful Python library for data manipulation and analysis. It provides a wide range of tools for data cleaning and preprocessing, which are crucial steps in preparing data for machine learning models.

**Handling Missing Values:**
     
Pandas provides functions to identify and handle missing values.

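**isnull() and notnull():** These functions check for missing values in a DataFrame or Series. `isnull()` returns `True` where a value is missing; `notnull()` returns `True` where a value is present.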

**dropna():** This function removes rows or columns containing missing values based on specified criteria.

**fillna():** This function replaces missing values with specified values, such as the mean, median, or a constant value.

In [8]:
import pandas as pd

# Small demo frame; the None entries are the missing values.
data = {'Age': [21, None, 23, None], 'Salary': [250000, 56000, None, 98000]}
df = pd.DataFrame(data)

# notnull() returns a boolean mask that is True where a value is PRESENT and
# False where it is missing, so the result is named for what it actually holds.
notmissing_values = df.notnull()
print(notmissing_values)

     Age  Salary
0   True    True
1  False    True
2   True   False
3  False    True


 **With .csv file**

In [9]:
import pandas as pd

# Read aug_test.csv from the current working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='aug_test.csv')

# A bare trailing expression lets Jupyter render the frame as a rich table.
df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.920,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,27385,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,27724,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,1289,city_103,0.920,Male,No relevent experience,no_enrollment,Graduate,Humanities,16,,Public Sector,4,15
2125,195,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,18,,,2,30
2126,31762,city_100,0.887,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,18
2127,7873,city_102,0.804,Male,Has relevent experience,Full time course,High School,,7,100-500,Public Sector,1,84


In [4]:
# notnull() returns a boolean frame: True where a value is present, False where
# it is missing (isnull() gives the inverse mask).
df = pd.read_csv('aug_test.csv')
notmissing_values = df.notnull()
notmissing_values

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,True,True,True,True,True,True,True,True,True,True,False,True,True
1,True,True,True,True,True,True,True,True,True,False,True,True,True
2,True,True,True,True,True,True,True,False,True,False,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2124,True,True,True,True,True,True,True,True,True,False,True,True,True
2125,True,True,True,True,True,True,True,True,True,False,False,True,True
2126,True,True,True,True,True,True,True,False,True,False,True,True,True
2127,True,True,True,True,True,True,True,False,True,True,True,True,True


In [8]:
# Count the missing values in each column: isnull() flags every missing cell,
# and summing the flags column-wise yields one total per column.
df = pd.read_csv('aug_test.csv')
null_mask = df.isnull()
missing_values = null_mask.sum()
missing_values

enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64

In [9]:
# Count the values that are present in each column: notnull() flags every
# non-missing cell and the column-wise sum (axis=0) totals them.
df = pd.read_csv('aug_test.csv')
notmissing_values = df.notnull().sum(axis=0)
notmissing_values

enrollee_id               2129
city                      2129
city_development_index    2129
gender                    1621
relevent_experience       2129
enrolled_university       2098
education_level           2077
major_discipline          1817
experience                2124
company_size              1507
company_type              1495
last_new_job              2089
training_hours            2129
dtype: int64

**dropna**

In [18]:
# axis='index': drop every row that contains at least one missing value.
df = pd.read_csv('aug_test.csv')
print("======= before  =========")
print(df.shape)
print(df.isnull().sum())

# Reassign the result instead of using inplace=True, so the step is explicit
# and could be chained with further operations.
df = df.dropna(axis='index')

print("======= After  =========")
print(df.shape)           # shows how many rows survived the drop
print(df.isnull().sum())  # every per-column count should now be 0

(2129, 13)
enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64
enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64


In [16]:
# axis='columns': drop every column that contains at least one missing value.
df = pd.read_csv('aug_test.csv')
print("======= before  =========")
print(df.shape)  # original (rows, columns)
print(df.isnull().sum())
print("======= After  =========")
# Reassign the result instead of using inplace=True, so the step is explicit
# and could be chained with further operations.
df = df.dropna(axis='columns')
print(df.shape)
print(df.isnull().sum())

(2129, 13)
enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64
(2129, 5)
enrollee_id               0
city                      0
city_development_index    0
relevent_experience       0
training_hours            0
dtype: int64


**how='any' (the default): If any value in the row/column is missing, drop the entire row/column.**

In [5]:
import pandas as pd

df = pd.read_csv('aug_test.csv')
print("======= BEFORE ========")
print(df.shape)
print(df.isnull().sum())

# how='any' drops a column as soon as it contains a single missing value.
# It is dropna's default, spelled out here for clarity. The result is
# reassigned rather than using inplace=True so the step stays explicit.
df = df.dropna(axis='columns', how='any')

print("\n======= AFTER ========")
print(df.shape)
print(df.isnull().sum())

(2129, 13)
enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64

(2129, 5)
enrollee_id               0
city                      0
city_development_index    0
relevent_experience       0
training_hours            0
dtype: int64


**how='all': If all values in the row/column are missing, drop the entire row/column.**

In [7]:
import pandas as pd

df = pd.read_csv('aug_test.csv')
print("======= BEFORE =======")
print(df.shape)
print(df.isnull().sum())
print("\n======= AFTER =======")

# how='all' drops a column only when every value in it is missing; columns
# that are merely partially empty are kept. The result is reassigned rather
# than using inplace=True so the step stays explicit.
df = df.dropna(axis='columns', how='all')

print(df.shape)
print(df.isnull().sum())


(2129, 13)
enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64

(2129, 13)
enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64
