In [1]:
import pandas as pd
import numpy as np

In [4]:
df=pd.read_csv('/content/adult.csv')

In [5]:
print(df)

       age     workclass  fnlwgt     education  educational-num  \
0       25       Private  226802          11th                7   
1       38       Private   89814       HS-grad                9   
2       28     Local-gov  336951    Assoc-acdm               12   
3       44       Private  160323  Some-college               10   
4       18             ?  103497  Some-college               10   
...    ...           ...     ...           ...              ...   
48837   27       Private  257302    Assoc-acdm               12   
48838   40       Private  154374       HS-grad                9   
48839   58       Private  151910       HS-grad                9   
48840   22       Private  201490       HS-grad                9   
48841   52  Self-emp-inc  287927       HS-grad                9   

           marital-status         occupation relationship   race  gender  \
0           Never-married  Machine-op-inspct    Own-child  Black    Male   
1      Married-civ-spouse    Farming-fishin

In [6]:
df.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

imputed_data = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)

print("\nImputed DataFrame:")
print(df_imputed)


Imputed DataFrame:
      age     workclass  fnlwgt     education educational-num  \
0      25       Private  226802          11th               7   
1      38       Private   89814       HS-grad               9   
2      28     Local-gov  336951    Assoc-acdm              12   
3      44       Private  160323  Some-college              10   
4      18             ?  103497  Some-college              10   
...    ..           ...     ...           ...             ...   
48837  27       Private  257302    Assoc-acdm              12   
48838  40       Private  154374       HS-grad               9   
48839  58       Private  151910       HS-grad               9   
48840  22       Private  201490       HS-grad               9   
48841  52  Self-emp-inc  287927       HS-grad               9   

           marital-status         occupation relationship   race  gender  \
0           Never-married  Machine-op-inspct    Own-child  Black    Male   
1      Married-civ-spouse    Farming-fishing   

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [9]:
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

Numeric Columns: ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical Columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


In [10]:
df_cleaned = df.copy()

for col in numeric_cols:
    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_cleaned[col] = np.where(df_cleaned[col] < lower_bound, lower_bound, df_cleaned[col])
    df_cleaned[col] = np.where(df_cleaned[col] > upper_bound, upper_bound, df_cleaned[col])

print("Outliers handled in numeric columns. Displaying first 5 rows of the cleaned DataFrame:")
display(df_cleaned.head())

Outliers handled in numeric columns. Displaying first 5 rows of the cleaned DataFrame:


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.0,0.0,40.0,United-States,>50K
4,18.0,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,32.5,United-States,<=50K


In [11]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df_cleaned.copy()

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

print("Categorical columns have been label encoded. Displaying first 5 rows of the encoded DataFrame:")
display(df_encoded.head())

Categorical columns have been label encoded. Displaying first 5 rows of the encoded DataFrame:


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,4,226802.0,1,7.0,4,7,3,2,1,0.0,0.0,40.0,39,0
1,38.0,4,89814.0,11,9.0,2,5,0,4,1,0.0,0.0,50.0,39,0
2,28.0,2,336951.0,7,12.0,2,11,0,4,1,0.0,0.0,40.0,39,1
3,44.0,4,160323.0,15,10.0,2,7,0,2,1,0.0,0.0,40.0,39,1
4,18.0,0,103497.0,15,10.0,4,0,3,4,0,0.0,0.0,32.5,39,0


standardscaler


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for col in numeric_cols:
    df[col] = scaler.fit_transform(df[[col]])

In [14]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995129,Private,0.351675,11th,-1.197259,Never-married,Machine-op-inspct,Own-child,Black,Male,-0.144804,-0.217127,-0.034087,United-States,<=50K
1,-0.046942,Private,-0.945524,HS-grad,-0.419335,Married-civ-spouse,Farming-fishing,Husband,White,Male,-0.144804,-0.217127,0.77293,United-States,<=50K
2,-0.776316,Local-gov,1.394723,Assoc-acdm,0.74755,Married-civ-spouse,Protective-serv,Husband,White,Male,-0.144804,-0.217127,-0.034087,United-States,>50K
3,0.390683,Private,-0.277844,Some-college,-0.030373,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.886874,-0.217127,-0.034087,United-States,>50K
4,-1.505691,?,-0.815954,Some-college,-0.030373,Never-married,?,Own-child,White,Female,-0.144804,-0.217127,-0.841104,United-States,<=50K
