<a href="https://colab.research.google.com/github/DammuNikhitha/AI-ML-Internship-Task-4/blob/main/01_task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 4: Feature Encoding & Scaling.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
df=pd.read_csv("adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


# Step 1 : Identify Categorical & Numerical Features

Identifying Feature Types

In [5]:
categorical_features = df.select_dtypes(include='object').columns
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

print("Categorical Features:", categorical_features)
print("Numerical Features:", numerical_features)

Categorical Features: Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country', 'income'],
      dtype='object')
Numerical Features: Index(['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week'],
      dtype='object')


# Step 2 : Label Encoding (Ordered Categorical Data)

Label Encoding is applied where an order exists in the data.

In [6]:
le= LabelEncoder()
df['income']=le.fit_transform(df['income'])
df['income'].value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,24720
1,7841


# Step 3 : One-Hot Encoding (Unordered Categorical Data)

One-Hot Encoding is used where no order exists among categories.

In [7]:
df_encoded=pd.get_dummies(df,columns=categorical_features,drop_first=True)
df_encoded.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_1
0,90,77053,9,0,4356,40,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,82,132870,9,0,4356,18,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
2,66,186061,10,0,4356,40,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,54,140359,4,0,3900,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,41,264663,10,0,3900,40,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False


# Step 4 : Feature Scaling Using StandardScaler

Numerical features are scaled using StandardScaler.

In [8]:
scaler=StandardScaler()
df_encoded[numerical_features]=scaler.fit_transform(df_encoded[numerical_features])
df_encoded.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_1
0,3.769612,-1.067997,-0.42006,-0.14592,10.593507,-0.035429,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,3.183112,-0.539169,-0.42006,-0.14592,10.593507,-1.817204,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
2,2.01011,-0.03522,-0.03136,-0.14592,10.593507,-0.035429,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,1.130359,-0.468215,-2.363558,-0.14592,9.461864,-0.035429,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False
4,0.177296,0.709482,-0.03136,-0.14592,9.461864,-0.035429,False,False,False,True,...,False,False,False,False,False,False,True,False,False,False


# Step 5 : Compare Before & After Scaling

Comparison Before and After Scaling

In [9]:
df[numerical_features].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [10]:
df_encoded[numerical_features].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,-3.666078e-17,-1.008172e-16,1.466431e-16,4.189804e-17,-3.4915030000000003e-17,-2.793203e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.582206,-1.681631,-3.529656,-0.1459205,-0.2166595,-3.19403
25%,-0.7757679,-0.681691,-0.4200596,-0.1459205,-0.2166595,-0.03542945
50%,-0.1159546,-0.1082193,-0.03136003,-0.1459205,-0.2166595,-0.03542945
75%,0.6904838,0.4478765,0.7460392,-0.1459205,-0.2166595,0.3695194
max,3.769612,12.26856,2.300838,13.39458,10.59351,4.742967


# Step 6 : Impact of Scaling on ML Algorithms


*   Scaling ensures all features contribute equally.
*   Distance-based algorithms perform better after scaling.
*   Faster convergence in gradient-based algorithms.






# Step 7 : Save Preprocessed Dataset

Saving Processed Dataset

In [11]:
df_encoded.to_csv("adult_income_preprocessed.csv",index=False)

## Final Outcome
- Identified categorical and numerical features
- Applied Label Encoding and One-Hot Encoding
- Scaled numerical features using StandardScaler
- Dataset is now ready for machine learning models