In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import stats
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder
from sklearn.preprocessing import StandardScaler , RobustScaler , MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("adult_with_headers.csv")
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## 1. Data Exploration and Preprocessing

In [3]:
df.shape

(32561, 15)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [6]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

### there is no null values

In [7]:
cat_col=[i for i in df.columns if df[i].dtype=='object']
cat_col

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

In [8]:
num_col=[i for i in df.columns if df[i].dtype!='object']
num_col

['age',
 'fnlwgt',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_per_week']

### apply standard scaler , minmax scaler

In [9]:
std_sca = StandardScaler()
min_max_sca = MinMaxScaler()

In [10]:
df[num_col]=std_sca.fit_transform(df[num_col])
df[num_col]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [11]:
df[num_col]=min_max_sca.fit_transform(df[num_col])
df[num_col]

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.301370,0.044302,0.800000,0.021740,0.0,0.397959
1,0.452055,0.048238,0.800000,0.000000,0.0,0.122449
2,0.287671,0.138113,0.533333,0.000000,0.0,0.397959
3,0.493151,0.151068,0.400000,0.000000,0.0,0.397959
4,0.150685,0.221488,0.800000,0.000000,0.0,0.397959
...,...,...,...,...,...,...
32556,0.136986,0.166404,0.733333,0.000000,0.0,0.377551
32557,0.315068,0.096500,0.533333,0.000000,0.0,0.397959
32558,0.561644,0.094827,0.533333,0.000000,0.0,0.397959
32559,0.068493,0.128499,0.533333,0.000000,0.0,0.193878


## 2. Encoding Techniques

In [12]:
cat_col

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country',
 'income']

In [13]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.301370,State-gov,0.044302,Bachelors,0.800000,Never-married,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,0.048238,Bachelors,0.800000,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.000000,0.0,0.122449,United-States,<=50K
2,0.287671,Private,0.138113,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,0.397959,United-States,<=50K
3,0.493151,Private,0.151068,11th,0.400000,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,0.397959,United-States,<=50K
4,0.150685,Private,0.221488,Bachelors,0.800000,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.000000,0.0,0.397959,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,0.166404,Assoc-acdm,0.733333,Married-civ-spouse,Tech-support,Wife,White,Female,0.000000,0.0,0.377551,United-States,<=50K
32557,0.315068,Private,0.096500,HS-grad,0.533333,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.000000,0.0,0.397959,United-States,>50K
32558,0.561644,Private,0.094827,HS-grad,0.533333,Widowed,Adm-clerical,Unmarried,White,Female,0.000000,0.0,0.397959,United-States,<=50K
32559,0.068493,Private,0.128499,HS-grad,0.533333,Never-married,Adm-clerical,Own-child,White,Male,0.000000,0.0,0.193878,United-States,<=50K


In [14]:
unique=df[cat_col].nunique()
unique

workclass          9
education         16
marital_status     7
occupation        15
relationship       6
race               5
sex                2
native_country    42
income             2
dtype: int64

In [15]:
for columns in cat_col:
    unique=df[columns].nunique()
    if unique<5:
        print(columns)

sex
income



### let's perform one hot encoding

In [16]:
cat_col_less = [i for i in df.columns if df[i].dtype=='object'and df[i].nunique() <5 ]
cat_col_less

['sex', 'income']

In [17]:
one_hot=make_pipeline(OneHotEncoder())

In [18]:
transformers = ColumnTransformer(transformers=[ ('one_hot', OneHotEncoder(), cat_col_less)],remainder='drop')
transformers

In [19]:
transformers.get_feature_names_out

<bound method ColumnTransformer.get_feature_names_out of ColumnTransformer(transformers=[('one_hot', OneHotEncoder(),
                                 ['sex', 'income'])])>

In [20]:
df_one_hot = pd.DataFrame(transformers.fit_transform(df[cat_col_less]),columns=transformers.get_feature_names_out())

In [21]:
df_one_hot

Unnamed: 0,one_hot__sex_ Female,one_hot__sex_ Male,one_hot__income_ <=50K,one_hot__income_ >50K
0,0.0,1.0,1.0,0.0
1,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0
3,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,0.0
...,...,...,...,...
32556,1.0,0.0,1.0,0.0
32557,0.0,1.0,0.0,1.0
32558,1.0,0.0,1.0,0.0
32559,0.0,1.0,1.0,0.0


### lets perform label encoding

In [22]:
cat_col_more = [ i for i in df.columns if df[i].dtype=='object' and df[i].nunique() > 5 ]
cat_col_more

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'native_country']

In [23]:
ord_enc = make_pipeline(OrdinalEncoder())

In [24]:
transformers = ColumnTransformer(transformers=[('ord_enc', OrdinalEncoder() , cat_col_more)], remainder='drop')
transformers

In [25]:
transformers.get_feature_names_out

<bound method ColumnTransformer.get_feature_names_out of ColumnTransformer(transformers=[('ord_enc', OrdinalEncoder(),
                                 ['workclass', 'education', 'marital_status',
                                  'occupation', 'relationship',
                                  'native_country'])])>

In [26]:
df_ord_enc = pd.DataFrame(ord_enc.fit_transform(df[cat_col_more]), columns=cat_col_more)
df_ord_enc

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country
0,7.0,9.0,4.0,1.0,1.0,39.0
1,6.0,9.0,2.0,4.0,0.0,39.0
2,4.0,11.0,0.0,6.0,1.0,39.0
3,4.0,1.0,2.0,6.0,0.0,39.0
4,4.0,9.0,2.0,10.0,5.0,5.0
...,...,...,...,...,...,...
32556,4.0,7.0,2.0,13.0,5.0,39.0
32557,4.0,11.0,2.0,7.0,0.0,39.0
32558,4.0,11.0,6.0,1.0,4.0,39.0
32559,4.0,11.0,4.0,1.0,3.0,39.0


### concat

In [27]:
df_new=pd.concat([df_one_hot,df_ord_enc],axis=1)
df_new

Unnamed: 0,one_hot__sex_ Female,one_hot__sex_ Male,one_hot__income_ <=50K,one_hot__income_ >50K,workclass,education,marital_status,occupation,relationship,native_country
0,0.0,1.0,1.0,0.0,7.0,9.0,4.0,1.0,1.0,39.0
1,0.0,1.0,1.0,0.0,6.0,9.0,2.0,4.0,0.0,39.0
2,0.0,1.0,1.0,0.0,4.0,11.0,0.0,6.0,1.0,39.0
3,0.0,1.0,1.0,0.0,4.0,1.0,2.0,6.0,0.0,39.0
4,1.0,0.0,1.0,0.0,4.0,9.0,2.0,10.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...
32556,1.0,0.0,1.0,0.0,4.0,7.0,2.0,13.0,5.0,39.0
32557,0.0,1.0,0.0,1.0,4.0,11.0,2.0,7.0,0.0,39.0
32558,1.0,0.0,1.0,0.0,4.0,11.0,6.0,1.0,4.0,39.0
32559,0.0,1.0,1.0,0.0,4.0,11.0,4.0,1.0,3.0,39.0


## 3. Feature Engineering

In [28]:
# Create new features
df['capital-gain-minus-loss'] = df['capital_gain'] - df['capital_loss']
df['age_squared'] = df['age'] ** 2

In [29]:
# Apply log transformation to skewed numerical feature (assuming 'capital-gain' is skewed)
import numpy as np
df['capital-gain_log'] = np.log1p(df['capital_gain'])

## 4. Feature Selection 

In [30]:
from sklearn.ensemble import IsolationForest

In [31]:
model=IsolationForest(contamination=0.02)     # 5 point oul=tlier chahiye so 

In [32]:
model.fit(df_new)

In [33]:
model.predict(df_new)

array([1, 1, 1, ..., 1, 1, 1])

In [34]:
df_new['Anomaly']=model.predict(df_new)     # anamoly name define kra h

In [35]:
df_new.head()

Unnamed: 0,one_hot__sex_ Female,one_hot__sex_ Male,one_hot__income_ <=50K,one_hot__income_ >50K,workclass,education,marital_status,occupation,relationship,native_country,Anomaly
0,0.0,1.0,1.0,0.0,7.0,9.0,4.0,1.0,1.0,39.0,1
1,0.0,1.0,1.0,0.0,6.0,9.0,2.0,4.0,0.0,39.0,1
2,0.0,1.0,1.0,0.0,4.0,11.0,0.0,6.0,1.0,39.0,1
3,0.0,1.0,1.0,0.0,4.0,1.0,2.0,6.0,0.0,39.0,1
4,1.0,0.0,1.0,0.0,4.0,9.0,2.0,10.0,5.0,5.0,1


### PPS 

In [37]:
import ppscore as pps

In [38]:
df

In [39]:
pps.matrix(df) 

In [40]:
## end