In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df=pd.read_csv(r"D:\AdultCensusIncomePrediction\notebooks\data\adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
df["country"].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [3]:
df["education"].value_counts()

education
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: count, dtype: int64

In [4]:
X=df.drop(['salary',"education-num"],axis=1)
y=df["salary"]

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

# Segregating numerical and categorical variables

In [None]:

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [None]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'country'],
      dtype='object')

In [None]:
categorical_cols

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'country'],
      dtype='object')

In [None]:
numerical_cols

Index(['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [None]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder()),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [None]:
preprocessor

In [None]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [None]:
X_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__fnlwgt,num_pipeline__capital-gain,num_pipeline__capital-loss,num_pipeline__hours-per-week,cat_pipeline__workclass,cat_pipeline__education,cat_pipeline__marital-status,cat_pipeline__occupation,cat_pipeline__relationship,cat_pipeline__race,cat_pipeline__sex,cat_pipeline__country
0,-0.18447,-0.073413,-0.146309,-0.217065,-0.036989,0.091007,0.176899,0.923611,-0.13381,0.966709,0.393296,0.704642,0.29077
1,-1.510861,-0.456592,0.149568,-0.217065,-1.658547,0.091007,1.213637,0.923611,1.52227,0.966709,0.393296,0.704642,0.29077
2,-0.552912,0.231261,-0.146309,-0.217065,1.989959,0.091007,0.176899,-0.40974,0.339356,-0.898933,0.393296,0.704642,0.29077
3,0.773479,0.82997,-0.146309,-0.217065,0.77379,0.091007,0.176899,-0.40974,1.758853,-0.898933,0.393296,0.704642,0.29077
4,1.65774,-1.141907,-0.146309,-0.217065,-0.280223,-2.654493,0.176899,-0.40974,-1.553306,-0.898933,0.393296,0.704642,0.29077


In [None]:
y_train.value_counts()

salary
 <=50K    17256
 >50K      5536
Name: count, dtype: int64

## Imbalenced Datasets:-

In [None]:
# pip install imblearn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
oversample=SMOTE()
X_train,y_train=oversample.fit_resample(X_train,y_train)

In [None]:
X_train.shape

(34512, 13)

In [None]:
y_train.value_counts()

salary
 <=50K    17256
 >50K     17256
Name: count, dtype: int64

In [None]:
models={
    'LogisticRegression':LogisticRegression(),
    'SVC':SVC(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier(),
    'GradientBoostingClassifier':GradientBoostingClassifier(),
    'AdaBoostClassifier':AdaBoostClassifier(),
    'KNeighborsClassifier':KNeighborsClassifier(algorithm='auto',p=2)
}

In [None]:
params={
    "LogisticRegression": {
        # 'penalty':['l1', 'l2', 'elasticnet', None],
        # 'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    },
    
    
    "SVC":{
        # 'kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],      
    },
    
    
    'DecisionTreeClassifier':{
        'criterion':['gini', 'entropy', 'log_loss'],
        'max_depth':[5,10,15,20,25,30,35,40],
        'splitter':['best','random']
        },
    
    
    
    'RandomForestClassifier':{
        'n_estimators': [10,20,30,40,50,60,70,80],
        'criterion':['gini', 'entropy', 'log_loss'],
        'max_depth':[5,10,15,20,25]
        # ,'learning_rate':[.1,.01,.05,.001]
    },
    
    'GradientBoostingClassifier':{
        'loss':['log_loss','exponential'],
        'n_estimators':[50,100,150],
        'criterion':['friedman_mse','squared_error'],
        'learning_rate':[.1,.01,.05,.001]
    },
    
    
    
    'AdaBoostClassifier':{
        'n_estimators':[50,100,150,200],
        'algorithm':['SAMME', 'SAMME.R'],
        'learning_rate':[.1,.01,.05,.001]  
    },
    
    
    
    
    'KNeighborsClassifier':{
        'n_neighbors':[3,4,5,6,7,8]
        
        
    }
    
    
    
}

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import GridSearchCV
def get_best_model(X_train,y_train,X_test,y_test,model_dict,params_,base_accuracy=0.7):
    models = model_dict.keys()
    for model in models:
        print(f'Initial base accuracy is {base_accuracy}')
        print(20*'*',"Estimator: ",model,20*'*')
        gs = GridSearchCV(estimator=model_dict[model],param_grid=params_[model],cv=3)
        gs.fit(X_train,y_train)
        print(f'Best parameters are :{gs.best_params_} With accuracy score:{gs.best_score_}')
        
        regressor = model_dict[model].set_params(**gs.best_params_)
        regressor.fit(X_train,y_train)

        y_train_pred = regressor.predict(X_train)
        y_test_pred = regressor.predict(X_test)
        
        train_model_score = accuracy_score(y_train, y_train_pred)

        test_model_score = accuracy_score(y_test, y_test_pred)
        
        if test_model_score >= base_accuracy:
            print('Acceptabel model found!')
            base_accuracy = test_model_score
            best_model = {model:regressor}
        else:
            print('Model rejected\n',30*'__')
            
    
    print(f"Best model is:{list(best_model.keys())} with accuracy score: {round(test_model_score,2)}")        
    return best_model

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
get_best_model(X_train,y_train,X_test,y_test,model_dict=models,params_=params)

Initial base accuracy is 0.7
******************** Estimator:  LogisticRegression ********************
Best parameters are :{} With accuracy score:0.7362656467315717
Acceptabel model found!
Initial base accuracy is 0.7256628109325417
******************** Estimator:  SVC ********************
Best parameters are :{} With accuracy score:0.8132243857209086
Acceptabel model found!
Initial base accuracy is 0.7823728119561879
******************** Estimator:  DecisionTreeClassifier ********************
Best parameters are :{'criterion': 'entropy', 'max_depth': 40, 'splitter': 'random'} With accuracy score:0.8533263792304128
Acceptabel model found!
Initial base accuracy is 0.8090899785034292
******************** Estimator:  RandomForestClassifier ********************
Best parameters are :{'criterion': 'log_loss', 'max_depth': 25, 'n_estimators': 70} With accuracy score:0.8905887807139546
Acceptabel model found!
Initial base accuracy is 0.8523902139420616
******************** Estimator:  Gradient

In [None]:
model=RandomForestClassifier(max_depth=25,n_estimators=60,criterion="gini")

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X_test)

In [None]:
y_pred

array([' <=50K', ' >50K', ' <=50K', ..., ' <=50K', ' >50K', ' <=50K'],
      dtype=object)

In [None]:
print(accuracy_score(y_test,y_pred))

0.8487050875217524
