In [38]:
import pandas as pd      
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import pickle
import warnings
warnings.filterwarnings("ignore")

In [39]:
data = pd.read_excel(r"C:\Users\aswin\Downloads\ICT\Exam\dataset2.xlsx")     # read the database

In [40]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [41]:
data.shape

(32561, 15)

In [42]:
data.isna().sum()     # checking for null values

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [43]:
# 1. Preprocessing
# Encode categorical features and scale numerical features
X = data.drop('salary', axis=1)
y = data['salary']

In [44]:
# Define categorical and numerical columns
cat_features = X.select_dtypes(include=['object']).columns
num_features = X.select_dtypes(exclude=['object']).columns

In [45]:
# Preprocessing pipeline for numerical features (scaling)
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [46]:
# Preprocessing pipeline for categorical features (encoding)
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [47]:
# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# 2. Model selection
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}


In [49]:
# Creating a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):


    # Pipeline with preprocessing and model
    clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Fit the model
    clf.fit(X_train, y_train)

    # Predictions
    y_pred = clf.predict(X_test)

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='>50K')
    recall = recall_score(y_test, y_pred, pos_label='>50K')
    f1 = f1_score(y_test, y_pred, pos_label='>50K')
    roc_auc = roc_auc_score(pd.get_dummies(y_test, drop_first=True), clf.predict_proba(X_test)[:, 1])

    # Print evaluation results
    print(f"Model: {model}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print(f"ROC-AUC: {roc_auc}")
    print(classification_report(y_test, y_pred))

    return clf


In [50]:
# 3. Evaluating each model
best_model = None
best_score = 0
for name, model in models.items():
    print(f"\n{name} Results:")
    clf = evaluate_model(model, X_train, X_test, y_train, y_test)
    
    # Save best model based on F1-Score
    if f1_score(y_test, clf.predict(X_test), pos_label='>50K') > best_score:
        best_score = f1_score(y_test, clf.predict(X_test), pos_label='>50K')
        best_model = clf


Logistic Regression Results:
Model: LogisticRegression()
Accuracy: 0.8581298940580377
Precision: 0.7517509727626459
Recall: 0.6148949713558243
F1-Score: 0.6764705882352942
ROC-AUC: 0.9085107166749831
              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      4942
        >50K       0.75      0.61      0.68      1571

    accuracy                           0.86      6513
   macro avg       0.82      0.78      0.79      6513
weighted avg       0.85      0.86      0.85      6513


Decision Tree Results:
Model: DecisionTreeClassifier()
Accuracy: 0.8209734377399048
Precision: 0.6246153846153846
Recall: 0.6460852959898155
F1-Score: 0.6351689612015019
ROC-AUC: 0.7613267435027994
              precision    recall  f1-score   support

       <=50K       0.89      0.88      0.88      4942
        >50K       0.62      0.65      0.64      1571

    accuracy                           0.82      6513
   macro avg       0.76      0.76      0.76      6513
w

In [51]:
# 5. Hyperparameter tuning for the best model
param_grid = {
    'model__n_estimators': [10, 20, 50],
    'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 8]
}

def evaluate_model(model, X_train, X_test, y_train, y_test, preprocessor=None):

    if not isinstance(X_train, pd.DataFrame): 
        X_train = X_train.toarray()  
    if not isinstance(X_test, pd.DataFrame): 
        X_test = X_test.toarray()


# Performing GridSearchCV
grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the best model after tuning
final_model = grid_search.best_estimator_
evaluate_model(final_model, X_train, X_test, y_train, y_test)

# 4. Feature Importance for Random Forest
importances = final_model.named_steps['model'].feature_importances_
feature_names = final_model.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out(cat_features)
importances_df = pd.DataFrame(importances, index=list(num_features) + list(feature_names), columns=['Importance'])
importances_df.sort_values(by='Importance', ascending=False).head(10)

Best Hyperparameters: {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 10}


Unnamed: 0,Importance
fnlwgt,0.159739
age,0.148434
capital-gain,0.08487
hours-per-week,0.081402
relationship_Husband,0.056494
education-num,0.051684
marital-status_Never-married,0.031317
capital-loss,0.030032
marital-status_Married-civ-spouse,0.025484
occupation_Exec-managerial,0.020067


In [52]:
with open('final_model.pkl', 'wb') as file:
    pickle.dump(model, file)
