In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the Adult Income CSV.
# The default is the original author's local path, so existing runs behave
# exactly as before. Set the ADULT_INCOME_CSV environment variable to point at
# your own copy of the file to run this notebook on another machine without
# editing the cell.
DATA_PATH = os.environ.get(
    'ADULT_INCOME_CSV',
    '/Users/abhay/Desktop/Probation/Probation-Project-25/Abhay_Pratap_Singh_Task_8/AdultIncome - Sheet1.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Categorical predictors: these are the columns that get one-hot encoded.
cat_value = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Numeric predictors.
num_value = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]

In [4]:
# Split the frame into the label column (y) and every other column (X).
target = 'income'
y = df[target]
X = df.drop(columns=[target])

In [5]:
# One-hot encode the categorical predictors and turn the income label into 0/1.
#
# Both X and y are rebuilt from `df` instead of from their current values so
# that this cell can be re-run safely: encoding the already-encoded X would
# fail because the categorical columns are gone after the first run, and
# mapping the already-encoded y would turn every label into NaN.
X = pd.get_dummies(df.drop(columns=[target]), columns=cat_value, drop_first=True)

# Label -> class code. Surrounding whitespace is stripped before the lookup so
# that values such as ' >50K' still match.
income_codes = {'<=50K': 0, '>50K': 1}
income_labels = df[target].astype(str).str.strip()

# Series.map silently yields NaN for labels missing from the dict; fail loudly
# here instead of letting NaN targets surface later during model fitting.
unmapped = sorted(set(income_labels) - set(income_codes))
if unmapped:
    raise ValueError(f"Unexpected values in '{target}' column: {unmapped}")

y = income_labels.map(income_codes)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [8]:
# Baseline logistic regression; max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Bare expression so the notebook still renders the fitted estimator.
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [9]:
# k-nearest-neighbours baseline with k=5, fitted on the training split.
# KNeighborsClassifier is imported in the notebook's top import cell.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [10]:
# Decision-tree baseline with default hyper-parameters; random_state is fixed
# so the fitted tree is reproducible.
# DecisionTreeClassifier is imported in the notebook's top import cell.
dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
# Random-forest baseline with default hyper-parameters; random_state is fixed
# so the fitted forest is reproducible.
# RandomForestClassifier is imported in the notebook's top import cell.
rf_model = RandomForestClassifier(random_state=0)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# Predict the held-out rows with each baseline model, in the order
# logistic regression, KNN, decision tree, random forest.
y_pred_logreg, y_pred_knn, y_pred_dt, y_pred_rf = (
    clf.predict(X_test)
    for clf in (model, knn_model, dt_model, rf_model)
)

In [13]:
# Test-set accuracy of each baseline, in the same order as the predictions.
acc_logreg, acc_knn, acc_dt, acc_rf = (
    accuracy_score(y_test, preds)
    for preds in (y_pred_logreg, y_pred_knn, y_pred_dt, y_pred_rf)
)

# One line per model, each formatted to four decimal places.
print(
    f"Logistic Regression Accuracy: {acc_logreg:.4f}",
    f"KNN Accuracy: {acc_knn:.4f}",
    f"Decision Tree Accuracy: {acc_dt:.4f}",
    f"Random Forest Accuracy: {acc_rf:.4f}",
    sep="\n",
)

Logistic Regression Accuracy: 0.8497
KNN Accuracy: 0.8251
Decision Tree Accuracy: 0.8159
Random Forest Accuracy: 0.8589


In [None]:
def _fit_grid_search(estimator, param_grid, X, y):
    """Exhaustively search `param_grid` for `estimator` and return the fitted search.

    Every search in this cell shares the same settings: 3-fold
    cross-validation, accuracy as the selection metric, and all available
    cores (n_jobs=-1).

    Parameters
    ----------
    estimator : unfitted scikit-learn estimator to tune.
    param_grid : dict mapping parameter names to the candidate values to try.
    X, y : training features and labels the search is fitted on.

    Returns
    -------
    The fitted GridSearchCV object (exposes `best_estimator_`, `best_params_`, ...).
    """
    search = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    search.fit(X, y)
    return search


# NOTE(review): X_train was standardised on the whole training split before
# these searches, so every cross-validation fold has already contributed to the
# scaling statistics. Wrapping the scaler and estimator in a Pipeline would
# remove that small leak; this is left as-is to keep the results comparable.

# --- Logistic regression -----------------------------------------------------
logreg_params = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}
logreg_grid = _fit_grid_search(LogisticRegression(max_iter=1000), logreg_params, X_train, y_train)
best_logreg = logreg_grid.best_estimator_

# --- K-nearest neighbours ----------------------------------------------------
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
knn_grid = _fit_grid_search(KNeighborsClassifier(), knn_params, X_train, y_train)
best_knn = knn_grid.best_estimator_

# --- Decision tree -----------------------------------------------------------
dt_params = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
dt_grid = _fit_grid_search(DecisionTreeClassifier(random_state=0), dt_params, X_train, y_train)
best_dt = dt_grid.best_estimator_

# --- Random forest -----------------------------------------------------------
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid = _fit_grid_search(RandomForestClassifier(random_state=0), rf_params, X_train, y_train)
best_rf = rf_grid.best_estimator_

# Score each tuned model on the held-out test split.
acc_logreg_best = accuracy_score(y_test, best_logreg.predict(X_test))
acc_knn_best = accuracy_score(y_test, best_knn.predict(X_test))
acc_dt_best = accuracy_score(y_test, best_dt.predict(X_test))
acc_rf_best = accuracy_score(y_test, best_rf.predict(X_test))

print(f"Best Logistic Regression Accuracy: {acc_logreg_best:.4f}")
print(f"Best KNN Accuracy: {acc_knn_best:.4f}")
print(f"Best Decision Tree Accuracy: {acc_dt_best:.4f}")
print(f"Best Random Forest Accuracy: {acc_rf_best:.4f}")

Best Logistic Regression Accuracy: 0.8497
Best KNN Accuracy: 0.8297
Best Decision Tree Accuracy: 0.8501
Best Random Forest Accuracy: 0.8647
