In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler ,LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE

In [44]:
data = pd.read_csv("adult.data")
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain','capital-loss','hours-per-week','native-country','income']
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [45]:
# Check for missing values
print(data.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [46]:
# Convert the target variable to binary
data['income'] = data['income'].apply(lambda x: x.strip())
data['income'] = data['income'].replace({'>50K': 1, '<=50K': 0})

In [47]:
# Convert categorical variables to numerical
categorical_vars = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for var in categorical_vars:
    le = LabelEncoder()
    data[var] = le.fit_transform(data[var])
    
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
3,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
4,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32560 non-null  int64
 1   workclass       32560 non-null  int32
 2   fnlwgt          32560 non-null  int64
 3   education       32560 non-null  int32
 4   education-num   32560 non-null  int64
 5   marital-status  32560 non-null  int32
 6   occupation      32560 non-null  int32
 7   relationship    32560 non-null  int32
 8   race            32560 non-null  int32
 9   sex             32560 non-null  int32
 10  capital-gain    32560 non-null  int64
 11  capital-loss    32560 non-null  int64
 12  hours-per-week  32560 non-null  int64
 13  native-country  32560 non-null  int32
 14  income          32560 non-null  int64
dtypes: int32(8), int64(7)
memory usage: 2.7 MB


In [49]:
# Split the data into training and testing sets
X = data.drop('income', axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [50]:
# Scale the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [51]:
#TASK 1

# Define the models
models = [
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('SVM', SVC()),
    ('Naive Bayes', GaussianNB()),
    ('K-Neighbors Classifier', KNeighborsClassifier())
]


In [52]:
# Train and evaluate the models
for name, model in models:
    print('Model:', name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    print('-'*50)

Model: Random Forest
Accuracy: 0.8571867321867321
Confusion Matrix: [[4569  343]
 [ 587 1013]]
Classification Report:               precision    recall  f1-score   support

           0       0.89      0.93      0.91      4912
           1       0.75      0.63      0.69      1600

    accuracy                           0.86      6512
   macro avg       0.82      0.78      0.80      6512
weighted avg       0.85      0.86      0.85      6512

--------------------------------------------------
Model: Gradient Boosting
Accuracy: 0.8657862407862408
Confusion Matrix: [[4650  262]
 [ 612  988]]
Classification Report:               precision    recall  f1-score   support

           0       0.88      0.95      0.91      4912
           1       0.79      0.62      0.69      1600

    accuracy                           0.87      6512
   macro avg       0.84      0.78      0.80      6512
weighted avg       0.86      0.87      0.86      6512

--------------------------------------------------
Mode

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.8194103194103194
Confusion Matrix: [[4441  471]
 [ 705  895]]
Classification Report:               precision    recall  f1-score   support

           0       0.86      0.90      0.88      4912
           1       0.66      0.56      0.60      1600

    accuracy                           0.82      6512
   macro avg       0.76      0.73      0.74      6512
weighted avg       0.81      0.82      0.81      6512

--------------------------------------------------


In [53]:
#TASK 2

# Attribute selection algorithms
attribute_selectors = [
    ('SelectKBest-f_classif', SelectKBest(f_classif, k=5)),
    ('SelectKBest-mutual_info_classif', SelectKBest(mutual_info_classif, k=5)),
    ('Recursive Feature Elimination', RFE(estimator=LogisticRegression(), n_features_to_select=5))
]

In [54]:
# Save the column names of the original X dataframe
all_columns = X.columns

# Train the attribute selectors on the training data
for name, selector in attribute_selectors:
    print('Attribute Selector:', name)
    selector.fit(X_train, y_train)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    selected_features = all_columns[selector.get_support()]
    print('Selected Features:', selected_features)
    print('-'*50)

Attribute Selector: SelectKBest-f_classif
Selected Features: Index(['age', 'education-num', 'relationship', 'capital-gain',
       'hours-per-week'],
      dtype='object')
--------------------------------------------------
Attribute Selector: SelectKBest-mutual_info_classif
Selected Features: Index(['age', 'education', 'marital-status', 'relationship', 'capital-gain'], dtype='object')
--------------------------------------------------
Attribute Selector: Recursive Feature Elimination
Selected Features: Index(['age', 'education-num', 'sex', 'capital-gain', 'hours-per-week'], dtype='object')
--------------------------------------------------


In [55]:
#TASK 3

# Train and evaluate the models with all attributes
print('Results with all attributes:')
for name, model in models:
    print('Model:', name)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, y_pred))

# Train and evaluate the models with selected attributes
print('\n\nResults with selected attributes:')
for name, selector in attribute_selectors:
    print('Attribute Selector:', name)
    selector.fit(X_train, y_train)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    for model_name, model in models:
        print('Model:', model_name)
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)
        print('Accuracy:', accuracy_score(y_test, y_pred))
    print('-'*50)


Results with all attributes:
Model: Random Forest
Accuracy: 0.8573402948402948
Model: Gradient Boosting
Accuracy: 0.8657862407862408
Model: SVM
Accuracy: 0.8433660933660934
Model: Naive Bayes
Accuracy: 0.8066646191646192
Model: K-Neighbors Classifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.8194103194103194


Results with selected attributes:
Attribute Selector: SelectKBest-f_classif
Model: Random Forest
Accuracy: 0.8347665847665847
Model: Gradient Boosting
Accuracy: 0.8538083538083538
Model: SVM
Accuracy: 0.8450552825552825
Model: Naive Bayes
Accuracy: 0.7922297297297297
Model: K-Neighbors Classifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.8286240786240786
--------------------------------------------------
Attribute Selector: SelectKBest-mutual_info_classif
Model: Random Forest
Accuracy: 0.8378378378378378
Model: Gradient Boosting
Accuracy: 0.8524262899262899
Model: SVM
Accuracy: 0.8106572481572482
Model: Naive Bayes
Accuracy: 0.7911547911547911
Model: K-Neighbors Classifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.8180282555282555
--------------------------------------------------
Attribute Selector: Recursive Feature Elimination
Model: Random Forest
Accuracy: 0.812960687960688
Model: Gradient Boosting
Accuracy: 0.8353808353808354
Model: SVM
Accuracy: 0.8267813267813268
Model: Naive Bayes
Accuracy: 0.792997542997543
Model: K-Neighbors Classifier


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Accuracy: 0.8014434889434889
--------------------------------------------------
