# **Homework 1 - Natali Francesco 1945581**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
data = pd.read_csv("/content/data1 (2).csv", sep=';')

# Data preprocessing
# Drop education and native-country as instructed
df = data.drop(['education', 'native-country'], axis=1)
print(df)

   age          workclass  fnlwgt  education-num       marital-status  \
0   39          State-gov   77516             13        Never-married   
1   50   Self-emp-not-inc   83311             13   Married-civ-spouse   
2   38            Private  215646              9             Divorced   
3   53            Private  234721              7   Married-civ-spouse   
4   28            Private  338409             13   Married-civ-spouse   

           occupation    relationship    race      sex  capital-gain  \
0        Adm-clerical   Not-in-family   White     Male          2174   
1     Exec-managerial         Husband   White     Male             0   
2   Handlers-cleaners   Not-in-family   White     Male             0   
3   Handlers-cleaners         Husband   Black     Male             0   
4      Prof-specialty            Wife   Black   Female             0   

   capital-loss  hours-per-week  target  
0             0              40   <=50K  
1             0              13   <=50K  
2 

In [None]:
# Combine occupation into 5 categories
# Define the mapping
occupation_map = {
    'Adm-clerical': 'Office & Professional',
    'Exec-managerial': 'Office & Professional',
    'Prof-specialty': 'Office & Professional',
    'Tech-support': 'Office & Professional',
    'Sales': 'Office & Professional',

    'Craft-repair': 'Skilled Manual Labor',
    'Machine-op-inspct': 'Skilled Manual Labor',
    'Handlers-cleaners': 'Skilled Manual Labor',
    'Transport-moving': 'Skilled Manual Labor',

    'Other-service': 'Personal & Protective Services',
    'Priv-house-serv': 'Personal & Protective Services',
    'Protective-serv': 'Personal & Protective Services',

    'Farming-fishing': 'Agriculture & Military',
    'Armed-forces': 'Agriculture & Military',

    '?': 'Not Specified'
}

df['occupation'] = df['occupation'].str.strip()
# Apply the mapping
df['occupation_grouped'] = df['occupation'].map(occupation_map)
print(df.head())

   age          workclass  fnlwgt  education-num       marital-status  \
0   39          State-gov   77516             13        Never-married   
1   50   Self-emp-not-inc   83311             13   Married-civ-spouse   
2   38            Private  215646              9             Divorced   
3   53            Private  234721              7   Married-civ-spouse   
4   28            Private  338409             13   Married-civ-spouse   

          occupation    relationship    race      sex  capital-gain  \
0       Adm-clerical   Not-in-family   White     Male          2174   
1    Exec-managerial         Husband   White     Male             0   
2  Handlers-cleaners   Not-in-family   White     Male             0   
3  Handlers-cleaners         Husband   Black     Male             0   
4     Prof-specialty            Wife   Black   Female             0   

   capital-loss  hours-per-week  target     occupation_grouped  
0             0              40   <=50K  Office & Professional  
1   

In [None]:
# Define target and features
X = df.drop(['target'], axis=1)
y = df['target'].str.strip()  # Rimuove spazi extra da target

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove 'education' and 'native-country' if still in the list
categorical_cols = [col for col in categorical_cols if col not in ['education', 'native-country']]

# Column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

In [None]:
# Models to evaluate
models = {
    'Random Forest': RandomForestClassifier(random_state=123),
    'Gradient Boosting': GradientBoostingClassifier(random_state=123),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=123)
}

# Loop over models
for name, model in models.items():
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print(classification_report(y_val, y_pred))
    print(confusion_matrix(y_val, y_pred))
    print("\n")

--- Random Forest ---
Accuracy: 0.8584297266864571
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      7408
        >50K       0.75      0.63      0.68      2361

    accuracy                           0.86      9769
   macro avg       0.82      0.78      0.80      9769
weighted avg       0.85      0.86      0.85      9769

[[6905  503]
 [ 880 1481]]


--- Gradient Boosting ---
Accuracy: 0.8671307196232982
              precision    recall  f1-score   support

       <=50K       0.89      0.95      0.92      7408
        >50K       0.79      0.61      0.69      2361

    accuracy                           0.87      9769
   macro avg       0.84      0.78      0.80      9769
weighted avg       0.86      0.87      0.86      9769

[[7022  386]
 [ 912 1449]]


--- Logistic Regression ---
Accuracy: 0.8547446002661481
              precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91      7408
        >50K    

In [None]:
# Now search optimal hyperparameters for each model

In [None]:
# Random Forest
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=123))
])

grid_rf = GridSearchCV(rf_pipeline, param_grid_rf, cv=3, n_jobs=-1, scoring='accuracy')
grid_rf.fit(X_train, y_train)

print("Best RF parameters:", grid_rf.best_params_)
print("Best RF score:", grid_rf.best_score_)

Best RF parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best RF score: 0.8631097701859726


In [None]:
# Gradient Boosting
param_grid_gb = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.1, 0.05],
    'classifier__max_depth': [3, 5]
}

gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=123))
])

grid_gb = GridSearchCV(gb_pipeline, param_grid_gb, cv=3, n_jobs=-1, scoring='accuracy')
grid_gb.fit(X_train, y_train)

print("Best GB parameters:", grid_gb.best_params_)
print("Best GB score:", grid_gb.best_score_)

Best GB parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 200}
Best GB score: 0.8728061818687776


In [None]:
# Logistic Regression
param_grid_lr = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l2'],
    'classifier__solver': ['lbfgs']
}

lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=123))
])

grid_lr = GridSearchCV(lr_pipeline, param_grid_lr, cv=3, n_jobs=-1, scoring='accuracy')
grid_lr.fit(X_train, y_train)

print("Best LR parameters:", grid_lr.best_params_)
print("Best LR score:", grid_lr.best_score_)

Best LR parameters: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best LR score: 0.851263502519299


In [None]:
# Best model result is obtained with GB