## Machine Learning Pipeline for Income Classification

This script implements a machine learning pipeline to classify income levels based on various features.

The pipeline includes data preprocessing, feature transformation, model training, and evaluation.

In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn after this cell.
sns.set_style('whitegrid')

### Load and explore the dataset

In [None]:
# Create the output folders up front; exist_ok makes re-running this cell harmless.
for output_dir in ("./src/models", "./src/result_notebooks"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load Dataset: read the train and test CSV files into DataFrames.
train_csv = "./src/data/train.csv"
test_csv = "./src/data/test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
# Column dtypes and non-null counts for both splits
for heading, frame in (
    ("Training Data Info:\n", df_train),
    ("\nTest Data Info:\n", df_test),
):
    print(heading)
    frame.info()

# Preview the first rows of the training split
train_preview = df_train.head()
print("\nFirst few rows of training data:\n")
print(train_preview)

# Descriptive statistics of the training split
train_summary = df_train.describe()
print("\nSummary Statistics:\n")
print(train_summary)

In [None]:
# Number of missing entries per training column
missing_per_column = df_train.isnull().sum()
print("\nMissing Values in Training Data:\n")
print(missing_per_column)

# How many rows fall into each INCOME class
income_counts = df_train["INCOME"].value_counts()
print("\nTarget Variable Distribution (INCOME):\n")
print(income_counts)

# Bar chart of the same class counts
plt.figure(figsize=(6, 4))
income_ax = sns.countplot(x="INCOME", data=df_train, palette="viridis")
income_ax.set_title("Distribution of Target Variable (INCOME)")
income_ax.set_xlabel("Income Category")
income_ax.set_ylabel("Count")
plt.show()

### Preprocess the data

In [None]:
# Data Preprocessing
#
# The dataset contains both numerical and categorical features. We need to:
# 1. Handle missing values.
# 2. Encode categorical variables using OneHotEncoder.
# 3. Scale numerical features using StandardScaler.

# Identify categorical and numerical columns on the frame *without* the target.
# Selecting from the full frame and then calling numerical_cols.remove("INCOME")
# raises ValueError when INCOME is stored as text, and would leave the target
# inside categorical_cols; dropping it first is correct for either dtype.
feature_frame = df_train.drop(columns=["INCOME"])
categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Define transformers for preprocessing
# Categorical: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine transformers into a ColumnTransformer that routes each column list
# to its matching transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

### Create and Train the model

In [None]:
# Model Training
"""
### Model Training

We will train three different models:
1. RandomForestClassifier
2. GradientBoostingClassifier
3. LogisticRegression

The best-performing model will be selected based on cross-validation scores.
"""

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Train models and evaluate performance
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    scores = cross_val_score(pipeline, df_train.drop(columns=['INCOME']), df_train['INCOME'], cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f} ± {scores.std():.4f}")

### Cross-validation and hyperparameter tuning

In [None]:
if mean_score > best_score:
        best_score = mean_score
        best_model = (name, model)

print(f"\nBest Model: {best_model[0]} with Accuracy = {best_score:.4f}")

# Hyperparameter tuning for the best model
#
# GridSearchCV is run over the grid that belongs to the selected model; keys
# carry the 'classifier__' prefix because the estimator is the pipeline step
# named 'classifier'.
best_model_name, best_model_instance = best_model

param_grid = {
    "Random Forest": {
        "classifier__n_estimators": [50, 100, 200],
        "classifier__max_depth": [10, 20, None],
        "classifier__min_samples_split": [2, 5, 10],
    },
    "Gradient Boosting": {
        "classifier__n_estimators": [50, 100, 200],
        "classifier__learning_rate": [0.01, 0.1, 0.2],
        "classifier__max_depth": [3, 5, 10],
    },
    "Logistic Regression": {
        "classifier__C": [0.01, 0.1, 1, 10],
        "classifier__solver": ["lbfgs", "liblinear"],
    },
}

# Same preprocessing as before, followed by the selected estimator.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", best_model_instance),
    ]
)
grid_search = GridSearchCV(
    pipeline,
    param_grid[best_model_name],
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(df_train.drop(columns=["INCOME"]), df_train["INCOME"])

# Display best parameters and best score
print(f"Best parameters for {best_model_name}: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

### Evaluate the model

In [None]:
# Model Evaluation on Test Data
#
# The tuned pipeline from the grid search is evaluated on the test dataset:
# accuracy, a per-class classification report, and a confusion-matrix heatmap.

# Make predictions
y_test = df_test["INCOME"]
X_test = df_test.drop(columns=["INCOME"])
y_pred = grid_search.best_estimator_.predict(X_test)

# Print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Generate classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Confusion matrix
# The matrix rows/columns cover every label seen in y_test OR y_pred, so the
# tick labels must come from that same union. Using np.unique(y_test) alone
# mislabels (or mismatches the length of) the axes when the model predicts a
# class that is absent from the test labels. The label set is built once and
# passed to both the matrix and the heatmap so they always agree.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

### Save the trained model

In [None]:
# Save the trained model
#
# The best estimator found by the grid search is written to disk with joblib
# for future use.
model_path = "./src/models/best_model.joblib"
tuned_estimator = grid_search.best_estimator_
joblib.dump(tuned_estimator, model_path)
print(f"Model saved to {model_path}")