In [None]:
## Step 2 of Project 1 - Grace and Alexander

### Loading Data:

The load_income_data function loads a dataset from a CSV file called "adult.csv" into a Pandas DataFrame.
It checks if the 'income' column exists in the DataFrame. If it does, it encodes this column using LabelEncoder and adds the encoded version as 'income_encoded'.

### Splitting Train and Test Sets:
The split_train_test function performs stratified sampling to split the data into training and test sets.
It also adds a new 'education_cat' column to the data based on the 'education.num' column's values and then stratifies the data based on 'education_cat'.
It returns X_train, X_test, y_train, and y_test, which are the feature and target sets for training and testing.

### Data Preprocessing:
The code defines several data preprocessing functions such as fill_na for imputing missing values, get_outlier_indices for identifying and removing outliers, standard_scaler for scaling numeric features, and one_hot_encoder for encoding categorical features.

### Preparing Data for Training:

The prepare_for_train function applies preprocessing steps to the training and test data. It performs imputation, outlier removal, and feature scaling, and it uses one-hot encoding for categorical features.
It uses a ColumnTransformer to apply different preprocessing steps to numeric and categorical features separately.
The function returns the prepared training and test datasets.
Hyperparameter Tuning:


### Main Function:

The main function is the entry point of the script.
It loads the data, splits it into training and test sets, prepares the data, performs hyperparameter tuning, and evaluates the final model.
The code prints out various statistics, including the root mean squared error (RMSE) and error rate of the final model.

In [3]:
#helping functions
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor

from sklearn.metrics import mean_squared_error
import numpy as np
import sys
import io
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [4]:
#load data
def load_income_data():
    Income = pd.read_csv("adult.csv")

    if 'income' in Income.columns:
        label_encoder = LabelEncoder()
        Income['income_encoded'] = label_encoder.fit_transform(Income['income'])
    else:
        raise KeyError("The 'income' column does not exist in the DataFrame.")

    return Income

def split_train_test(X, y, test_ratio = 0.2):
    #---stratified sampling
    X_columns = X.columns
    y_columns = y.columns
    data = pd.concat([X, y], axis=1)

    print("Shapes of X, y, data", X.shape, y.shape, data.shape)

    df_income = data.copy()
    df_income["education_cat"] = pd.cut(df_income["education.num"],
                                       bins=[0, 3, 6, 9, 12, np.inf],
                                       labels=[1, 2, 3, 4, 5])

    dftrain_strat, dftest_strat = train_test_split(
        df_income, test_size=test_ratio, stratify=df_income["education_cat"], random_state=42)

    # Drop "education_cat" from X_train and X_test
    dftrain_strat = dftrain_strat.drop(['education_cat'], axis=1)
    dftest_strat = dftest_strat.drop(['education_cat'], axis = 1)


    X_train = dftrain_strat[X_columns]
    y_train = dftrain_strat[y_columns]

    X_test = dftest_strat[X_columns]
    y_test = dftest_strat[y_columns]

    return X_train, X_test, y_train, y_test

def fill_na(X, strategy = 'median'):
    imputer = SimpleImputer(strategy = strategy)
    imputer.fit(X)

    return imputer.transform(X)

def get_outlier_indices(X):

    isolation_forest = IsolationForest(random_state = 42)
    outlier_pred = isolation_forest.fit_predict(X)

    return outlier_pred

def standard_scaler(X):
    #scaling all columns in X such that for each column, we have mean = 0, std = 1

    std_scaler = StandardScaler()
    return std_scaler.fit_transform(X)

def one_hot_encoder(df_one_column):
    #df_one_column: a dataframe with one categorical column
    #return the trained model

    cat_encoder = OneHotEncoder(sparse = False)
    return cat_encoder.fit(df_one_column)

def prepare_for_train(Xtrain, Xtest, ytrain, ytest):

    num_pipeline = make_pipeline(SimpleImputer(strategy = 'median'),\
                                 StandardScaler())

    cat_pipeline = make_pipeline(SimpleImputer(strategy = "most_frequent"),\
                                 OneHotEncoder(handle_unknown='ignore'))

    preprocessing = ColumnTransformer([("num", num_pipeline, make_column_selector(dtype_include=np.number)),\
                                       ("cat", cat_pipeline, make_column_selector(dtype_include=object))])

    Xtrain_num = Xtrain.select_dtypes(include=[np.number])
    Xtrain_num = fill_na(Xtrain_num)
    outlier_indices = get_outlier_indices(Xtrain_num)

    Xtrain = Xtrain.iloc[outlier_indices == 1]
    ytrain = ytrain.iloc[outlier_indices == 1]

    Xtrain_prepared = preprocessing.fit_transform(Xtrain, ytrain)

    print("Xtrain_prepared.shape: ", Xtrain_prepared.shape)
    print("preprocessing.get_feature_names_out: ", preprocessing.get_feature_names_out())

    Xtest_prepared = preprocessing.fit_transform(Xtest, ytest)

    print("Xtest_prepared.shape: ", Xtest_prepared.shape)
    print("preprocessing.get_feature_names_out: ", preprocessing.get_feature_names_out())

    ytrain_prepared, ytest_prepared = ytrain, ytest




    return Xtrain_prepared, Xtest_prepared, ytrain_prepared, ytest_prepared


def grid_search_hyperparams(X_train, y_train):

    # defining parameter range
    m = X_train.shape[0]

    param_grid = {'alpha': [0.1/m, 1/m, 10/m, 100/m],
                  'eta0': [0.0005, 0.0004, 0.0003, 0.0002],
                  'penalty':['l2'],
                  'random_state': [42],
                  'max_iter':[4000]}

    print("Training ...")
    grid = GridSearchCV(SGDRegressor(),
                        param_grid,
                        return_train_score = True,
                        refit = True,
                        verbose = 3,
                        n_jobs=1,
                        cv = 3)

    # fitting the model for grid search
    grid.fit(X_train, y_train)

    #print(grid.cv_results_)
    # print best parameters after tuning
    print("Grid searching is done!")
    print("The best score: ", grid.best_score_)
    print("The best hyperparameters:")
    print(grid.best_params_)
    return grid


def main():

    #1 load data

    Income = load_income_data()

    Income_X = Income.drop("income_encoded", axis=1)
    Income_y = Income[["income_encoded"]].copy()

    # 2 split train, test sets
    Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest = split_train_test(Income_X, Income_y, test_ratio=0.35)

    #prepare for training
    Income_Xtrain_prepared, Income_Xtest_prepared, Income_ytrain_prepared, Income_ytest_prepared = prepare_for_train(Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest)

    print ("Adult_Income_Xtrain_prepared.shape: ", Income_Xtrain_prepared.shape)
    print ("Adult_Income_Xtest_prepared.shape: ", Income_Xtest_prepared.shape)

    #4 grid search for the best hyper parameters

    final_model = grid_search_hyperparams(Income_Xtrain_prepared, np.ravel(Income_ytrain_prepared,))

    #5 evaluate the final model

    final_predictions = final_model.predict(Income_Xtest_prepared)
    final_rmse = mean_squared_error(Income_ytest_prepared, final_predictions, squared=False)
    print("RMSE: ", final_rmse) # prints
    print ("Error rate (%): ", 100 * final_rmse / np.mean(Income_ytest_prepared))


In [6]:
main()

Shapes of X, y, data (32561, 15) (32561, 1) (32561, 16)
Xtrain_prepared.shape:  (18800, 109)
preprocessing.get_feature_names_out:  ['num__age' 'num__fnlwgt' 'num__education.num' 'num__capital.gain'
 'num__capital.loss' 'num__hours.per.week' 'cat__workclass_?'
 'cat__workclass_Federal-gov' 'cat__workclass_Local-gov'
 'cat__workclass_Never-worked' 'cat__workclass_Private'
 'cat__workclass_Self-emp-inc' 'cat__workclass_Self-emp-not-inc'
 'cat__workclass_State-gov' 'cat__workclass_Without-pay'
 'cat__education_10th' 'cat__education_11th' 'cat__education_12th'
 'cat__education_1st-4th' 'cat__education_5th-6th'
 'cat__education_7th-8th' 'cat__education_9th' 'cat__education_Assoc-acdm'
 'cat__education_Assoc-voc' 'cat__education_Bachelors'
 'cat__education_Doctorate' 'cat__education_HS-grad'
 'cat__education_Masters' 'cat__education_Preschool'
 'cat__education_Prof-school' 'cat__education_Some-college'
 'cat__marital.status_Divorced' 'cat__marital.status_Married-AF-spouse'
 'cat__marital.stat

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [None]:
Income = load_income_data()

Income_X = Income.drop("income_encoded", axis=1)
Income_y = Income[["income_encoded"]].copy()

    # 2 split train, test sets
Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest = split_train_test(Income_X, Income_y, test_ratio=0.35)

    #prepare for training
Income_Xtrain_prepared, Income_Xtest_prepared, Income_ytrain_prepared, Income_ytest_prepared = prepare_for_train(Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest)

#print(Income_Xtrain_prepared)
#print(Income_Xtest_prepared)
print(Income_ytrain_prepared)
print(Income_ytest_prepared)

Shapes of X, y, data (32561, 15) (32561, 1) (32561, 16)
Xtrain_prepared.shape:  (18800, 109)
preprocessing.get_feature_names_out:  ['num__age' 'num__fnlwgt' 'num__education.num' 'num__capital.gain'
 'num__capital.loss' 'num__hours.per.week' 'cat__workclass_?'
 'cat__workclass_Federal-gov' 'cat__workclass_Local-gov'
 'cat__workclass_Never-worked' 'cat__workclass_Private'
 'cat__workclass_Self-emp-inc' 'cat__workclass_Self-emp-not-inc'
 'cat__workclass_State-gov' 'cat__workclass_Without-pay'
 'cat__education_10th' 'cat__education_11th' 'cat__education_12th'
 'cat__education_1st-4th' 'cat__education_5th-6th'
 'cat__education_7th-8th' 'cat__education_9th' 'cat__education_Assoc-acdm'
 'cat__education_Assoc-voc' 'cat__education_Bachelors'
 'cat__education_Doctorate' 'cat__education_HS-grad'
 'cat__education_Masters' 'cat__education_Preschool'
 'cat__education_Prof-school' 'cat__education_Some-college'
 'cat__marital.status_Divorced' 'cat__marital.status_Married-AF-spouse'
 'cat__marital.stat