In [18]:
# Logistic Regression Project 1 Grace and Alexander 

### Logistic Regression 
In our algorithm exploration the second best alogorithm to train our data was logistic regression

### Main
The code itself follows the same principles as the decision tree code. We fit the data using Logistic Regression and get our mean squared error (MSE) and our error rate. 

In [7]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn import set_config

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression




from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from sklearn.metrics import mean_squared_error
import numpy as np
import sys
import io
import matplotlib.pyplot as plt

In [16]:
def load_income_data(): 
    Income = pd.read_csv("adult.csv")
    
    # Check if the 'income' column exists in the DataFrame
    if 'income' in Income.columns:
        # Encode the 'income' column and add it as 'income_encoded'
        label_encoder = LabelEncoder()
        Income['income_encoded'] = label_encoder.fit_transform(Income['income'])
    else:
        raise KeyError("The 'income' column does not exist in the DataFrame.")
    
    return Income

def split_train_test(X, y, test_ratio = 0.2):
    #---stratified sampling
    X_columns = X.columns
    y_columns = y.columns
    data = pd.concat([X, y], axis=1)

    print("Shapes of X, y, data", X.shape, y.shape, data.shape)

    df_income = data.copy()
    df_income["education_cat"] = pd.cut(df_income["education.num"],
                                       bins=[0, 3, 6, 9, 12, np.inf],
                                       labels=[1, 2, 3, 4, 5])
    
    dftrain_strat, dftest_strat = train_test_split(
        df_income, test_size=test_ratio, stratify=df_income["education_cat"], random_state=42)
    
    # Drop "income_encoded" from X_train and X_test
    dftrain_strat = dftrain_strat.drop(['education_cat'], axis=1)
    dftest_strat = dftest_strat.drop(['education_cat'], axis = 1)

    
    X_train = dftrain_strat[X_columns]
    y_train = dftrain_strat[y_columns]

    X_test = dftest_strat[X_columns]
    y_test = dftest_strat[y_columns]

    return X_train, X_test, y_train, y_test

def fill_na(X, strategy = 'median'):
    imputer = SimpleImputer(strategy = strategy)
    imputer.fit(X)

    return imputer.transform(X)

def get_outlier_indices(X):
    
    isolation_forest = IsolationForest(random_state = 42)
    outlier_pred = isolation_forest.fit_predict(X)

    return outlier_pred

def prepare_for_train(Xtrain, Xtest, ytrain, ytest):
    
    num_pipeline = make_pipeline(SimpleImputer(strategy = 'median'),\
                                 StandardScaler())
    
    cat_pipeline = make_pipeline(SimpleImputer(strategy = "most_frequent"),\
                                 OneHotEncoder(handle_unknown='ignore'))

    preprocessing = ColumnTransformer([("num", num_pipeline, make_column_selector(dtype_include=np.number)),\
                                       ("cat", cat_pipeline, make_column_selector(dtype_include=object))])

    Xtrain_num = Xtrain.select_dtypes(include=[np.number])
    Xtrain_num = fill_na(Xtrain_num)
    outlier_indices = get_outlier_indices(Xtrain_num)

    Xtrain = Xtrain.iloc[outlier_indices == 1]
    ytrain = ytrain.iloc[outlier_indices == 1]
    
    Xtrain_prepared = preprocessing.fit_transform(Xtrain, ytrain)
    Xtest_prepared = preprocessing.fit_transform(Xtest, ytest)
    ytrain_prepared, ytest_prepared = ytrain.values, ytest.values
    
    return Xtrain_prepared, Xtest_prepared, ytrain_prepared, ytest_prepared

def main():

    #1 load data
    
    Income = load_income_data()

    Income_X = Income.drop("income_encoded", axis=1)
    Income_y = Income[["income_encoded"]].copy()

    # 2 split train, test sets
    Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest = split_train_test(Income_X, Income_y, test_ratio=0.35)
    
    #prepare for training 
    Income_Xtrain_prepared, Income_Xtest_prepared, Income_ytrain_prepared, Income_ytest_prepared = prepare_for_train(Income_Xtrain, Income_Xtest, Income_ytrain, Income_ytest)

    print("Training...") 
    print("Income_Xtrain_prepared.shape: ", Income_Xtrain_prepared.shape) 
    print("mean_Income_ytrain_prepared: ", np.mean(Income_ytrain_prepared)) 
    
    logistic_regressor = LogisticRegression(random_state=42)  
    
    logistic_regressor.fit(Income_Xtrain_prepared, np.ravel(Income_ytrain_prepared))

    test_pred = logistic_regressor.predict(Income_Xtest_prepared)

    # For regression tasks, you can calculate Mean Squared Error (MSE)
    mse = mean_squared_error(Income_ytest_prepared, test_pred, squared=True)
    print("MSE: ", mse)
    print("Error rate (%): ", 100 * math.sqrt(mse) / np.mean(Income_ytest_prepared))
        

In [17]:
main()

Shapes of X, y, data (32561, 15) (32561, 1) (32561, 16)
Training...
Income_Xtrain_prepared.shape:  (18800, 109)
mean_Income_ytrain_prepared:  0.2078191489361702
MSE:  0.0
Error rate (%):  0.0
