In [35]:
# add the project root (two levels above the working directory) to sys.path
import sys
from pathlib import Path
# Resolve the directory two levels above the current working directory and
# expose it on sys.path so that the project-local `common` package imports.
project_root = Path.cwd().parent.parent.resolve()
project_path = str(project_root)
# Membership guard: re-running this cell must not append a duplicate entry.
if project_path not in sys.path:
    sys.path.append(project_path)

# imports
from common.utils import get_data, get_preprocessor

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
# Values passed as `C` to the regularised LogisticRegression sweeps below.
C_VALUES = [0.01, 0.1, 1, 10, 100, 1000]

# get_data() is a project helper; presumably it returns the full dataset as a
# DataFrame that includes the "Target" label column -- TODO confirm in common.utils.
data = get_data()

# Split into label vector and feature matrix (everything except "Target").
y = data["Target"]
X = data.drop(columns=["Target"])

# Column-name lists by dtype; they are handed to get_preprocessor() later on.
numerical_column_names = list(X.select_dtypes(include=["number"]).columns)
categorical_column_names = list(X.select_dtypes(include=["object"]).columns)

data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,single,2nd phase - general contingent,6th choice,Animation and Multimedia Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 3rd Cycle,Other - 11th Year,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,single,International student (bachelor),2nd choice,Tourism,Daytime,Secondary education,160.0,Portuguese,Secondary Education - 12th Year or Eq.,Higher Ed - Degree,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,single,1st phase - general contingent,6th choice,Communication Design,Daytime,Secondary education,122.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 1st Cycle (4th/5th),...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,single,2nd phase - general contingent,3rd choice,Journalism and Communication,Daytime,Secondary education,122.0,Portuguese,Basic Ed 2nd Cycle (6th–8th),Basic Ed 1st Cycle (4th/5th),...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,married,Over 23 years old,2nd choice,Social Service (evening attendance),Evening,Secondary education,100.0,Portuguese,Basic Ed 1st Cycle (4th/5th),Basic Ed 2nd Cycle (6th–8th),...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [37]:
# 5-fold shuffled split with a fixed seed so the folds are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=6)

# One (X_train, X_test, y_train, y_test) tuple per fold, materialised up front
# so that every model below is evaluated on exactly the same splits.
datasets = [
    (X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx])
    for train_idx, test_idx in kfold.split(X, y)
]

In [38]:
def train_and_predict(model, X_train, X_test, y_train, y_test):
    """Fit a preprocessing + classifier pipeline and score it on both splits.

    The pipeline consists of the project preprocessor (built from the
    module-level ``numerical_column_names`` / ``categorical_column_names``)
    followed by ``model`` as the final ``"classifier"`` step. It is fitted on
    the training split only and then used to predict both splits.

    Args:
        model: Estimator used as the last pipeline step. It is passed to the
            pipeline as-is; this function does not copy it.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        A ``(accuracy_train, accuracy_test)`` tuple of accuracy scores.
    """
    preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)
    pipeline = Pipeline(steps=[
        ("preprocessing", preprocessor),
        ("classifier", model),
    ])

    # Fit on the training split only; the test split is used purely for scoring.
    pipeline.fit(X_train, y_train)

    return (
        accuracy_score(y_train, pipeline.predict(X_train)),
        accuracy_score(y_test, pipeline.predict(X_test)),
    )

In [39]:
# Baseline: LogisticRegression with library-default regularisation settings and
# a generous iteration budget.
model_base = LogisticRegression(max_iter=8000)

# One (train_accuracy, test_accuracy) pair per fold, in fold order.
fold_scores = [train_and_predict(model_base, *fold) for fold in datasets]

results = {
    "Train": [train_acc for train_acc, _ in fold_scores],
    "Test": [test_acc for _, test_acc in fold_scores],
}

# Rows = split (Train/Test), columns = fold index.
pd.DataFrame(results).T

Unnamed: 0,0,1,2,3,4
Train,0.809268,0.812659,0.809833,0.811529,0.803955
Test,0.777401,0.768362,0.777401,0.768362,0.795249


In [40]:
# max_iter budget per regularisation strength: ITERS[i] is paired with C_VALUES[i].
ITERS = [50, 200, 200, 300, 500, 600]
results_train = {}
results_test = {}

# L2-penalised logistic regression, evaluated on every fold for each C.
for c, iters in zip(C_VALUES, ITERS):
    train_scores = []
    test_scores = []

    for fold in datasets:
        model_ridge = LogisticRegression(penalty="l2", C=c, max_iter=iters)
        fold_train_acc, fold_test_acc = train_and_predict(model_ridge, *fold)
        train_scores.append(fold_train_acc)
        test_scores.append(fold_test_acc)

    results_train[c] = train_scores
    results_test[c] = test_scores

# Rows = C value, columns = fold index.
print("Train")
display(pd.DataFrame(results_train).T)
print("Test")
display(pd.DataFrame(results_test).T)

Train


Unnamed: 0,0,1,2,3,4
0.01,0.775925,0.779034,0.776208,0.779599,0.771751
0.1,0.798248,0.799378,0.798248,0.802769,0.790395
1.0,0.809268,0.812659,0.809833,0.811529,0.803955
10.0,0.815202,0.817463,0.814637,0.816615,0.807627
100.0,0.815202,0.81718,0.814919,0.816615,0.808192
1000.0,0.815767,0.81718,0.815485,0.81605,0.807627


Test


Unnamed: 0,0,1,2,3,4
0.01,0.770621,0.751412,0.758192,0.750282,0.794118
0.1,0.784181,0.771751,0.771751,0.764972,0.799774
1.0,0.777401,0.768362,0.777401,0.768362,0.795249
10.0,0.769492,0.766102,0.772881,0.767232,0.791855
100.0,0.766102,0.764972,0.767232,0.762712,0.789593
1000.0,0.767232,0.762712,0.766102,0.760452,0.789593


In [41]:
# max_iter budget per regularisation strength: ITERS[i] is paired with C_VALUES[i].
ITERS = [50, 200, 200, 300, 500, 600]
results_train = {}
results_test = {}

# L1-penalised (lasso) logistic regression, evaluated on every fold for each C.
for c, iters in zip(C_VALUES, ITERS):
    results_train[c] = []
    results_test[c] = []

    for dataset in datasets:
        # The estimator is named for what it is: an L1 model (this cell was
        # copied from the L2 sweep and previously kept the `model_ridge` name).
        # liblinear is selected because it supports the L1 penalty. It uses a
        # random number generator internally, so random_state is pinned (same
        # seed as the KFold split) to keep this table reproducible across runs.
        model_lasso = LogisticRegression(
            penalty="l1",
            solver="liblinear",
            C=c,
            max_iter=iters,
            random_state=6,
        )

        accuracy_train, accuracy_test = train_and_predict(model_lasso, dataset[0], dataset[1], dataset[2], dataset[3])

        results_train[c].append(accuracy_train)
        results_test[c].append(accuracy_test)


# Rows = C value, columns = fold index.
print("Train")
display(pd.DataFrame(results_train).T)
print("Test")
display(pd.DataFrame(results_test).T)

Train


Unnamed: 0,0,1,2,3,4
0.01,0.73015,0.728454,0.724781,0.73241,0.724011
0.1,0.771687,0.780164,0.773947,0.778186,0.771186
1.0,0.798531,0.799943,0.7974,0.798813,0.790678
10.0,0.807855,0.809833,0.807855,0.808986,0.803672
100.0,0.808703,0.809268,0.809268,0.810116,0.80452
1000.0,0.808703,0.809833,0.808986,0.810398,0.804802


Test


Unnamed: 0,0,1,2,3,4
0.01,0.723164,0.724294,0.740113,0.709605,0.738688
0.1,0.770621,0.749153,0.763842,0.755932,0.785068
1.0,0.783051,0.760452,0.770621,0.763842,0.799774
10.0,0.772881,0.758192,0.766102,0.763842,0.794118
100.0,0.770621,0.760452,0.761582,0.758192,0.783937
1000.0,0.766102,0.758192,0.760452,0.754802,0.778281
