This notebook compares logistic regression coefficients under L1 (Lasso), L2 (Ridge), and ElasticNet regularization.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import warnings 
# warnings.filterwarnings("ignore")

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

**Loading Data**

Cleaned and integrated from [Data_Integration.ipynb](Data_Integration.ipynb)

In [3]:
# Read the cleaned CSV (comma-separated, column names taken from the first row)
# and report its (rows, columns) shape as a quick sanity check.
data_path = 'DATA_Texas_District_v2.csv'
df = pd.read_csv(data_path, header=0, sep=',')
print(df.shape)

(955, 90)


**Get dummies for categorical feature `Locale`**

In [4]:
# One-hot encode the categorical `Locale` column into `Locale_<value>` indicator columns.
# dtype=float is pinned explicitly: pandas >= 2.0 emits bool indicator columns by default,
# and mixing bool with numeric columns makes the later `.values` call fall back to an
# object array. Float indicators keep the feature matrix purely numeric on every version.
df = pd.get_dummies(df, columns=['Locale'], prefix='Locale', dtype=float)
print(df.shape)

(955, 101)


**Split data**

In [5]:
# Target columns present in the frame; only `Label_Math` is modelled here, and all
# three are removed from the feature matrix.
labels = ['Label_Math', 'Label_Reading', 'Label_All']

# Columns excluded from the features in addition to the label columns.
cols_drop = [
    '% Tested Reading G3 Diff',
    '% Tested Reading G4 Diff',
    '% Tested Reading G5 Diff',
    '% Tested Reading G6 Diff',
    '% Tested Reading G7 Diff',
    '% Tested Reading G8 Diff',
    'District #',
    'County #',
]

# Build the feature frame once and derive both the matrix and its column names from it,
# so the two always stay aligned.
features = df.drop(columns=labels + cols_drop)
columns = features.columns
X = features.values
y = df['Label_Math'].values

# Stratified, shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=123,
)
print(X.shape)

(955, 90)


**Standardization** (`StandardScaler`, fitted on the training split only)

In [6]:
# Learn the scaling statistics from the training split only, then apply that same
# fitted transform to both splits so no test-set information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
# Accumulator for the per-model coefficient tables produced by the cells below,
# and the seed passed to every estimator.
results = pd.DataFrame()
random_state = 123

### Logistic Regression - Lasso ###

In [8]:
from sklearn.linear_model import LogisticRegression

# L1-penalised (Lasso) one-vs-rest logistic regression on the scaled features.
model_name = 'Logistic Regression - L1'
estimator = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, multi_class='ovr', random_state=random_state)
estimator.fit(X_train, y_train)

print('Train Accuracy: {:0.2f}'.format(estimator.score(X_train, y_train)))
print('Test Accuracy: {:0.2f}'.format(estimator.score(X_test, y_test)))

# Rows of `coef_` follow `estimator.classes_` (the sorted label values).
# NOTE(review): assumes the sorted labels correspond to Loss < Expected < Gain - confirm
# against the label encoding in the data.
classes = ['Loss', 'Expected', 'Gain']
if len(classes) != estimator.coef_.shape[0]:
    # zip() would silently drop rows or names on a mismatch and mislabel the output.
    raise ValueError('Expected {} coefficient rows (one per class), got {}'.format(
        len(classes), estimator.coef_.shape[0]))

# One long-format table per class: feature name, model label, coefficient, class name.
coef_frames = [
    pd.DataFrame({'feature': columns,
                  'model': model_name,
                  'coef': coef,
                  'class': cl})
    for coef, cl in zip(estimator.coef_, classes)
]

# Drop rows left behind by an earlier run of this cell so re-running it does not
# duplicate this model's coefficients in `results`.
if 'model' in results.columns:
    results = results[results['model'] != model_name]

# Concatenate once rather than growing the frame inside a loop, and leave an empty
# accumulator out of the concat altogether.
pieces = coef_frames if results.empty else [results, *coef_frames]
results = pd.concat(pieces, ignore_index=True)

Train Accuracy: 0.65
Test Accuracy: 0.63


### Logistic Regression - Ridge ###

In [9]:
from sklearn.linear_model import LogisticRegression

# L2-penalised (Ridge) one-vs-rest logistic regression on the scaled features.
model_name = 'Logistic Regression - L2'
estimator = LogisticRegression(penalty='l2', solver='liblinear', multi_class='ovr', C=0.1, random_state=random_state)
estimator.fit(X_train, y_train)

print('Train Accuracy: {:0.2f}'.format(estimator.score(X_train, y_train)))
print('Test Accuracy: {:0.2f}'.format(estimator.score(X_test, y_test)))

# Rows of `coef_` follow `estimator.classes_` (the sorted label values).
# NOTE(review): assumes the sorted labels correspond to Loss < Expected < Gain - confirm
# against the label encoding in the data.
classes = ['Loss', 'Expected', 'Gain']
if len(classes) != estimator.coef_.shape[0]:
    # zip() would silently drop rows or names on a mismatch and mislabel the output.
    raise ValueError('Expected {} coefficient rows (one per class), got {}'.format(
        len(classes), estimator.coef_.shape[0]))

# One long-format table per class: feature name, model label, coefficient, class name.
coef_frames = [
    pd.DataFrame({'feature': columns,
                  'model': model_name,
                  'coef': coef,
                  'class': cl})
    for coef, cl in zip(estimator.coef_, classes)
]

# Drop rows left behind by an earlier run of this cell so re-running it does not
# duplicate this model's coefficients in `results`.
if 'model' in results.columns:
    results = results[results['model'] != model_name]

# Concatenate once rather than growing the frame inside a loop, and leave an empty
# accumulator out of the concat altogether.
pieces = coef_frames if results.empty else [results, *coef_frames]
results = pd.concat(pieces, ignore_index=True)

Train Accuracy: 0.68
Test Accuracy: 0.61


### Logistic Regression - ElasticNet ###

In [10]:
from sklearn.linear_model import LogisticRegression

# ElasticNet-penalised one-vs-rest logistic regression on the scaled features.
# `saga` is used as the solver here; l1_ratio=.5 weights the L1 and L2 terms equally,
# and max_iter is raised to 10000.
model_name = 'Logistic Regression - elasticnet'
estimator = LogisticRegression(penalty='elasticnet', solver='saga', multi_class='ovr', C=0.1, random_state=random_state,
                               l1_ratio=.5, max_iter=10000)
estimator.fit(X_train, y_train)

print('Train Accuracy: {:0.2f}'.format(estimator.score(X_train, y_train)))
print('Test Accuracy: {:0.2f}'.format(estimator.score(X_test, y_test)))

# Rows of `coef_` follow `estimator.classes_` (the sorted label values).
# NOTE(review): assumes the sorted labels correspond to Loss < Expected < Gain - confirm
# against the label encoding in the data.
classes = ['Loss', 'Expected', 'Gain']
if len(classes) != estimator.coef_.shape[0]:
    # zip() would silently drop rows or names on a mismatch and mislabel the output.
    raise ValueError('Expected {} coefficient rows (one per class), got {}'.format(
        len(classes), estimator.coef_.shape[0]))

# One long-format table per class: feature name, model label, coefficient, class name.
coef_frames = [
    pd.DataFrame({'feature': columns,
                  'model': model_name,
                  'coef': coef,
                  'class': cl})
    for coef, cl in zip(estimator.coef_, classes)
]

# Drop rows left behind by an earlier run of this cell so re-running it does not
# duplicate this model's coefficients in `results`.
if 'model' in results.columns:
    results = results[results['model'] != model_name]

# Concatenate once rather than growing the frame inside a loop, and leave an empty
# accumulator out of the concat altogether.
pieces = coef_frames if results.empty else [results, *coef_frames]
results = pd.concat(pieces, ignore_index=True)

Train Accuracy: 0.67
Test Accuracy: 0.64


In [11]:
# Write the stacked coefficient table for all fitted models to CSV, omitting the row index.
output_path = 'Feature_Selection_Regression_Math.csv'
results.to_csv(output_path, index=False)