In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
sns.color_palette("muted")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings('ignore')
import pickle as pkl
#%matplotlib inline

In [4]:
df = pd.read_csv('../data/fitting_data.csv')
df.drop(['Unnamed: 0'],axis=1,inplace=True)
df = df.fillna(0)
y = df.pop('music').values
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44, stratify=X[:,2])

# Pipeline dictionary
pipelines = {
    'l1' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l1', random_state=123)),
    'l2' : make_pipeline(StandardScaler(), LogisticRegression( penalty = 'l2', random_state=123))
#     'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
#     'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123)),
#     'linsvc' : make_pipeline(StandardScaler(), SVC(random_state=123,probability=True)),
#     'rbfsvc' : make_pipeline(StandardScaler(), SVC(random_state=123,probability=True))
}

# Logistic Regression hyperparameters
l1_hyperparameters = {
    'logisticregression__C' : np.linspace(1e-3, 1e3, 10),
}

l2_hyperparameters = {
    'logisticregression__C' : np.linspace(1-3, 1e3, 10),
}

# Random Forest hyperparameters
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33]
}

# Boosted Tree hyperparameters
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200],
    'gradientboostingclassifier__learning_rate': [0.01, 0.1],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}

linsvc_hyperparameters = {
    'svc__C' : [1e-5, 1e-1, 1e3],
    'svc__kernel' : ['linear']
}

rbfsvc_hyperparameters = {
    'svc__C': [1e-7, 1e-3, 1e1],
    'svc__gamma' : [1e-7, 1e-3, 1e1],
    'svc__kernel' : ['rbf']
}
# Create hyperparameters dictionary
hyperparameters = {
    'l1' : l1_hyperparameters, 
    'l2' : l2_hyperparameters 
#     'rf' : rf_hyperparameters,
#     'gb' : gb_hyperparameters,
#     'linsvc' : linsvc_hyperparameters,
#     'rbfsvc' : rbfsvc_hyperparameters
    
}

# Create empty dictionary called fitted_models
fitted_models = {}

# Loop through model pipelines, tuning each one and saving it to fitted_models
for name, pipeline in pipelines.items():
    # Create cross-validation object from pipeline and hyperparameters
    model = GridSearchCV(pipeline, hyperparameters[name], scoring = 'neg_log_loss', cv=10, refit=True)

    # Fit model on X_train, y_train
    model.fit(X_train, y_train)    
    
    # Store model in fitted_models[name] 
    fitted_models[name] = model    
    
    # Print '{name} has been fitted'
    print(name, 'has been fitted.')
    
for name, model in fitted_models.items():
    pred = fitted_models[name].predict_proba(X_test)
    # Get just the prediction for the positive class (1)
    pred = [p[1] for p in pred]
    # Calculate ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    # Calculate AUROC
    print(name, auc(fpr, tpr))

l1 has been fitted.


ValueError: Penalty term must be positive; got (C=-2.0)

In [30]:
df.columns

Index(['financial_aid', 'caucasian', 'african_american', 'latino', 'asian',
       'other_race', 'multiracial', 'male', 'median_income'],
      dtype='object')

In [None]:
pred = fitted_models['l2'].predict_proba(X_test)
pred = [p[1] for p in pred]
fpr, tpr, thresholds = roc_curve(y_test, pred)

# Initialize figure
fig = plt.figure(figsize=(8,8))
plt.title('Receiver Operating Characteristic',fontsize=20)

# Plot ROC curve
plt.plot(fpr, tpr, label='L2 norm LR',linewidth=3)
plt.legend(loc='lower right',fontsize=16)

# Diagonal 45 degree line
plt.plot([0,1],[0,1], 'k--',linewidth=3)

# Axes limits and labels
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.ylabel('True Positive Rate',fontsize=18)
plt.xlabel('False Positive Rate',fontsize=18)
plt.tick_params(labelsize=16)
plt.grid(True)
plt.show()
fig.savefig('../plots/ROC.pdf',bbox_inches='tight')

In [None]:
coefficients = fitted_models['l2'].best_estimator_.named_steps['logisticregression'].coef_.reshape(-1,1)[:,0]
features = np.array(df.columns)
coefficients_inds = coefficients.argsort()
sorted_coefficients = coefficients[coefficients_inds[::-1]]
sorted_features = features[coefficients_inds[::-1]]
plot_colors = ['royalblue','lightgrey','lightgrey','lightgrey','lightgrey','lightgrey','lightgrey','lightgrey','lightgrey','lightgrey','darksalmon']
ax = sns.barplot(x=sorted_coefficients,y=sorted_features,color='grey')#,palette=plot_colors)
ax.set_xlabel(r'$\leftarrow$' + 'non-music major                                   music major'+r'$\rightarrow$',fontsize=13)
ax.set_ylabel('Student Characteristic',fontsize=13)
plt.title('Which graduates will continue in music?',fontsize=13)
plt.tick_params(labelsize=11)
ax.get_xaxis().set_ticks([])
fig = ax.get_figure()
fig.savefig('../plots/coefficients.pdf',bbox_inches='tight')





In [2]:
fitted_models['l2'].best_estimator_.named_steps['logisticregression']

NameError: name 'fitted_models' is not defined

In [34]:
pipelines['l2'].get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])>