In [109]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [110]:
#setting up salary data
sal=pd.concat([pd.read_csv(f"salaries/p_{yr}.csv") for yr in range(2018,2010,-1)])
sal=sal.groupby("name").first()
sal.head()

Unnamed: 0_level_0,title,salary,benefits
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A. Tupling,Professor,180567.92,519.72
A.Russell Tupling,Professor,166933.16,578.96
Abigail Scholer,Associate Professor,132597.73,373.0
Achim Kempf,Professor,187834.44,395.88
Ada Barlatt,Assistant Professor,102159.48,359.08


In [111]:
#Setting up dataframes
eval_data = pd.read_csv("averages.csv")
eval_data[["term"]]=pd.Categorical(eval_data.term)
eval_data[["section"]]=pd.Categorical(eval_data.section)
eval_data["response_rate"] = eval_data.num_responses / eval_data.enrolled
df.replace([np.inf, -np.inf], np.nan)

df=eval_data[eval_data.num_responses>10] # only surveys with >10 responses
df=df.groupby("instructor").filter(lambda x:len(x)>2) # only teachers with >=3 classes
df=pd.merge(df,sal,how='left', left_on=["instructor"],right_on=['name']).drop(['benefits',"section"],axis=1)
df.head()

Unnamed: 0,term,ccode,instructor,organization,expl_lvl,q_treatment,visual,oral,help,interesting,...,printed_notes,textbook,new_material,assign_amount,hours_outside,num_responses,enrolled,response_rate,title,salary
0,1131,['ACTSC 232'],James Adcock,1.360825,2.804124,1.453608,1.56701,1.268041,2.0,1.641304,...,1.516129,1.865169,2.659794,2.927835,2.0,97,143,0.678322,Lecturer,136295.28
1,1131,['ACTSC 371'],Brent Matheson,2.221053,2.821053,2.0,2.452632,2.242105,2.565217,2.070588,...,1.909091,2.271429,2.851064,3.021277,1.903226,95,236,0.402542,Lecturer,128478.6
2,1131,['ACTSC 372'],Peter Wood,1.435644,2.623762,1.4,1.574257,1.386139,1.712121,1.59,...,1.785714,2.103896,2.762376,2.891089,1.93,101,174,0.58046,Lecturer,162561.88
3,1131,"['ACTSC 433', 'ACTSC 833']",Jun Cai,1.723077,2.859375,2.262295,1.923077,2.476923,2.352941,1.833333,...,1.578947,2.166667,2.907692,2.707692,1.96875,65,107,0.607477,Professor,161277.18
4,1131,"['ACTSC 446', 'ACTSC 846']",Ruodo Wang,2.308411,2.557692,1.990566,2.64486,2.679245,2.0,2.03125,...,1.819444,2.370968,2.538462,2.84466,2.038095,107,178,0.601124,Associate Professor,141182.6


In [112]:
df_tenure=df.groupby("instructor").mean()
df_tenure["num_taught"]=df.groupby("instructor").size()
df_tenure["title"]=df.groupby("instructor").first()[["title"]]
has_tenure=df_tenure.title.isin(["Professor","Associate Professor"])

df_tenure.head()

Unnamed: 0_level_0,organization,expl_lvl,q_treatment,visual,oral,help,interesting,overall,attendance,assign_helpful,...,textbook,new_material,assign_amount,hours_outside,num_responses,enrolled,response_rate,salary,num_taught,title
instructor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adam Kolkiewicz,1.691446,2.429451,1.51089,1.9506,1.885041,1.362193,1.635859,1.561237,1.243845,1.47017,...,1.550505,2.583002,2.639731,2.495896,29.0,72.333333,0.446812,175958.2,3,Associate Professor
Adam Roegiest,1.86239,2.794451,1.832141,2.112739,1.795893,2.033704,1.717994,1.85625,1.278992,1.531858,...,2.391384,2.577641,2.122081,3.652393,51.75,90.5,0.583992,,4,
Adriel Dean-Hall,2.314286,3.033968,2.013506,2.449048,2.227879,2.418803,1.849768,2.234791,1.599206,1.598413,...,2.055556,2.777233,2.349206,3.24127,25.0,69.333333,0.359124,,3,
Ahmad Alrefai,3.056345,3.086275,2.738697,3.266667,3.419231,1.827778,1.891059,3.225774,1.857143,1.709017,...,2.236467,2.333333,2.300389,3.327778,32.0,53.0,0.558923,,3,
Ahmed Ayaz Ataullah,2.038713,2.891438,1.781366,2.207169,2.030823,2.06904,2.168041,1.939435,1.687095,1.855265,...,2.356944,2.887103,2.975629,1.891201,51.0,98.166667,0.525865,,6,


In [113]:
#set up of names and classifiers 
names = ["Nearest Neighbors", "SVM tuned",
         "Decision Tree", "Random Forest","AdaBoost",
         "Naive Bayes","LDA","QDA","LogReg"]

In [114]:
#grid search for parameter selection of certain estimators
## SVC grid search
param_grid_SVC = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}
 ]

SVC_clf = RandomizedSearchCV(SVC(), param_grid_SVC, cv=5,iid=False)

#RF
param_grid_RF = [
 {'bootstrap': [True, False],
 'max_depth':[10,20,30,40,50,80,100],
 'max_features':['auto',None],
 'min_samples_leaf': [1,2,3],
 'min_samples_split': [2,5,10],
 'n_estimators': [200,400,800,1000,2000]}
 ]

RF_clf = RandomizedSearchCV(RandomForestClassifier(),param_grid_RF,cv=5,iid=False)

param_grid_DT = [
 {'criterion': ['gini','entropy'],
  'splitter': ['best','random'],
 'max_depth': [10,20,30,40,50,80,100],
 'max_features': ['auto',None],
 'min_samples_leaf': [1,2,3],
 'min_samples_split': [2,5,10],
 'n_estimators': [200,400,800,1000,2000]}
 ]

DT_clf = RandomizedSearchCV(DecisionTreeClassifier(),param_grid_DT,cv=5,iid=False)


In [115]:

classifiers = [
    KNeighborsClassifier(10),
    #SVC_clf,
    #DecisionTreeClassifier(max_depth = 5 ,criterion = 'gini', splitter='best',max_features='auto'),
    #RF_clf,
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(solver='newton-cg')]



In [116]:
feats=list(df_tenure.select_dtypes('number').columns)
feats.remove("salary")

print(feats)

['organization', 'expl_lvl', 'q_treatment', 'visual', 'oral', 'help', 'interesting', 'overall', 'attendance', 'assign_helpful', 'printed_notes', 'textbook', 'new_material', 'assign_amount', 'hours_outside', 'num_responses', 'enrolled', 'response_rate', 'num_taught']


In [117]:
#standardize the data
X= df_tenure
X=StandardScaler().fit_transform(X[feats])

ValueError: Input contains infinity or a value too large for dtype('float64').

In [107]:


X_train, X_test, y_train, y_test = train_test_split(X,has_tenure,test_size=0.3,random_state=42)

output = pd.DataFrame(index = ['train error', 'test_error', 'base_line', 'score']) 

for name,classifier in zip(names,classifiers):
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test,y_test)
    train_err = np.mean(classifier.predict(X_train)!=y_train)
    test_err = np.mean(classifier.predict(X_test)!=y_test)
    baseline = np.mean(has_tenure)
    output_l = [train_err, test_err, baseline, score]
    output[f"{name}"] = output_l
#     print(f"train error: {np.mean(clf.predict(X_train)!=y_train)}")
#     print(f"test error:  {np.mean(clf.predict(X_test)!=y_test)}")
#     print(f"baseline:    {np.mean(has_tenure)}")
#print(output_l)
print(output)






AttributeError: 'list' object has no attribute 'values'

In [16]:
output = pd.DataFrame(index = ['train error', 'test_error', 'base_line', 'score']) 
classifier = LogisticRegression(solver="saga",penalty="elasticnet")
classifier.fit(X_train, y_train)
score = classifier.score(X_test,y_test)
train_err = np.mean(classifier.predict(X_train)!=y_train)
test_err = np.mean(classifier.predict(X_test)!=y_test)
baseline = np.mean(has_tenure)
print(score)
print(train_err)
print(test_err)
print(baseline)



ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

In [99]:
from sklearn.decomposition import PCA
X= df_tenure
X=StandardScaler().fit_transform(X[feats])
X = PCA(.95).fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,has_tenure,test_size=0.3,random_state=42)

output = pd.DataFrame(index = ['train error', 'test_error', 'base_line', 'score']) 

for name,classifier in zip(names,classifiers):
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test,y_test)
    train_err = np.mean(classifier.predict(X_train)!=y_train)
    test_err = np.mean(classifier.predict(X_test)!=y_test)
    baseline = np.mean(has_tenure)
    output_l = [train_err, test_err, baseline, score]
    output[f"{name}"] = output_l
#     print(f"train error: {np.mean(clf.predict(X_train)!=y_train)}")
#     print(f"test error:  {np.mean(clf.predict(X_test)!=y_test)}")
#     print(f"baseline:    {np.mean(has_tenure)}")
#print(output_l)
print(output)

             Nearest Neighbors  SVM tuned  Decision Tree  Random Forest  \
train error           0.268293   0.239837       0.182927       0.020325   
test_error            0.311321   0.283019       0.311321       0.283019   
base_line             0.423295   0.423295       0.423295       0.423295   
score                 0.688679   0.716981       0.688679       0.716981   

             AdaBoost  Naive Bayes       LDA       QDA    LogReg  
train error  0.077236     0.243902  0.239837  0.199187  0.256098  
test_error   0.283019     0.283019  0.273585  0.301887  0.283019  
base_line    0.423295     0.423295  0.423295  0.423295  0.423295  
score        0.716981     0.716981  0.726415  0.698113  0.716981  
