In [69]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [70]:
#Setting up dataframes

eval_data = pd.read_csv("averages.csv")
sal=pd.concat([pd.read_csv(f"salaries/p_{yr}.csv") for yr in range(2018,2010,-1)])
sal=sal.groupby("name").first()


df=eval_data[eval_data.num_responses>10] # only surveys with >10 responses
df=df.groupby("instructor").filter(lambda x:len(x)>2) # only teachers with >=3 classes
df=pd.merge(df,sal,how='left', left_on=["instructor"],right_on=['name']).drop(['benefits',"section"],axis=1)


#df_tenure= df[~df.title.isna()]
df_tenure=df.groupby("instructor").mean()
df_tenure["num_taught"]=df.groupby("instructor").size()
df_tenure["title"]=df.groupby("instructor").first()[["title"]]
# dfp=dfp.reset_index()
has_tenure=df_tenure.title.isin(["Professor","Associate Professor"])

df_tenure.head()

Unnamed: 0_level_0,term,organization,expl_lvl,q_treatment,visual,oral,help,interesting,overall,attendance,...,printed_notes,textbook,new_material,assign_amount,hours_outside,num_responses,enrolled,salary,num_taught,title
instructor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adam Kolkiewicz,1173.0,1.691446,2.429451,1.51089,1.9506,1.885041,1.362193,1.635859,1.561237,1.243845,...,1.471251,1.550505,2.583002,2.639731,2.495896,29.0,72.333333,175958.2,3,Associate Professor
Adam Roegiest,1154.0,1.86239,2.794451,1.832141,2.112739,1.795893,2.033704,1.717994,1.85625,1.278992,...,2.032509,2.391384,2.577641,2.122081,3.652393,51.75,90.5,,4,
Adriel Dean-Hall,1139.666667,2.314286,3.033968,2.013506,2.449048,2.227879,2.418803,1.849768,2.234791,1.599206,...,1.792929,2.055556,2.777233,2.349206,3.24127,25.0,69.333333,,3,
Ahmad Alrefai,1176.333333,3.056345,3.086275,2.738697,3.266667,3.419231,1.827778,1.891059,3.225774,1.857143,...,2.039683,2.236467,2.333333,2.300389,3.327778,32.0,53.0,,3,
Ahmed Ayaz Ataullah,1148.0,2.038713,2.891438,1.781366,2.207169,2.030823,2.06904,2.168041,1.939435,1.687095,...,1.856527,2.356944,2.887103,2.975629,1.891201,51.0,98.166667,,6,


In [83]:
#set up of names and classifiers 
names = ["Nearest Neighbors", "SVM tuned",
         "Decision Tree", "Random Forest","AdaBoost",
         "Naive Bayes","LDA","QDA","LogReg"]

#grid search for parameter selection of certain estimators
## SVC grid search
param_grid_SVC = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]

SVC_clf = GridSearchCV(SVC(), param_grid_SVC, cv=5,iid=False)

classifiers = [
    KNeighborsClassifier(10),
    SVC_clf,
    DecisionTreeClassifier(max_depth = 5 ,criterion = 'gini', splitter='best',max_features='auto'),
    RandomForestClassifier(n_estimators=10),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(solver='newton-cg')]



In [86]:
feats=list(df_tenure.select_dtypes('number').columns)
feats.remove("salary")

X= df_tenure
X=StandardScaler().fit_transform(X[feats])

X_train, X_test, y_train, y_test = train_test_split(X,has_tenure,test_size=0.3,random_state=42)

output = pd.DataFrame(index = ['train error', 'test_error', 'base_line', 'score']) 

for name,classifier in zip(names,classifiers):
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test,y_test)
    train_err = np.mean(classifier.predict(X_train)!=y_train)
    test_err = np.mean(classifier.predict(X_test)!=y_test)
    baseline = np.mean(has_tenure)
    output_l = [train_err, test_err, baseline, score]
    output[f"{name}"] = output_l
#     print(f"train error: {np.mean(clf.predict(X_train)!=y_train)}")
#     print(f"test error:  {np.mean(clf.predict(X_test)!=y_test)}")
#     print(f"baseline:    {np.mean(has_tenure)}")
#print(output_l)
print(output)




             Nearest Neighbors  SVM tuned  Decision Tree  Random Forest  \
train error           0.276423   0.239837       0.174797       0.016260   
test_error            0.283019   0.301887       0.283019       0.386792   
base_line             0.423295   0.423295       0.423295       0.423295   
score                 0.716981   0.698113       0.716981       0.613208   

             AdaBoost  Naive Bayes       LDA       QDA    LogReg  
train error  0.069106     0.317073  0.239837  0.138211  0.243902  
test_error   0.349057     0.301887  0.320755  0.254717  0.330189  
base_line    0.423295     0.423295  0.423295  0.423295  0.423295  
score        0.650943     0.698113  0.679245  0.745283  0.669811  


In [99]:
from sklearn.decomposition import PCA
X= df_tenure
X=StandardScaler().fit_transform(X[feats])
X = PCA(.95).fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,has_tenure,test_size=0.3,random_state=42)

output = pd.DataFrame(index = ['train error', 'test_error', 'base_line', 'score']) 

for name,classifier in zip(names,classifiers):
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test,y_test)
    train_err = np.mean(classifier.predict(X_train)!=y_train)
    test_err = np.mean(classifier.predict(X_test)!=y_test)
    baseline = np.mean(has_tenure)
    output_l = [train_err, test_err, baseline, score]
    output[f"{name}"] = output_l
#     print(f"train error: {np.mean(clf.predict(X_train)!=y_train)}")
#     print(f"test error:  {np.mean(clf.predict(X_test)!=y_test)}")
#     print(f"baseline:    {np.mean(has_tenure)}")
#print(output_l)
print(output)

             Nearest Neighbors  SVM tuned  Decision Tree  Random Forest  \
train error           0.268293   0.239837       0.182927       0.020325   
test_error            0.311321   0.283019       0.311321       0.283019   
base_line             0.423295   0.423295       0.423295       0.423295   
score                 0.688679   0.716981       0.688679       0.716981   

             AdaBoost  Naive Bayes       LDA       QDA    LogReg  
train error  0.077236     0.243902  0.239837  0.199187  0.256098  
test_error   0.283019     0.283019  0.273585  0.301887  0.283019  
base_line    0.423295     0.423295  0.423295  0.423295  0.423295  
score        0.716981     0.716981  0.726415  0.698113  0.716981  
