In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import metrics
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

In C:\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.frameon rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.level rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.
In C:\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The verbose.fileo rcparam was deprecated in Matplotlib 3.1 and will be removed in 3.3.


# Load and clean data functions

In [2]:
# Load the two source tables used by every question cell below.
# df_complete is later passed as `df_new` and df_beta as `df_old` to GetAllData.
# NOTE(review): both paths are absolute and machine-specific; the notebook will not
# run elsewhere until they are pointed at a local copy of the data.
df_complete = pd.read_csv('C:/Users/Cole/Documents/DATA/PLIC_DATA/Collective_Surveys/Complete/Complete_Concat.csv')
# Missing cells in the coded spreadsheet are replaced by 0 at load time.
df_beta = pd.read_excel('C:/Users/Cole/Documents/DATA/PLIC_DATA/Data_from_development/Coded_FR/PLIC_beta_FR_Tim+Saaj_collapsed.xlsx').fillna(0)

def GetQColumns(df, Q, suffix = ''):
    """Select the columns of `df` that belong to question `Q`, minus its 'TEXT' item.

    A column is kept when its name contains `Q`, contains '_' + `suffix`, and does
    not contain the letter 'l'. Among those, the first column whose name contains
    'TEXT' identifies an item code (the second '_'-separated field of that name);
    every column whose name contains that code is then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.
    Q : str
        Question identifier to look for in column names.
    suffix : str, optional
        Text expected after an underscore in the column name (default '').

    Returns
    -------
    pandas.DataFrame
        The selected columns of `df`.

    Raises
    ------
    IndexError
        If no selected column name contains 'TEXT'.
    """
    marker = '_' + suffix
    wanted = [name for name in df.columns if Q in name and marker in name and 'l' not in name]
    selected = df.loc[:, wanted]

    # The free-text companion column tells us which item code to discard.
    text_columns = [name for name in selected.columns if 'TEXT' in name]
    other_code = text_columns[0].split('_')[1]

    remaining = [name for name in selected.columns if other_code not in name]
    return selected.loc[:, remaining]

def GetNewResponses(df, Q):
    """Stack the pre ('x') and post ('y') responses to question `Q` into one frame.

    For each suffix, only rows whose 'Survey_<suffix>' value equals 'F' are used.
    The question's columns are picked with GetQColumns, rows in which every column
    other than '<Q>_<suffix>' is missing are dropped, and the last two characters
    of each column name are stripped so both halves share the same column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Survey_x' / 'Survey_y' and the suffixed question columns.
    Q : str
        Question identifier.

    Returns
    -------
    pandas.DataFrame
        Pre rows followed by post rows, with a fresh index and missing values
        replaced by 0.
    """
    def _responses_for(suffix):
        # Column '<Q>_<suffix>' is excluded from the "all missing" test below.
        response_col = Q + '_' + suffix
        flagged = df.loc[df['Survey_' + suffix] == 'F', :]
        question_cols = GetQColumns(flagged, Q, suffix)
        code_cols = [name for name in question_cols.columns if name != response_col]
        coded = question_cols.dropna(subset = code_cols, how = 'all')
        # Strip the trailing two characters so pre and post frames line up on concat.
        coded.columns = [name[:-2] for name in coded.columns]
        return coded

    stacked = pd.concat([_responses_for('x'), _responses_for('y')], axis = 0)
    return stacked.reset_index(drop = True).fillna(0)

def GetOldResponses(df, Q, collapse = True, reset = True):
    """Extract the coded columns for question `Q` from a previously coded frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Coded responses; only columns whose name contains `Q` are used.
    Q : str
        Question identifier. The returned frame always ends with this column.
    collapse : bool, optional
        If True, build one 0/1 column per '<Q>_<code>' prefix that is 1 wherever
        any column carrying that code (every column but the first is inspected)
        sums to more than 0. If False, rename the columns in place to
        [Q, '<Q>_<code>', ...] instead.
    reset : bool, optional
        If True, keep only rows where at least one code column is positive and
        renumber the index.

    Returns
    -------
    pandas.DataFrame
        The code columns followed by the `Q` column. In collapse mode the order of
        the code columns comes from iterating a set.
    """
    coded = df.loc[:, [name for name in df.columns if Q in name]]

    if collapse:
        prefixes = list(set(['_'.join(name.split('_')[:2]) for name in coded.columns if '_' in name]))
        code_cols = [prefix.replace('.1', '') for prefix in prefixes]
        for code_col in code_cols:
            code = code_col.split('_')[-1]
            # The column list is re-read on every pass, so columns added by
            # earlier passes are inspected as well.
            members = [name for name in coded.columns[1:] if code == name.split('_')[1]]
            coded[code_col] = 1 * (coded.loc[:, members].sum(axis = 1) > 0)
    else:
        code_cols = ['_'.join(name.split('_')[:2]).replace('.1', '') for name in coded.columns if '_' in name]
        coded.columns = [Q] + code_cols

    if reset:
        # Drop rows in which no code was assigned.
        coded = coded.loc[coded[code_cols].sum(axis = 1) > 0, :]

    code_cols.append(Q)
    selected = coded[code_cols]
    return selected.reset_index(drop = True) if reset else selected
    
def GetAllData(df_old, df_new, Q):
    """Combine old and new coded responses for question `Q` into one shuffled frame.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Input for GetOldResponses.
    df_new : pandas.DataFrame
        Input for GetNewResponses.
    Q : str
        Question identifier.

    Returns
    -------
    pandas.DataFrame
        Old rows followed by new rows, restricted to (and ordered by) the columns
        of the new responses, then shuffled with a fixed seed (random_state = 11)
        and re-indexed from 0.
    """
    old_responses = GetOldResponses(df_old, Q)
    new_responses = GetNewResponses(df_new, Q)

    combined = pd.concat([old_responses, new_responses], axis = 0, join = 'outer')
    # Only the columns present in the new responses are kept.
    combined = combined.loc[:, list(new_responses.columns)].reset_index(drop = True)

    shuffled = combined.sample(frac = 1, random_state = 11)
    return shuffled.reset_index(drop = True)

def CompareHumans(Q, who, cols = None):
    """Load two human coders' codes for question `Q`, restricted to rows both coded.

    Parameters
    ----------
    Q : str
        Question identifier.
    who : str
        'NH' compares the NH and CW comparison-set spreadsheets; any other value
        compares the undergraduate ("Tim+Saaj") and CW coded beta spreadsheets.
    cols : list of str, optional
        Columns to keep from each frame. Required when `who` is not 'NH' and
        ignored when it is.

    Returns
    -------
    tuple of pandas.DataFrame
        (first coder, second coder), both filtered to the rows in which each
        coder assigned at least one code.

    Raises
    ------
    ValueError
        If `who` is not 'NH' and `cols` is None.

    Notes
    -----
    NOTE(review): the spreadsheets are re-read from absolute, machine-specific
    paths on every call.
    """
    if(who == 'NH'):
        df_nh = pd.read_excel('C:/Users/Cole/Documents/DATA/PLIC_DATA/Coded_OR/NH/Comparison_Set_NH.xlsx', skiprows = [1])
        df_cw_small = pd.read_excel('C:/Users/Cole/Documents/DATA/PLIC_DATA/Coded_OR/CW/Comparison_Set_CW.xlsx', skiprows = [1])

        # A non-empty cell means the code was assigned; convert to 0/1 indicators.
        df_nh = GetQColumns(df_nh, Q).notnull().astype(int)
        df_cw_small = GetQColumns(df_cw_small, Q).notnull().astype(int)
        rows = (df_nh.sum(axis = 1) > 0) & (df_cw_small.sum(axis = 1) > 0)
        df_nh = df_nh.loc[rows, :]
        df_cw_small = df_cw_small.loc[rows, :]

        return df_nh, df_cw_small
    else:
        if cols is None:
            # Fail before the (slow) spreadsheet reads rather than inside `.loc`.
            raise ValueError("cols must be provided when who is not 'NH'")

        df_undergrads = pd.read_excel('C:/Users/Cole/Documents/DATA/PLIC_DATA/Data_from_development/Coded_FR/PLIC_beta_FR_Tim+Saaj_collapsed.xlsx').fillna(0)
        df_cw_large = pd.read_excel('C:/Users/Cole/Documents/DATA/PLIC_DATA/Data_from_development/Coded_FR/PLIC_beta_FR_CW_collapsed.xlsx', skiprows = [1]).fillna(0)
        df_undergrads = GetOldResponses(df_undergrads, Q, False, False).loc[:, cols]
        df_cw_large = GetOldResponses(df_cw_large, Q, True, False).loc[:, cols]

        # `cols` may include the free-text response column, so restrict the row
        # sums to numeric columns explicitly. Relying on pandas to drop the text
        # column silently stops working on pandas >= 2.0 (TypeError).
        undergrads_coded = df_undergrads.sum(axis = 1, numeric_only = True) > 0
        cw_coded = df_cw_large.sum(axis = 1, numeric_only = True) > 0
        rows = undergrads_coded & cw_coded
        df_undergrads = df_undergrads.loc[rows, :]
        df_cw_large = df_cw_large.loc[rows, :]

        return df_undergrads, df_cw_large

# ML pipeline

In [3]:
# Settings handed to CodeFR in the question cells below, where they become the
# `scoring` and `cv` arguments of GridSearchCV.
Scoring = 'f1'
CV = 5

class LemmaTokenizer(object):
    """Callable tokenizer that splits a document and lemmatizes every token.

    Tokens are runs of word characters or a single '%', '+' or '-'. Instances can
    be passed as the `tokenizer` argument of TfidfVectorizer.

    NOTE(review): WordNetLemmatizer presumably needs the NLTK WordNet corpus to be
    downloaded before `lemmatize` is called; confirm in the target environment.
    """
    def __init__(self):
        # WordNetLemmatizer is not imported by the notebook's import cell, so the
        # original constructor raised NameError. Import it here so the class
        # works on its own.
        from nltk.stem import WordNetLemmatizer
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        """Return the list of lemmatized tokens found in the string `doc`."""
        Tokenizer = RegexpTokenizer(r'\w+|%|\+|\-')
        return [self.wnl.lemmatize(t) for t in Tokenizer.tokenize(doc)]
    
class StemTokenizer(object):
    """Callable tokenizer that splits a document and Porter-stems every token.

    Tokens are runs of word characters or a single '%', '+' or '-'. CodeFR passes
    an instance as the `tokenizer` argument of TfidfVectorizer.
    """
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        """Return the list of stemmed tokens found in the string `doc`."""
        splitter = RegexpTokenizer(r'\w+|%|\+|\-')
        stems = []
        for token in splitter.tokenize(doc):
            stems.append(self.ps.stem(token))
        return stems

def _ScoreHumanAgreement(label, df_coder1, df_coder2, col):
    """Print the confusion matrix between two coders for `col` and return their F1 (None on failure)."""
    print('\n' + label + '\n')
    try:
        print(metrics.confusion_matrix(df_coder1.loc[:, col], df_coder2.loc[:, col]))
        return metrics.f1_score(df_coder1.loc[:, col], df_coder2.loc[:, col])
    except Exception as err:
        # Best effort: a code missing from the comparison set must not stop the
        # run, but say why the score is absent instead of hiding it.
        print('Could not score ' + str(col) + ': ' + repr(err))
        return None

def CodeFR(df, Q, Scoring, CV):
    """Train one text classifier per code column of `df` and compare with human coders.

    For every code column with at least two distinct values in the training
    split, a TF-IDF (stemmed uni/bi-grams, 1000 features) + linear SVM pipeline
    is tuned over `SVM__C` with GridSearchCV, evaluated on a held-out 20% test
    split, and its confusion matrix is printed next to the human-vs-human
    confusion matrices returned by CompareHumans.

    Parameters
    ----------
    df : pandas.DataFrame
        One free-text column named `Q` plus one 0/1 column per code.
    Q : str
        Name of the free-text column / question identifier.
    Scoring : str
        Passed as `scoring` to GridSearchCV.
    CV : int
        Passed as `cv` to GridSearchCV.

    Returns
    -------
    pandas.DataFrame
        One row per scored code with columns 'Q' (code name), 'ML' and 'NH' F1
        scores, plus 'Undergrads' when `Q` is one of 'Q1b', 'Q1d', 'Q1e'.
        Human scores are None where they could not be computed.
    """
    Pipe = Pipeline([
                    ('TFIDF', TfidfVectorizer(stop_words = 'english', tokenizer = StemTokenizer(), ngram_range = (1, 2),
                                              max_features = 1000)),
                    ('SVM', SVC(kernel = 'linear', random_state = 11))
                    ])

    Params = {
            'SVM__C': (0.001, 0.01, 0.1, 1),
            }
    Grid_Search = GridSearchCV(Pipe, Params, n_jobs = 1, verbose = 1, cv = CV, scoring = Scoring)

    # Fill missing responses BEFORE casting: astype(str) turns NaN into the
    # literal string 'nan', after which fillna('') has nothing left to fill.
    X = df[Q].fillna('').astype(str)
    y = df.drop(Q, axis = 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

    # Only these questions are compared against the undergraduate coders.
    has_undergrads = Q in ['Q1b', 'Q1d', 'Q1e']

    df_new1, df_new2 = CompareHumans(Q, 'NH')
    if has_undergrads:
        # Skip the spreadsheet reads entirely when the result would be unused.
        df_old1, df_old2 = CompareHumans(Q, 'undergrads', [Q] + list(y_train.columns))

    scored_cols = []
    F1_scores_ML = []
    F1_scores_NH = []
    F1_scores_Undergrads = []
    for col in y_train.columns:
        # A code with a single class in the training split cannot be learned.
        if y_train[col].nunique() < 2:
            continue
        scored_cols.append(col)

        # Rows with a missing label are left out of both fitting and evaluation.
        na_rows_train = y_train[col].isna().values
        Grid_Search.fit(X_train[~na_rows_train], y_train.loc[~na_rows_train, col])

        na_rows_test = y_test[col].isna().values
        y_pred = Grid_Search.predict(X_test[~na_rows_test])

        print(col)

        print('\nML\n')
        print(metrics.confusion_matrix(y_test.loc[~na_rows_test, col], y_pred))
        F1_scores_ML.append(metrics.f1_score(y_test.loc[~na_rows_test, col], y_pred))

        F1_scores_NH.append(_ScoreHumanAgreement('NH', df_new1, df_new2, col))

        if has_undergrads:
            F1_scores_Undergrads.append(_ScoreHumanAgreement('Undergrads', df_old1, df_old2, col))

    results = {'Q': scored_cols, 'ML': F1_scores_ML, 'NH': F1_scores_NH}
    if has_undergrads:
        results['Undergrads'] = F1_scores_Undergrads
    return pd.DataFrame(results)

# Question Q1B

In [5]:
# Build the shuffled coded-response table for Q1b from the beta and complete data,
# then fit and score one classifier per code; the returned F1 table is the cell output.
df_1b = GetAllData(df_beta, df_complete, 'Q1b')
CodeFR(df_1b, 'Q1b', Scoring, CV)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.2s finished


Q1b_16

ML

[[137   1]
 [  7   7]]

NH

[[48  0]
 [ 1  0]]

Undergrads

[[417   3]
 [  5  28]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.2s finished


Q1b_28

ML

[[121   3]
 [ 24   4]]

NH

[[31  2]
 [ 5 11]]

Undergrads

[[392  19]
 [  9  33]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.5s finished


Q1b_2

ML

[[70 22]
 [14 46]]

NH

[[33  6]
 [ 5  5]]

Undergrads

[[200  17]
 [ 19 217]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.1s finished


Q1b_31

ML

[[143   2]
 [  4   3]]

NH

[[47  1]
 [ 0  1]]

Undergrads

[[394  12]
 [  3  44]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.1s finished


Q1b_5

ML

[[72 11]
 [21 48]]

NH

[[19  2]
 [ 2 26]]

Undergrads

[[237   9]
 [ 14 193]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    7.8s finished


Q1b_8

ML

[[137   2]
 [  3  10]]

NH

[[44  0]
 [ 1  4]]

Undergrads

[[412   0]
 [  2  39]]


Unnamed: 0,Q,ML,NH,Undergrads
0,Q1b_16,0.636364,0.0,0.875
1,Q1b_28,0.228571,0.758621,0.702128
2,Q1b_2,0.71875,0.47619,0.923404
3,Q1b_31,0.5,0.666667,0.854369
4,Q1b_5,0.75,0.928571,0.943765
5,Q1b_8,0.8,0.888889,0.975


# Question Q1D

In [35]:
# Same workflow for Q1d: assemble the coded responses, then fit and score one
# classifier per code; the returned F1 table is the cell output.
df_1d = GetAllData(df_beta, df_complete, 'Q1d')
CodeFR(df_1d, 'Q1d', Scoring, CV)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    7.0s finished


Q1d_1

ML

[[61 15]
 [11 47]]

NH

[[21  0]
 [ 1 21]]

Undergrads

[[36  1]
 [ 3 34]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.3s finished


Q1d_3

ML

[[86  6]
 [14 28]]

NH

[[33  0]
 [ 1  9]]

Undergrads

[[39  0]
 [ 3 32]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished


Q1d_51

ML

[[129   0]
 [  5   0]]

NH

[[40  0]
 [ 2  1]]

Undergrads

[[67  1]
 [ 5  1]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.7s finished


Q1d_53

ML

[[114   2]
 [ 14   4]]

NH

[[39  1]
 [ 0  3]]

Undergrads

[[62  2]
 [ 6  4]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.2s finished


Q1d_55

ML

[[124   0]
 [  5   5]]

NH

[[38  0]
 [ 1  4]]

Undergrads

[[66  1]
 [ 1  6]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished


Q1d_57

ML

[[118   2]
 [ 10   4]]

NH

[[40  2]
 [ 0  1]]

Undergrads

[[61  1]
 [ 2 10]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished


Q1d_59

ML

[[122   1]
 [ 10   1]]

NH

[[38  0]
 [ 2  3]]

Undergrads

[[40 21]
 [ 5  8]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.2s finished


Q1d_5

ML

[[101   4]
 [  6  23]]

NH

[[26  1]
 [ 1 15]]

Undergrads

[[56  0]
 [ 2 16]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    7.3s finished


Q1d_61

ML

[[129   0]
 [  5   0]]

NH

[[43]]

Undergrads

[[66  1]
 [ 5  2]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished


Q1d_63

ML

[[110   5]
 [ 15   4]]

NH

[[39  1]
 [ 1  2]]

Undergrads

[[64  3]
 [ 4  3]]


Unnamed: 0,Q,ML,NH,Undergrads
0,Q1d_1,0.783333,0.976744,0.944444
1,Q1d_3,0.736842,0.947368,0.955224
2,Q1d_51,0.0,0.5,0.25
3,Q1d_53,0.333333,0.857143,0.5
4,Q1d_55,0.666667,0.888889,0.857143
5,Q1d_57,0.4,0.5,0.869565
6,Q1d_59,0.153846,0.75,0.380952
7,Q1d_5,0.821429,0.9375,0.941176
8,Q1d_61,0.0,0.0,0.4
9,Q1d_63,0.285714,0.666667,0.461538


# Question Q1E

In [36]:
# Same workflow for Q1e: assemble the coded responses, then fit and score one
# classifier per code; the returned F1 table is the cell output.
df_1e = GetAllData(df_beta, df_complete, 'Q1e')
CodeFR(df_1e, 'Q1e', Scoring, CV)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    4.0s finished


Q1e_13

ML

[[120   1]
 [  8  12]]

NH

[[45  0]
 [ 0  5]]

Undergrads

[[63  1]
 [ 0 11]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    4.2s finished


Q1e_14

ML

[[117   9]
 [  9   6]]

NH

[[46  0]
 [ 0  4]]

Undergrads

[[54  4]
 [10  7]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.6s finished


Q1e_16

ML

[[132   0]
 [  9   0]]

NH

[[48  0]
 [ 0  2]]

Undergrads

[[72  0]
 [ 1  2]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.8s finished


Q1e_17

ML

[[133   0]
 [  6   2]]

NH

[[50]]

Undergrads

[[71  1]
 [ 1  2]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.5s finished


Q1e_18

ML

[[137   0]
 [  4   0]]

NH

[[49  0]
 [ 1  0]]

Undergrads

[[72  0]
 [ 0  3]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.9s finished


Q1e_19

ML

[[133   0]
 [  8   0]]

NH

[[48  0]
 [ 0  2]]

Undergrads

[[67  0]
 [ 5  3]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    6.0s finished


Q1e_1

ML

[[45 11]
 [11 74]]

NH

[[15  0]
 [ 1 34]]

Undergrads

[[24  2]
 [ 4 45]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.7s finished


Q1e_20

ML

[[131   0]
 [  1   9]]

NH

[[44  0]
 [ 1  5]]

Undergrads

[[67  0]
 [ 0  8]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    6.2s finished


Q1e_23

ML

[[130   2]
 [  3   6]]

NH

[[47  0]
 [ 1  2]]

Undergrads

[[69  0]
 [ 1  5]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.8s finished


Q1e_24

ML

[[135   0]
 [  6   0]]

NH

[[50]]

Undergrads

[[69  0]
 [ 4  2]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.5s finished


Q1e_28

ML

[[141]]

NH

[[50]]

Undergrads

[[75]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    5.7s finished


Q1e_4

ML

[[126   1]
 [  9   5]]

NH

[[46  0]
 [ 0  4]]

Undergrads

[[66  2]
 [ 0  7]]


Unnamed: 0,Q,ML,NH,Undergrads
0,Q1e_13,0.727273,1.0,0.956522
1,Q1e_14,0.4,1.0,0.5
2,Q1e_16,0.0,1.0,0.8
3,Q1e_17,0.4,0.0,0.666667
4,Q1e_18,0.0,0.0,1.0
5,Q1e_19,0.0,1.0,0.545455
6,Q1e_1,0.870588,0.985507,0.9375
7,Q1e_20,0.947368,0.909091,1.0
8,Q1e_23,0.705882,0.8,0.909091
9,Q1e_24,0.0,0.0,0.5


In [6]:
# Same workflow for Q2b. CodeFR returns no 'Undergrads' column for this question
# because Q2b is not in its list of undergraduate-coded questions.
# NOTE(review): this cell's execution count (6) is out of sequence with the cells
# above (35, 36); re-run the notebook top to bottom before trusting the outputs.
df_2b = GetAllData(df_beta, df_complete, 'Q2b')
CodeFR(df_2b, 'Q2b', Scoring, CV)

677