In [154]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')

# Data loading and cleaning functions

In [136]:
# Root folder holding the PLIC data files. Set the PLIC_DATA_DIR environment
# variable to point somewhere else; the fallback is the path this notebook was
# originally written against, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get('PLIC_DATA_DIR', 'C:/Users/Cole/Documents/DATA/PLIC_DATA'))

# Survey export consumed by GetNewResponses (reads 'Survey_x' / 'Survey_y' and
# the per-question columns).
df_complete = pd.read_csv(DATA_DIR / 'Collective_Surveys' / 'Complete' / 'Complete_Concat.csv')

# Workbook consumed by GetOldResponses; every missing cell is replaced with 0.
df_beta = pd.read_excel(DATA_DIR / 'Data_from_development' / 'Coded_FR' / 'PLIC_beta_FR_Cole.xlsx').fillna(0)

def GetNewResponses(df, Q):
    """Stack the 'x' and 'y' survey-wave columns for question ``Q`` into one frame.

    For each wave suffix ('x', then 'y') the rows whose ``'Survey_<suffix>'``
    value equals ``'F'`` are kept, the question's columns for that wave are
    selected, and the two-character wave suffix is stripped from the column
    names so both waves share the same header. The two pieces are concatenated,
    re-indexed from 0 and every remaining NaN is replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Survey_x' and 'Survey_y' plus, for each wave, at least one
        column for ``Q`` whose name contains 'TEXT' (otherwise IndexError).
    Q : str
        Substring identifying the question's columns, e.g. 'Q1b'.

    Returns
    -------
    pandas.DataFrame
    """
    def GetPrePost(df, prepost):
        suffix = '_' + prepost

        # Columns belonging to this question and wave; names containing a
        # lowercase 'l' are skipped.
        question_cols = [name for name in df.columns
                         if Q in name and suffix in name and 'l' not in name]
        wave = df.loc[df['Survey_' + prepost] == 'F', question_cols]

        # The second '_'-separated token of the first 'TEXT' column identifies
        # an option; every column whose name contains that token is removed.
        text_cols = [name for name in wave.columns if 'TEXT' in name]
        other_col = text_cols[0].split('_')[1]
        kept_cols = [name for name in wave.columns if other_col not in name]
        wave = wave.loc[:, kept_cols]

        # Drop rows in which every column other than '<Q>_<suffix>' is missing.
        code_cols = [name for name in wave.columns if name != Q + suffix]
        wave = wave.dropna(subset = code_cols, how = 'all')

        # Strip the trailing two characters ('_x' / '_y') from each name.
        wave.columns = [name[:-2] for name in wave.columns]
        return wave

    stacked = pd.concat([GetPrePost(df, 'x'), GetPrePost(df, 'y')], axis = 0)
    return stacked.reset_index(drop = True).fillna(0)

def GetOldResponses(df, Q, collapse = True):
    """Collapse the sub-columns of question ``Q`` into one 0/1 column per code.

    Columns of ``df`` whose name contains ``Q`` are selected. Each name with an
    underscore is reduced to its first two '_'-separated tokens (with any '.1'
    removed) to give the collapsed column names. A collapsed column is 1 where
    the row-sum of the matching source columns is positive, else 0. The first
    selected column is never used as a source, and rows with no positive
    collapsed column are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from. (The function previously read the column list from
        the global ``df_beta`` instead of this argument.)
    Q : str
        Question identifier; also the name of the column returned last.
    collapse : bool, default True
        Only the collapsed form is implemented; ``False`` returns ``None``.

    Returns
    -------
    pandas.DataFrame or None
        Collapsed columns in first-seen order followed by ``Q``, index reset.
    """
    if not collapse:
        # No uncollapsed variant exists; keep the historical None result.
        return None

    # Copy so the indicator columns added below are written to an independent
    # frame rather than to a slice of ``df``.
    df_temp = df.loc[:, [col for col in df.columns if Q in col]].copy()

    # dict.fromkeys de-duplicates while preserving order, so stripping '.1'
    # cannot produce repeated names and the column order is deterministic.
    cols = list(dict.fromkeys('_'.join(col.split('_')[:2]).replace('.1', '')
                              for col in df_temp.columns if '_' in col))

    for col_new in cols:
        code = col_new.split('_')[-1]
        # Skip the first selected column; underscore-free names have no code
        # token and are ignored instead of raising IndexError.
        members = [col for col in df_temp.columns[1:]
                   if '_' in col and col.split('_')[1] == code]
        df_temp[col_new] = 1 * (df_temp.loc[:, members].sum(axis = 1) > 0)

    df_temp = df_temp.loc[df_temp[cols].sum(axis = 1) > 0, :]
    return df_temp[cols + [Q]].reset_index(drop = True)
    
def GetAllData(df_old, df_new, Q):
    """Combine old and new responses for question ``Q`` and shuffle the rows.

    ``df_old`` is passed through GetOldResponses and ``df_new`` through
    GetNewResponses. The results are concatenated row-wise with an outer join,
    restricted to the columns of the new-response frame (in that frame's order),
    and shuffled with a fixed seed (``random_state = 11``). The returned frame
    has a fresh 0..n-1 index.
    """
    old_coded = GetOldResponses(df_old, Q)
    new_coded = GetNewResponses(df_new, Q)

    combined = pd.concat([old_coded, new_coded], axis = 0, join = 'outer')
    combined = combined.loc[:, list(new_coded.columns)].reset_index(drop = True)

    shuffled = combined.sample(frac = 1, random_state = 11)
    return shuffled.reset_index(drop = True)

  interactivity=interactivity, compiler=compiler, result=result)


# ML pipeline

In [162]:
# Metric name handed to CodeFR, which forwards it to GridSearchCV's `scoring` argument.
Scoring = 'f1'
# Value handed to CodeFR, which forwards it to GridSearchCV's `cv` argument.
CV = 5

class LemmaTokenizer(object):
    """Callable tokenizer: regex-split a document, then lemmatize every token.

    Instances are meant to be passed wherever a ``tokenizer`` callable taking a
    single string is expected. Tokens are runs of word characters or a single
    '%', '+' or '-'. Requires ``WordNetLemmatizer`` from ``nltk.stem`` to be
    imported at module level.
    """
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Built once here rather than on every call; the pattern never changes.
        self.tokenizer = RegexpTokenizer(r'\w+|%|\+|\-')
    def __call__(self, doc):
        """Return the list of lemmatized tokens found in ``doc``."""
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]
    
class StemTokenizer(object):
    """Callable tokenizer: regex-split a document, then Porter-stem every token.

    Instances are meant to be passed wherever a ``tokenizer`` callable taking a
    single string is expected. Tokens are runs of word characters or a single
    '%', '+' or '-'.
    """
    def __init__(self):
        self.ps = PorterStemmer()
        # Built once here rather than on every call; the pattern never changes
        # and the tokenizer is invoked once per document.
        self.tokenizer = RegexpTokenizer(r'\w+|%|\+|\-')
    def __call__(self, doc):
        """Return the list of stemmed tokens found in ``doc``."""
        return [self.ps.stem(t) for t in self.tokenizer.tokenize(doc)]

def CodeFR(df, Q, Scoring, CV):
    """Fit and report one linear-SVM text classifier per label column.

    The text in ``df[Q]`` is vectorized with TF-IDF (stemmed unigrams and
    bigrams, English stop words removed, at most 1000 features) and fed to a
    linear SVC whose ``C`` is chosen by grid search. A fixed 80/20 train/test
    split (``random_state = 11``) is shared by all labels. For every other
    column of ``df`` the function prints the column name, the confusion matrix
    divided by the test-set size, and the F1 score on the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Column ``Q`` holds the documents; every other column is a label.
    Q : str
        Name of the text column.
    Scoring : str
        Passed to GridSearchCV as ``scoring``.
    CV : int
        Passed to GridSearchCV as ``cv``.

    Returns
    -------
    int
        Always 0; results are only printed.
    """
    Pipe = Pipeline([
        ('TFIDF', TfidfVectorizer(stop_words = 'english', tokenizer = StemTokenizer(), ngram_range = (1, 2),
                                  max_features = 1000)),
        ('SVM', SVC(kernel = 'linear', random_state = 11))
    ])

    Params = {
        'SVM__C': (0.001, 0.01, 0.1, 1),
    }
    Grid_Search = GridSearchCV(Pipe, Params, n_jobs = 1, verbose = 1, cv = CV, scoring = Scoring)

    X = df[Q]
    y = df.drop(Q, axis = 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

    for col in y_train.columns:
        # The same search object is refit from scratch for each label.
        Grid_Search.fit(X_train, y_train.loc[:, col])
        y_pred = Grid_Search.predict(X_test)
        y_true = y_test.loc[:, col]

        print(col)
        # Pin the label order so the matrix is always 2x2, even when a rare
        # label has no positives among the test rows and predictions.
        print(metrics.confusion_matrix(y_true, y_pred, labels = [0, 1]) / len(y_test))
        print(metrics.f1_score(y_true, y_pred))

    return 0

# Question Q1b

In [163]:
# Combine the old (df_beta) and new (df_complete) responses for 'Q1b' into one shuffled frame.
df_1b = GetAllData(df_beta, df_complete, 'Q1b')
# Fit and print metrics for each label column; the value displayed after the logs is CodeFR's return value (0).
CodeFR(df_1b, 'Q1b', Scoring, CV)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    7.6s finished


Q1b_16
[[0.91447368 0.        ]
 [0.04605263 0.03947368]]
0.631578947368421
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.2s finished


Q1b_28
[[0.84210526 0.00657895]
 [0.125      0.02631579]]
0.2857142857142857
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.7s finished


Q1b_2
[[0.50657895 0.125     ]
 [0.06578947 0.30263158]]
0.7603305785123968
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.5s finished


Q1b_31
[[0.90131579 0.01315789]
 [0.06578947 0.01973684]]
0.33333333333333337
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.3s finished


Q1b_5
[[0.47368421 0.06578947]
 [0.11184211 0.34868421]]
0.7969924812030075
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.6s finished


Q1b_8
[[0.91447368 0.        ]
 [0.01973684 0.06578947]]
0.8695652173913044


0