In [1]:
import pandas as pd
import glob

## Load files & create df

In [2]:
# Glob pattern matching the MTurk result CSVs.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
MTURK_RESULTS_GLOB = "/home/bryan/Documents/Code/si699/data/mturk_results/*.csv"

# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the order in which files are loaded reproducible across machines and runs.
these_files = sorted(glob.glob(MTURK_RESULTS_GLOB))

# HITId is a unique identifier for a task (a text fragment given to Turkers to label). The same text can appear under more than one HITId across trials (e.g. gold samples that were used more than once).

In [3]:
# Cut-off on the mean "yes" share: values below it are labelled 0, values at or above it are labelled 1 (Directive).
THRESHOLD = 0.5

In [4]:
# Iterate through all result files and build ground_truth_df.
#
# For every file: average "Answer.yes.1" per HITId, threshold that mean into a
# 0/1 label ("new_values"), attach the labelled text and drop exact duplicate rows.
# A file that fails at any step is recorded in `errors` / `error_messages`
# rather than aborting the whole run.

errors = []
error_messages = []
ground_truth_frames = []  # one aggregated frame per successfully processed file

for file in these_files:
    try:
        this_df = pd.read_csv(file, index_col=0)

        # Select the answer column *before* aggregating: averaging every column
        # raises on text columns (e.g. Input.TEXT) in pandas >= 2.0.
        ground_truth = this_df.groupby("HITId")["Answer.yes.1"].mean().to_frame()

        # 0 when the mean is below THRESHOLD, otherwise 1. Written as "not below"
        # so that a NaN mean still maps to 1, as an explicit if/else would.
        ground_truth["new_values"] = (~(ground_truth["Answer.yes.1"] < THRESHOLD)).astype(int)

        ground_truth = ground_truth.merge(this_df[["HITId","Input.TEXT"]], how = "left", on = "HITId")
        ground_truth = ground_truth.drop_duplicates()

        ground_truth_frames.append(ground_truth)

    except Exception as e:
        errors.append(file)
        error_messages.append(e)

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call before that); concatenate all per-file frames once instead.
ground_truth_df = pd.concat(ground_truth_frames) if ground_truth_frames else pd.DataFrame()

print(f"ground_truth_df shape: {ground_truth_df.shape}")
# errors="ignore" keeps an empty result (every file failed) from raising here,
# so the collected errors can still be inspected afterwards.
ground_truth_df = ground_truth_df.drop(columns="HITId", errors="ignore")

ground_truth_df shape: (341, 4)


In [5]:
# Files that raised while being processed, and the exception caught for each (both lists are empty on a clean run).
errors,error_messages

([], [])

In [6]:
# Row/column count after the HITId column has been dropped.
ground_truth_df.shape

(341, 3)

### Resolve duplicate gold samples

In [7]:
# Split rows by whether their text occurs more than once in ground_truth_df.
# Repeated texts are collapsed to one row each by averaging their numeric
# columns, then the 0/1 label is recomputed from the averaged score.
text_is_repeated = ground_truth_df.duplicated("Input.TEXT", keep=False)

good_samples = ground_truth_df[~text_is_repeated].sort_values("Input.TEXT")
gold_samples = ground_truth_df[text_is_repeated].sort_values("Input.TEXT")

gold_samples = gold_samples.groupby("Input.TEXT").mean()

# The averaged "new_values" column is meaningless, so overwrite it with a
# label derived from the averaged score.
new_values = [
    0 if mean_score < THRESHOLD else 1
    for mean_score in gold_samples["Answer.yes.1"]
]
gold_samples["new_values"] = new_values

# Bring Input.TEXT back from the index into a regular column before stacking.
gold_samples = gold_samples.reset_index()
ground_truth_df = pd.concat([good_samples,gold_samples])

In [8]:
# Row/column count once repeated texts have been collapsed to one row each.
ground_truth_df.shape

(284, 3)

## ADD ADDITIONAL COMMON FEATURES HERE (WHERE DATA LEAKAGE ISN'T POSSIBLE)

In [9]:
import re

In [10]:
# Per-text surface features. Each value depends only on its own row's text.
input_text = ground_truth_df["Input.TEXT"]

# 1 when the text contains a literal question mark, else 0. A plain substring
# test replaces the regex "\?", which is an invalid escape sequence when written
# in a non-raw string literal (DeprecationWarning / SyntaxWarning on newer Pythons).
ground_truth_df["indicator_question"] = input_text.str.contains("?", regex=False).astype(int)

# Whole-word occurrence counts (leading letter in either case).
# An earlier variant of these features used simple 0/1 presence flags instead of counts.
WORD_COUNT_PATTERNS = {
    "indicator_will": r"\b[Ww]ill\b",
    "indicator_would": r"\b[Ww]ould\b",
    "indicator_could": r"\b[Cc]ould\b",
    "indicator_can": r"\b[Cc]an\b",
    "indicator_dont": r"\b[Dd]on't\b",
}
for column, pattern in WORD_COUNT_PATTERNS.items():
    # Series.str.count counts non-overlapping regex matches per element, which
    # is what len(re.findall(...)) yields for these group-free patterns.
    ground_truth_df[column] = input_text.str.count(pattern)

In [11]:
import textstat

In [12]:
input_text = ground_truth_df["Input.TEXT"]

# Estimated school grade level required to understand the text (lower is easier).
ground_truth_df["readability"] = input_text.apply(
    lambda text: textstat.text_standard(text, float_output=True)
)

# Flesch Reading Ease score (lower is more confusing).
ground_truth_df["flesch_reading_ease"] = input_text.apply(
    lambda text: textstat.flesch_reading_ease(text)
)

In [13]:
# Summary statistics of the numeric columns (scores, labels and engineered features).
ground_truth_df.describe()

Unnamed: 0,Answer.yes.1,new_values,indicator_question,indicator_will,indicator_would,indicator_could,indicator_can,indicator_dont,readability,flesch_reading_ease
count,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,0.296482,0.327465,0.566901,0.373239,0.126761,0.038732,0.327465,0.309859,4.573944,94.748697
std,0.343927,0.470117,0.496379,0.595735,0.417957,0.193297,0.689512,0.642248,3.235977,12.833385
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,42.38
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,89.29
50%,0.236458,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,95.88
75%,0.5,1.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,104.34
max,1.0,1.0,1.0,3.0,3.0,1.0,5.0,4.0,17.0,117.36


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# 70% of the rows go to training; the rest form the test set (fixed seed for repeatability).
# NOTE(review): only "Input.TEXT" is selected as model input, so the indicator_* and
# readability columns computed earlier are not part of X — confirm this is intended.
model_inputs = ground_truth_df[["Input.TEXT"]]
labels = ground_truth_df["new_values"]

X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    labels,
    train_size=0.7,
    random_state=444,
)

## ADD ADDITIONAL FEATURES HERE (WHERE DATA LEAKAGE IS POSSIBLE)
* e.g. standardization should be fit on the training data only and then applied, unchanged, to the test data

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF over word 1- to 3-grams; tokens are runs of lowercase ASCII letters
# taken after accent stripping and lowercasing, with English stop words removed.
word_vectorizer = TfidfVectorizer(
    strip_accents="unicode",
    lowercase=True,
    stop_words="english",
    analyzer="word",
    token_pattern="[a-z]+",
    smooth_idf=True,
    ngram_range=(1, 3),
)

In [18]:
from sklearn.compose import ColumnTransformer

In [19]:
# Apply the TF-IDF vectorizer to the text column; any other columns pass through unchanged.
tfidf_step = ("tfidf", word_vectorizer, "Input.TEXT")
column_trans = ColumnTransformer(transformers=[tfidf_step], remainder="passthrough")

# Fit on the training rows only, then reuse the fitted transformer for the test rows.
train_transformed = column_trans.fit_transform(X_train)
test_transformed = column_trans.transform(X_test)

# Dummy Classifier

In [20]:
from sklearn.dummy import DummyClassifier

In [21]:
# Baseline: predictions drawn at random according to the training class distribution.
# Alternative baseline: DummyClassifier(strategy = "most_frequent", random_state = 8)
dummy = DummyClassifier(strategy="stratified", random_state=8)

# fit() returns the estimator itself, so fitting and predicting can be chained.
dummy_preds = dummy.fit(train_transformed, y_train).predict(test_transformed)

In [22]:
from sklearn.metrics import confusion_matrix

# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
# y_test and the predictions, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, dummy_preds, labels=[0, 1]).ravel()
print(f"TN: {tn}\nFP: {fp}\nFN: {fn}\nTP: {tp}")

# Guard every ratio against a zero denominator: a classifier that never predicts
# the positive class (e.g. a "most_frequent" dummy) has tp + fp == 0, which would
# otherwise yield nan for precision and F1. Such undefined scores are reported as 0.0.
precision = tp/(tp+fp) if (tp+fp) else 0.0
recall = tp/(tp+fn) if (tp+fn) else 0.0
f1 = round((2*precision*recall)/(precision+recall),3) if (precision+recall) else 0.0

print(f"Precision: {round(precision,2)}")
print(f"Recall: {round(recall,2)}")
print(f"F1: {round(f1,2)}")

TN: 48
FP: 17
FN: 12
TP: 9
Precision: 0.35
Recall: 0.43
F1: 0.38


# Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [24]:
%%time

lr = LogisticRegression(random_state=8)

# lr_param_grid = [{"penalty":["l2","none","l1"],
#               "C":[10000,1000],
#                   "fit_intercept": [True, False],
#                   "class_weight": [None, "balanced"],
# #                   "solver": ["newton-cg","sag","saga"]
#                   "solver": ["lbfgs","liblinear", "saga"]
#               }]

lr_param_grid = [{"penalty":["l2","l1"],
              "C":[10000,1000,10,1],
                  "fit_intercept": [True, False],
                  "class_weight": [None, "balanced"],
                  "solver": ["liblinear"],
                  "n_jobs": [-1]
              },
                 {"penalty":["l2"],
                  "C":[10000,1000, 10, 1],
                  "fit_intercept": [True, False],
                  "class_weight": [None, "balanced"],
                  "solver": ["lbfgs", "saga"],
                  "n_jobs": [-1] 
              }]
lr_gs = GridSearchCV(
    estimator = lr,
    param_grid = lr_param_grid,
    cv=10
)
lr_gs.fit(train_transformed, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


CPU times: user 3.73 s, sys: 145 ms, total: 3.87 s
Wall time: 11.7 s


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=8, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [10000, 1000, 10, 1],
                          'class_weight': [None, 'balanced'],
                          'fit_intercept': [True, False], 'n_jobs': [-1],
                          'penalty': ['l2', 'l1'], 'solver': ['liblinear']},
                         {'C': [10000, 1000, 10, 1],
                          'class_we

In [25]:
# Hyperparameter combination selected by the grid search.
lr_gs.best_params_

{'C': 1000,
 'class_weight': 'balanced',
 'fit_intercept': True,
 'n_jobs': -1,
 'penalty': 'l2',
 'solver': 'liblinear'}

In [26]:
# The logistic regression estimator configured with the selected hyperparameters.
lr_gs.best_estimator_

LogisticRegression(C=1000, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=8, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
# Predict labels for the held-out test rows with the fitted grid search.
lr_preds = lr_gs.predict(test_transformed)

In [28]:
# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
# y_test and the predictions, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, lr_preds, labels=[0, 1]).ravel()
print(f"TN: {tn}\nFP: {fp}\nFN: {fn}\nTP: {tp}")

# Guard every ratio against a zero denominator (e.g. no positive predictions
# gives tp + fp == 0); such undefined scores are reported as 0.0 instead of nan.
precision = tp/(tp+fp) if (tp+fp) else 0.0
recall = tp/(tp+fn) if (tp+fn) else 0.0
f1 = round((2*precision*recall)/(precision+recall),3) if (precision+recall) else 0.0

print(f"Precision: {round(precision,2)}")
print(f"Recall: {round(recall,2)}")
print(f"F1: {round(f1,2)}")

TN: 59
FP: 6
FN: 14
TP: 7
Precision: 0.54
Recall: 0.33
F1: 0.41


## SVM

In [29]:
from sklearn import svm

In [30]:
# Import the class directly instead of reaching it through the `svm` module name:
# the assignment below rebinds `svm` to the estimator, so an expression like
# `svm.SVC(...)` would raise AttributeError if this cell were executed a second time.
from sklearn.svm import SVC

svm = SVC(random_state=8)

In [31]:
# Grid-search SVC hyperparameters with 10-fold cross-validation on the training set.
# NOTE(review): per the scikit-learn docs `degree` only applies to the poly kernel
# and `gamma` is unused by the linear kernel, so part of this grid likely refits
# equivalent models — consider one sub-grid per kernel.
svm_param_grid = [
    {
        "C": [10000, 1000, 10, 1],
        "kernel": ["linear", "rbf", "poly"],
        "degree": [2, 3],
        "gamma": ["scale", "auto"],
        "class_weight": ["balanced", None],
    }
]

svm_gs = GridSearchCV(estimator=svm, param_grid=svm_param_grid, cv=10)
svm_gs.fit(train_transformed, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=8, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [10000, 1000, 10, 1],
                          'class_weight': ['balanced', None], 'degree': [2, 3],
                          'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'rbf', 'poly']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [32]:
# Hyperparameter combination selected by the grid search.
svm_gs.best_params_

{'C': 1,
 'class_weight': 'balanced',
 'degree': 2,
 'gamma': 'scale',
 'kernel': 'linear'}

In [33]:
# The SVC estimator configured with the selected hyperparameters.
svm_gs.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=2, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=8, shrinking=True, tol=0.001,
    verbose=False)

In [34]:
# Predict labels for the held-out test rows with the fitted grid search.
svm_preds = svm_gs.predict(test_transformed)

In [35]:
# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
# y_test and the predictions, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, svm_preds, labels=[0, 1]).ravel()
print(f"TN: {tn}\nFP: {fp}\nFN: {fn}\nTP: {tp}")

# Guard every ratio against a zero denominator (e.g. no positive predictions
# gives tp + fp == 0); such undefined scores are reported as 0.0 instead of nan.
precision = tp/(tp+fp) if (tp+fp) else 0.0
recall = tp/(tp+fn) if (tp+fn) else 0.0
f1 = round((2*precision*recall)/(precision+recall),3) if (precision+recall) else 0.0

print(f"Precision: {round(precision,2)}")
print(f"Recall: {round(recall,2)}")
print(f"F1: {round(f1,2)}")

TN: 57
FP: 8
FN: 14
TP: 7
Precision: 0.47
Recall: 0.33
F1: 0.39
