In [64]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [65]:
def normalize_features(X_train, X_test):
    from sklearn.preprocessing import StandardScaler # import library 
    scaler = StandardScaler() # call an object function
    scaler.fit(X_train)   # calculate mean, std in X_train  (x-u)/s
    X_train_norm = scaler.transform(X_train)  # apply normalization on X_train
    X_test_norm = scaler.transform(X_test)    # apply normalization on X_test
    return X_train_norm, X_test_norm

In [66]:
#load concatenated data in csv format

data = pd.read_csv('Causal_Text.csv')

data.drop('Unnamed: 0', inplace=True, axis=1)
data.drop('Unnamed: 1', inplace=True, axis=1)
data.drop('Unnamed: 2', inplace=True, axis=1)
data.drop('Unnamed: 3', inplace=True, axis=1)
data.drop('Unnamed: 4', inplace=True, axis=1)
data.drop('Unnamed: 5', inplace=True, axis=1)
data.drop('Unnamed: 6', inplace=True, axis=1)

print(data.head())

X = data.Text
y = data.Contains_Relation


                                                Text  Contains_Relation
0  "We find that callers experience three to six ...                  1
1  "Empirically, higher volatility of electricity...                  1
2  "I find that the mission, whether matched or r...                  1
3  "This suggests that superstitious belief still...                  1
4  "The change in the sampling level is indetermi...                  1


In [67]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X) #create bag of words

X_train, X_test, y_train, y_test = train_test_split(X, y) #split data 

#X_train_norm, X_test_norm = normalize_features(X_train, X_test)


In [68]:
#summary of words in data

feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("Last 15 Features: {}".format(feature_names[(2857-15):2857]))
vectorizer.vocabulary_


Number of features: 6338
Last 15 Features: ['income', 'incoming', 'incompatible', 'incomplete', 'incongruent', 'inconsistencies', 'inconsistency', 'inconsistent', 'incorporated', 'incorporates', 'incorporating', 'incorporation', 'increase', 'increased', 'increases']




{'we': 6066,
 'find': 2330,
 'that': 5630,
 'callers': 891,
 'experience': 2171,
 'three': 5688,
 'to': 5715,
 'six': 5199,
 'times': 5707,
 'less': 3260,
 'discomfort': 1721,
 'per': 4127,
 'unit': 5879,
 'of': 3862,
 'time': 5704,
 'while': 6107,
 'waiting': 6042,
 'for': 2384,
 'callbacks': 889,
 'than': 5627,
 'in': 2821,
 'queue': 4537,
 'suggesting': 5471,
 'offering': 3869,
 'can': 899,
 'increase': 2854,
 'service': 5086,
 'quality': 4523,
 'by': 870,
 'channeling': 997,
 'an': 410,
 'alternative': 390,
 'channel': 995,
 'where': 6099,
 'they': 5671,
 'empirically': 1968,
 'higher': 2671,
 'volatility': 6028,
 'electricity': 1942,
 'prices': 4349,
 'leads': 3236,
 'cash': 939,
 'holdings': 2698,
 'and': 426,
 'this': 5680,
 'effect': 1917,
 'is': 3100,
 'robust': 4912,
 'instrumenting': 2983,
 'price': 4346,
 'risk': 4898,
 'using': 5936,
 'weather': 6077,
 'the': 5632,
 'mission': 3571,
 'whether': 6104,
 'matched': 3448,
 'or': 3953,
 'random': 4565,
 'increases': 2856,
 'eff

In [69]:
#performance metrics 

LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)

CV_scores = cross_val_score(LogReg, X_train, y_train, cv = 5)
print("Cross Validation Mean Score: {}".format(np.mean(CV_scores)))

print("Training Score: {}".format(LogReg.score(X_train, y_train)))
print("Testing Score: {}".format(LogReg.score(X_test, y_test)))


Cross Validation Mean Score: 0.9349189236760468
Training Score: 0.9814077025232404
Testing Score: 0.9284294234592445


In [70]:
#confusion matrix 

y_pred = LogReg.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)

print("Confusion Matrix: {}".format(confusion))

Confusion Matrix: [[225  18]
 [ 18 242]]


In [79]:
#Examine predicted probabilities vs class labels

probabilities = LogReg.predict_proba(X_test)

probabilities = np.delete(probabilities, 0, axis=1)

y_prob = []
for i in range(probabilities.shape[0]):
    y_prob.append(np.around(probabilities[i],2))
    
y_prob = np.array(y_prob).ravel()

MSE = np.mean((np.array(y_test) - np.array(y_prob))**2)
print("Mean Squared Error: {}".format(MSE))


Mean Squared Error: 0.05502425447316104
