### Define the network

In [20]:
import numpy as np
import pandas as pd
from collections import OrderedDict
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
import sklearn.neighbors
import joblib

In [11]:
word_embeddings = pd.read_csv('glove.6B.50d.txt.zip',
                               header=None, sep=' ', index_col=0,
                               nrows=100000, compression='zip', encoding='utf-8', quoting=3)
word_list = word_embeddings.index.values.tolist()
word2vec = OrderedDict(zip(word_list, word_embeddings.values))

In [12]:
#two numpy arrays
def accuracy(pred,actual):
    cor = 0
    for i in range(pred.shape[0]):
        if round(pred[i].item()) == actual[i]:
            cor += 1
    return cor / pred.shape[0]

## Control Metrics for Unbalacned Set

In [13]:
testDf = pd.read_csv("testu.csv", delimiter ="|")
trainDf = pd.read_csv("trainu.csv",  delimiter ="|")

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(trainDf['text'])
testX = tfidf.transform(testDf['text'])

clf = LogisticRegression(random_state=0, max_iter = 250).fit(X, trainDf['class'])
predictions = clf.predict(testX)

print(accuracy(predictions,testDf["class"]))
print(f1_score(predictions,testDf["class"]))
print(precision_score(predictions,testDf["class"]))
print(recall_score(predictions,testDf["class"]))

0.9730664602834729
0.016944573521942577
0.00872565386755704
0.2917752139933011


In [14]:
X = []
for response in trainDf['text']:
    v=np.zeros(50)
    response = response.split()
    for word in response:
        v+=word2vec.get(word,np.zeros(50))
    v=v/len(response)
    X.append(v)
testX = []
for response in testDf['text']:
    v=np.zeros(50)
    response = response.split()
    for word in response:
        v+=word2vec.get(word,np.zeros(50))
    v=v/len(response)
    testX.append(v)

In [15]:
clf = LogisticRegression(random_state=0, max_iter = 250).fit(np.array(X), trainDf['class'])
predictions = clf.predict(np.array(testX))
print(accuracy(predictions,testDf["class"]))
print(f1_score(predictions,testDf["class"]))
print(precision_score(predictions,testDf["class"]))
print(recall_score(predictions,testDf["class"]))

0.9733967306344034
0.0
0.0
0.0


## Control Metrics for Balacned Set

In [16]:
testDf = pd.read_csv("testb.csv", delimiter ="|")
trainDf = pd.read_csv("trainb.csv",  delimiter ="|")

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(trainDf['text'])
testX = tfidf.transform(testDf['text'])

clf = LogisticRegression(random_state=0, max_iter = 250).fit(X, trainDf['class'])
predictions = clf.predict(testX)

print(accuracy(predictions,testDf["class"]))
print(f1_score(predictions,testDf["class"]))
print(precision_score(predictions,testDf["class"]))
print(recall_score(predictions,testDf["class"]))

0.6212847555129435
0.6070156295131424
0.5849751028361118
0.6307820576955144


In [17]:
X = []
for response in trainDf['text']:
    v=np.zeros(50)
    response = response.split()
    for word in response:
        v+=word2vec.get(word,np.zeros(50))
    v=v/len(response)
    X.append(v)
testX = []
for response in testDf['text']:
    v=np.zeros(50)
    response = response.split()
    for word in response:
        v+=word2vec.get(word,np.zeros(50))
    v=v/len(response)
    testX.append(v)

In [18]:
clf = LogisticRegression(random_state=0, max_iter = 250).fit(np.array(X), trainDf['class'])
predictions = clf.predict(np.array(testX))
print(accuracy(predictions,testDf["class"]))
print(f1_score(predictions,testDf["class"]))
print(precision_score(predictions,testDf["class"]))
print(recall_score(predictions,testDf["class"]))

0.5350879905978412
0.5297650702286734
0.5237682862709925
0.5359007626340938


In [21]:
joblib.dump(clf, 'log.pkl', compress=9)

['log.pkl']