In [1]:
import csv
import pickle as pkl
from time import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

# Path of the pickled word-embedding file that is passed to load_w2v.
EMBEDDING_FILE = "w2v.pkl"

In [2]:
def load_w2v(filepath):
    """Unpickle and return the object stored at ``filepath``.

    The notebook uses this to read the word-embedding file "w2v.pkl".
    """
    # pickle can execute arbitrary code while loading; only open trusted files.
    with open(filepath, mode='rb') as embedding_file:
        embeddings = pkl.load(embedding_file)
    return embeddings

def w2v(word2vec, token):
    """Look up the embedding vector for a single token.

    Args:
        word2vec: mapping from token to its embedding vector; it must support
            ``in`` membership tests and ``[]`` indexing.
        token: the word to look up.

    Returns:
        np.ndarray: ``word2vec[token]`` converted to an array when the token
        is present, otherwise a vector of 300 zeros.
    """
    # Use the mapping the caller passed in. Reloading the pickle here would
    # discard the argument and reread the file once per token.
    if token in word2vec:
        return np.array(word2vec[token])

    # Unknown token: fall back to the all-zero vector.
    return np.zeros(300, )

def load_as_list(fname):
    """Read a CSV file of tweets and return the tweets and their labels.

    Args:
        fname: anything ``pd.read_csv`` accepts; the file must contain a
            'tweet' column and a 'label' column.

    Returns:
        tuple: ``(tweets, label)`` as two Python lists in file row order.

    Raises:
        KeyError: if the 'tweet' or 'label' column is missing.
    """
    df = pd.read_csv(fname)
    # Only the two returned columns are read; an 'id' column is not required.
    tweets = df['tweet'].values.tolist()
    label = df['label'].values.tolist()
    return tweets, label

def get_tokens(inp_str):
    """Split ``inp_str`` on runs of whitespace and return the pieces as a list."""
    tokens = inp_str.split()
    return tokens

In [3]:
def string2vec(word2vec, user_input):
    """Embed a string as the average of its tokens' word vectors.

    Args:
        word2vec: token-to-vector mapping, passed through to ``w2v``.
        user_input: the text to embed; it is tokenized with ``get_tokens``.

    Returns:
        np.ndarray: a 300-element vector. It is the element-wise mean of the
        per-token vectors, or all zeros when the input has no tokens.
    """
    tokens = get_tokens(user_input)

    # Empty or whitespace-only input has nothing to average. Dividing by a
    # token count of zero would return a vector of NaNs.
    if not tokens:
        return np.zeros(300, )

    # One row per token, filled with that token's word vector.
    all_word_vectors = np.zeros(shape=(len(tokens), 300))
    for row, token in enumerate(tokens):
        all_word_vectors[row] = w2v(word2vec, token)

    # Sum over tokens, then divide by the token count to get the mean.
    embedding_sum = np.sum(all_word_vectors, axis=0)
    return np.array(embedding_sum / len(tokens))

In [4]:
def vectorize_train(training_documents):
    """Fit a TF-IDF vectorizer on the training documents.

    Returns the fitted ``TfidfVectorizer`` together with the document-term
    matrix that ``fit_transform`` produced for ``training_documents``.
    """
    tfidf_model = TfidfVectorizer()
    doc_term_matrix = tfidf_model.fit_transform(training_documents)
    return tfidf_model, doc_term_matrix

In [5]:
def train_model(model, word2vec, training_documents, training_labels):
    """Fit ``model`` on averaged word-embedding features.

    Args:
        model: estimator exposing ``fit(X, y)``.
        word2vec: token-to-vector mapping, passed through to ``string2vec``.
        training_documents: iterable of document strings.
        training_labels: labels aligned with ``training_documents``.

    Returns:
        The same ``model`` object after ``fit`` has been called on it.
    """
    # Iterate the documents directly. Copying them into a NumPy array only to
    # take its length is unnecessary, and positional indexing is avoided.
    features = [string2vec(word2vec, document) for document in training_documents]
    model.fit(features, training_labels)
    return model

def test_model(model, word2vec, test_documents, test_labels):
    """Score a fitted model on documents embedded with ``string2vec``.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        word2vec: token-to-vector mapping, passed through to ``string2vec``.
        test_documents: iterable of document strings.
        test_labels: true labels aligned with ``test_documents``.

    Returns:
        tuple: ``(precision, recall, f1, accuracy)``, each rounded to two
        decimals. The scores come from the scikit-learn metric functions
        called with their default arguments.
    """
    # Embed every document the same way train_model does before predicting.
    test_features = [string2vec(word2vec, document) for document in test_documents]
    pred = model.predict(test_features)

    # The debug print of the full prediction array is dropped; the metrics
    # below are the function's only result.
    precision = precision_score(test_labels, pred)
    recall = recall_score(test_labels, pred)
    f1 = f1_score(test_labels, pred)
    accuracy = accuracy_score(test_labels, pred)

    return round(precision, 2), round(recall, 2), round(f1, 2), round(accuracy, 2)

In [6]:
# Load the training tweets and labels, then fit the TF-IDF vectorizer on the tweets.
tweets, label = load_as_list("train.csv")
vectorizer, tfidf_train = vectorize_train(tweets)

# testing begins here
# NOTE(review): the "test" split below is read from the same "train.csv" as the
# training data, so any metric computed on it is not a held-out score. Confirm
# whether a separate test file was intended.
test_data, test_labels = load_as_list("train.csv")

# Print the fitted document-term matrix between two marker lines.
print("xxxx____TFIDF START____xxxx \n")
print(tfidf_train)
print("xxxx_____TFIDF END_____xxxx")

xxxx____TFIDF START____xxxx 

  (0, 31398)	0.24587523640473063
  (0, 11441)	0.37328100670717695
  (0, 18940)	0.2205095089178642
  (0, 20271)	0.2217974158889632
  (0, 17200)	0.379914618721787
  (0, 11118)	0.35472088764116205
  (0, 16758)	0.18302237090130583
  (0, 32276)	0.3200037447188773
  (0, 33622)	0.13945209452277157
  (0, 2405)	0.10773554811927136
  (0, 11442)	0.388013001997706
  (0, 19101)	0.2263921129123013
  (0, 13066)	0.18433419465066356
  (0, 39790)	0.15463350244422686
  (0, 38416)	0.07466312011486949
  (1, 15090)	0.3279202126540206
  (1, 10522)	0.34086198826458086
  (1, 27504)	0.31873787672819126
  (1, 18369)	0.09644664542534936
  (1, 38561)	0.3279202126540206
  (1, 39784)	0.31873787672819126
  (1, 26284)	0.257788438360121
  (1, 10904)	0.15684662390347334
  (1, 36533)	0.15263459071612748
  (1, 6907)	0.22604173086962553
  :	:
  (31958, 18369)	0.08516260158505909
  (31959, 26830)	0.4502363745857929
  (31959, 21561)	0.34994315261917547
  (31959, 33828)	0.36609346975257884
  (319

In [7]:
# Load the embedding table once; later cells pass it to test_model and string2vec.
word2vec = load_w2v(EMBEDDING_FILE)
print("word2vec")

word2vec


In [None]:
# Grid search over MLPClassifier's `alpha` and `max_iter`: for every combination,
# fit a fresh model and record train accuracy, test accuracy and fit time.
# NOTE(review): `trlab`, `train`, `test` and `tslab` are not defined in any
# visible cell, so they must already exist in the kernel before this cell runs.
# NOTE(review): the model is fit on `tfidf_train` but scored on `train`/`test`.
# Confirm all three share one feature space.
grid_rows = []
for a in [0.00001,0.0001,0.001,0.01, 0.1, 1, 10]:
    for mi in [10,100,200,500,1000,2000]:
        st = time()
        mlp = MLPClassifier(alpha=a, max_iter=mi)
        mlp.fit(tfidf_train, trlab)
        end = time() - st  # wall-clock seconds spent in fit

        acc_tr = accuracy_score(trlab, mlp.predict(train)) # Train Accuracy
        acc = accuracy_score(tslab, mlp.predict(test)) # Test Accuracy
        grid_rows.append([a, mi, acc_tr, acc, end])

# Build the results table in one step from the collected rows instead of
# enlarging the DataFrame one row at a time.
df = pd.DataFrame(grid_rows, columns = ['alpha','max_iter','train_acc','test_acc','train_time'])
print(df)

In [None]:
# Sweep the width of a single hidden layer: fit one MLPClassifier per width,
# record train/test accuracy and fit time, then plot both accuracies.
# NOTE(review): `train`, `trlab`, `test`, `tslab` and `matplot` are not defined
# in any visible cell. `matplot` is presumably matplotlib.pyplot; confirm.
l = [10,20,50,100,200,500,1000]  # hidden-layer widths, reused as the x tick labels

acc, acc_tr, timelog = [], [], []
for n_hidden in l:
    t = time()
    mlp = MLPClassifier(alpha=0.1, max_iter=200, hidden_layer_sizes=(n_hidden,))
    mlp.fit(train, trlab)
    endt = time() - t  # wall-clock seconds spent in fit

    a_tr = accuracy_score(trlab, mlp.predict(train)) # Train Accuracy
    a = accuracy_score(tslab, mlp.predict(test)) # Test Accuracy

    timelog.append(endt)
    acc_tr.append(a_tr)
    acc.append(a)

# Plot against evenly spaced positions and label the ticks with the real widths.
N = len(l)
l2 = np.arange(N)
matplot.subplots(figsize=(10, 5))
matplot.plot(l2, acc, label="Testing Accuracy")
matplot.plot(l2, acc_tr, label="Training Accuracy")
matplot.xticks(l2,l)
matplot.grid(True)
matplot.xlabel("Hidden Layer Nodes")
matplot.ylabel("Accuracy")
matplot.legend()
matplot.title('Accuracy versus Nodes in the Hidden Layer for MLPClassifier', fontsize=12)
matplot.show()

In [None]:
# Plot the fit times collected in `timelog` against the hidden-layer widths.
# NOTE(review): `matplot` is not imported in any visible cell; it is presumably
# matplotlib.pyplot. Confirm. `timelog` is filled by the hidden-layer sweep cell.
l = [10,20,50,100,200,500,1000]
N = len(l)
# Evenly spaced x positions; the tick labels below show the actual widths.
l2 = np.arange(N)
matplot.subplots(figsize=(10, 5))
matplot.plot(l2, timelog, label="Training time in s")
matplot.xticks(l2,l)
matplot.grid(True)
matplot.xlabel("Hidden Layer Nodes")
matplot.ylabel("Time (s)")
matplot.legend()
matplot.title('Training Time versus Nodes in the Hidden Layer for MLPClassifier', fontsize=12)
matplot.show()

In [None]:
# Score the most recently fitted `mlp` and save the result as a one-row CSV report.
# NOTE(review): test_model feeds `mlp` averaged word2vec features. Confirm `mlp`
# was fit on the same kind of features.
# Compute the metrics first so a failure in test_model does not leave a
# truncated report file behind.
p, r, f, a = test_model(mlp, word2vec, test_data, test_labels)

# `with` closes the file even if a write fails. newline="" is what the csv
# module asks for, so the writer controls line endings itself.
with open("classification_report.csv", "w", newline="") as outfile:
    outfile_writer = csv.writer(outfile)
    outfile_writer.writerow(["Name", "Precision", "Recall", "F1", "Accuracy"])  # Header row
    outfile_writer.writerow([mlp, p, r, f, a])

In [None]:
# Embed the first entry of test_data as a single averaged word vector.
w2v_test = string2vec(word2vec, test_data[0])

In [None]:
# reshape(1, -1) turns the single vector into a one-row 2-D array for predict.
# NOTE(review): this rebinds `label`, which an earlier cell assigned from
# load_as_list("train.csv"); re-running earlier cells after this one will see
# a different `label`.
label = mlp.predict(w2v_test.reshape(1, -1))

In [None]:
# Turn the prediction stored in `label` into a user-facing message.
# Nothing is printed when `label` equals neither 0 nor 1.
if label == 0:
    print("Great!  It sounds like you are a decent human being.")
elif label == 1:
    print("Oh no!  It sounds like you're being offensive.")
