In [27]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import io
import re
import csv
from numpy import savetxt
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import simplemma

# SVM imports
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# K-NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

# NN imports
from sklearn.neural_network import MLPClassifier

## Load & Pre-process the training datasets

In [28]:
df_train_neg = pd.read_csv("data/twitter-datasets/train_neg.txt", delimiter="\t", header=None, names = ['tweets'], error_bad_lines=False)
df_train_pos = pd.read_csv("data/twitter-datasets/train_pos.txt", delimiter="\t", header=None, names = ['tweets'], error_bad_lines=False)



  df_train_neg = pd.read_csv("data/twitter-datasets/train_neg.txt", delimiter="\t", header=None, names = ['tweets'], error_bad_lines=False)


  df_train_pos = pd.read_csv("data/twitter-datasets/train_pos.txt", delimiter="\t", header=None, names = ['tweets'], error_bad_lines=False)


In [29]:
df_train_neg

Unnamed: 0,tweets
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...
1,glad i dot have taks tomorrow ! ! #thankful #s...
2,1-3 vs celtics in the regular season = were fu...
3,<user> i could actually kill that girl i'm so ...
4,<user> <user> <user> i find that very hard to ...
...,...
99063,can't wait to fake tan tonight ! hate being pale
99064,<user> darling i lost my internet connection ....
99065,kanguru defender basic 4 gb usb 2.0 flash driv...
99066,rizan is sad now


In [30]:
df_train_neg["label"] = -1
df_train_pos["label"] = 1
df_train = pd.concat([df_train_pos,df_train_neg])

In [31]:
df_train

Unnamed: 0,tweets,label
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1
...,...,...
99063,can't wait to fake tan tonight ! hate being pale,-1
99064,<user> darling i lost my internet connection ....,-1
99065,kanguru defender basic 4 gb usb 2.0 flash driv...,-1
99066,rizan is sad now,-1


In [32]:
def remove_tags(df):
    df_cleaned = df.copy()
    df_cleaned['tweets'] = df_cleaned['tweets'].apply(lambda tweet: re.sub(r'<.*?>', '', tweet).strip())
    return df_cleaned

In [33]:
def tokenize_and_preprocess(df, stop_words = False, stemming = False, lemmatization = False):
    df_cleaned = df.copy()
    df_cleaned['tokens'] = df_cleaned['tweets'].apply(lambda tweet: word_tokenize(tweet))
    # remove stop words
    if stop_words:
        stop_words = stopwords.words('english')
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [token for token in tokens if token.lower() not in stop_words])
    # stemming
    if stemming:
        ps = PorterStemmer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [ps.stem(token) for token in tokens])
    # lemmatization
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens])
    # remove the tweets columns
    df_cleaned.drop(['tweets'], axis=1, inplace=True)
    df_cleaned = df_cleaned.reindex(columns=['tokens', 'label'])
    return df_cleaned

In [34]:
df_train = tokenize_and_preprocess(remove_tags(df_train),stop_words=True,stemming=True,lemmatization=True)

In [35]:
df_train

Unnamed: 0,tokens,label
0,"[dunno, justin, read, mention, ., justin, god,...",1
1,"[logic, dumb, ,, wo, n't, even, crop, name, ph...",1
2,"[put, casper, box, !, loov, battl, !, #, crakk...",1
3,"[thank, sir, >, >, n't, trip, lil, mama, ..., ...",1
4,"[visit, brother, tmr, bestest, birthday, gift,...",1
...,...,...
99063,"[ca, n't, wait, fake, tan, tonight, !, hate, p...",-1
99064,"[darl, lost, internet, connect, .., 's, seem, ...",-1
99065,"[kanguru, defend, basic, 4, gb, usb, 2.0, flas...",-1
99066,"[rizan, sad]",-1


## Compute feature vectors from Glove embeddings

In [10]:
#Load pre-trained model
embeddings_index = dict()

with open("data/glove.6B.100d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [11]:
def toVector(tokens):
    embeddings = []
    for token in tokens:
        embedding = embeddings_index.get(token)
        if embedding is not None:
            embeddings.append(embedding) 
    if len(embeddings) == 0:
        return np.zeros(100)
    return np.average(embeddings,axis=0) 

In [12]:
df_train["vectors"] = df_train["tokens"].apply(lambda tokens: toVector(tokens))          # TAKES TIME !!                 

In [13]:
df_train.shape

(196970, 3)

In [14]:
df_train.dropna(inplace=True)

In [15]:
df_train.shape

(196970, 3)

In [16]:
def train_test_model(model_to_use,x_train,y_train,x_test):
    
    if model_to_use == "LogisticRegression":
        logisticRegr = LogisticRegression(max_iter=4000)
        logisticRegr.fit(x_train, y_train) 
        return logisticRegr.predict(x_test)
    
    if model_to_use == "SVM":
        # Use linear SVM for optimization (text classification is often linear)
        lin_clf = svm.LinearSVC()
        lin_clf.fit(x_train,y_train)
        return lin_clf.predict(x_test)
    
    if model_to_use == "NeuralNetwork":
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1,max_iter=4000)
        clf.fit(x_train, y_train)
        return clf.predict(x_test)

In [17]:
x_train = np.array(df_train["vectors"].tolist())
y_train =  df_train["label"].tolist()

In [18]:
tweets = []
ids = []
with open("data/twitter-datasets/test_data.txt") as f:
    for line in f:
        id, tweet = line.split(',', 1)

        tweets.append(tweet)
        ids.append(id)

    df_test = pd.DataFrame(list(zip(ids, tweets)), columns=['id', 'tweets'])

In [19]:
#df_test = pd.read_csv("data/twitter-datasets/test_data.txt",delimiter="\t", header=None, names = ['tweets'])

In [20]:
df_test = tokenize_and_preprocess(remove_tags(df_test))

In [21]:
df_test

Unnamed: 0,tokens,label
0,"[sea, doo, pro, sea, scooter, (, sports, with,...",
1,"[shucks, well, i, work, all, week, so, now, i,...",
2,"[i, cant, stay, away, from, bug, thats, my, baby]",
3,"[no, ma'am, !, !, !, lol, im, perfectly, fine,...",
4,"[whenever, i, fall, asleep, watching, the, tv,...",
...,...,...
9995,"[had, a, nice, time, w, /, my, friend, lastnite]",
9996,"[no, it, 's, not, !, please, stop, !]",
9997,"[not, without, my, daughter, (, dvd, two-time,...",
9998,"[have, fun, in, class, sweetcheeks]",


In [22]:
df_test["vectors"] = df_test["tokens"].apply(lambda tokens: toVector(tokens)) 

In [23]:
df_test

Unnamed: 0,tokens,label,vectors
0,"[sea, doo, pro, sea, scooter, (, sports, with,...",,"[-0.24224886, 0.16940916, 0.34369066, -0.28017..."
1,"[shucks, well, i, work, all, week, so, now, i,...",,"[-0.17008193, 0.37526414, 0.49038523, -0.61395..."
2,"[i, cant, stay, away, from, bug, thats, my, baby]",,"[0.041630443, 0.2151209, 0.4816622, -0.4981244..."
3,"[no, ma'am, !, !, !, lol, im, perfectly, fine,...",,"[0.10430769, 0.32844484, 0.34208447, -0.657104..."
4,"[whenever, i, fall, asleep, watching, the, tv,...",,"[-0.12725353, 0.20051783, 0.41270673, -0.32512..."
...,...,...,...
9995,"[had, a, nice, time, w, /, my, friend, lastnite]",,"[-0.08659225, 0.1811985, 0.34321496, -0.270625..."
9996,"[no, it, 's, not, !, please, stop, !]",,"[0.022221629, 0.17239875, 0.5937738, -0.713384..."
9997,"[not, without, my, daughter, (, dvd, two-time,...",,"[-0.05038822, 0.15663749, 0.5017437, -0.356154..."
9998,"[have, fun, in, class, sweetcheeks]",,"[0.06353325, 0.41262323, 0.19513872, -0.304543..."


In [24]:
x_test = df_test["vectors"].tolist()
x_test

[array([-0.24224886,  0.16940916,  0.34369066, -0.2801762 ,  0.03957678,
         0.10983294,  0.10651181,  0.22016242, -0.24410787,  0.01245263,
         0.27894488, -0.18642053,  0.24788699,  0.1778929 ,  0.08103074,
        -0.40516055,  0.24452396, -0.12744685, -0.3932698 ,  0.01478264,
         0.29672304,  0.15079725,  0.2890201 ,  0.16347325,  0.23063998,
         0.14461757, -0.34250996, -0.3296455 ,  0.1309542 ,  0.10010816,
        -0.16025227,  0.29478815, -0.19100718,  0.12731354,  0.07148781,
         0.37782136,  0.17495583,  0.40592396,  0.08006516, -0.14188103,
        -0.20877841, -0.39223364, -0.01672374, -0.34302473,  0.13757333,
         0.13381507,  0.11213882, -0.24112195,  0.04242631, -0.40419093,
        -0.06768728,  0.09087121,  0.11795755,  0.9939499 , -0.29169866,
        -2.0750663 ,  0.13266957, -0.05362405,  1.5650873 ,  0.4855795 ,
        -0.15443222,  0.56036824, -0.2003331 ,  0.09413759,  0.6628084 ,
         0.13448015,  0.37827894, -0.07082548,  0.1

In [25]:
#predictions = train_test_model("LogisticRegression",x_train,y_train,x_test)

In [26]:
predictions = train_test_model("NeuralNetwork",x_train,y_train,x_test)

In [41]:
#predictions = train_test_model("SVM",x_train,y_train,x_test)

In [42]:
ids=[i for i in range(1,len(predictions)+1)]
with open("data/submission.csv", 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    for r1, r2 in zip(ids, predictions):
        writer.writerow({'Id':int(r1),'Prediction':int(r2)})