In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import io
import re
import csv
from numpy import savetxt
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import simplemma

# SVM imports
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# K-NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

# NN imports
from sklearn.neural_network import MLPClassifier

## Load & Pre-process the training datasets

In [None]:
# Read one tweet per row into a single 'tweets' column (no header in the files).
# Rows the parser cannot handle are skipped with a warning. `on_bad_lines="warn"`
# is the replacement for `error_bad_lines=False`, which pandas deprecated in 1.3
# and removed in 2.0 (where passing it raises a TypeError).
df_train_neg = pd.read_csv("data/twitter-datasets/train_neg.txt", delimiter="\t", header=None, names=['tweets'], on_bad_lines="warn")
df_train_pos = pd.read_csv("data/twitter-datasets/train_pos.txt", delimiter="\t", header=None, names=['tweets'], on_bad_lines="warn")

In [None]:
# Show the negative-tweet frame as the cell output (sanity check of the load).
df_train_neg

In [None]:
# Attach the sentiment label to each source frame: +1 = positive, -1 = negative.
df_train_pos["label"] = 1
df_train_neg["label"] = -1
# Stack positives on top of negatives; each frame keeps its own row labels.
df_train = pd.concat(objs=[df_train_pos, df_train_neg])

In [None]:
# Show the combined, labelled training frame as the cell output.
df_train

In [None]:
def remove_tags(df):
    """Return a copy of ``df`` whose 'tweets' column has ``<...>`` tags removed.

    Every shortest ``<...>`` span (for example ``<user>`` or ``<url>``) is
    deleted and the remaining text is stripped of leading and trailing
    whitespace. The input frame is not modified.

    The work is done with the vectorised ``.str`` accessor rather than a
    per-row ``re.sub`` call, so a missing tweet (NaN/None) stays missing
    instead of raising ``TypeError``.

    Args:
        df: DataFrame with a 'tweets' column holding the raw tweet strings.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    df_cleaned = df.copy()
    df_cleaned['tweets'] = (
        df_cleaned['tweets']
        .str.replace(r'<.*?>', '', regex=True)  # non-greedy: one tag at a time
        .str.strip()
    )
    return df_cleaned

In [None]:
def tokenize_and_preprocess(df, stop_words = False, stemming = False, lemmatization = False):
    """Tokenize the 'tweets' column and optionally normalise the tokens.

    The steps below are applied, in this order, to a copy of ``df``:

    1. ``word_tokenize`` turns every tweet into a list of tokens.
    2. ``stop_words``: drop tokens whose lowercase form is an English stop word.
    3. ``stemming``: replace every token by its Porter stem.
    4. ``lemmatization``: replace every token by its WordNet lemma.

    Args:
        df: DataFrame with a 'tweets' column of strings.
        stop_words: When true, remove English stop words.
        stemming: When true, apply ``PorterStemmer`` to each token.
        lemmatization: When true, apply ``WordNetLemmatizer`` to each token.

    Returns:
        A new DataFrame without the 'tweets' column, with 'tokens' as the first
        column followed by the remaining columns of ``df`` in their original
        order. For a frame with columns ['tweets', 'label'] the result has
        columns ['tokens', 'label']. Columns such as an 'id' on unlabelled data
        are kept, and no 'label' column is invented when ``df`` has none.
    """
    df_cleaned = df.copy()
    df_cleaned['tokens'] = df_cleaned['tweets'].apply(word_tokenize)
    # remove stop words
    if stop_words:
        # A set gives constant-time membership tests; a separate name keeps the
        # boolean flag from being overwritten by the word list.
        english_stop_words = frozenset(stopwords.words('english'))
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(
            lambda tokens: [token for token in tokens if token.lower() not in english_stop_words]
        )
    # stemming
    if stemming:
        ps = PorterStemmer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(
            lambda tokens: [ps.stem(token) for token in tokens]
        )
    # lemmatization
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        df_cleaned['tokens'] = df_cleaned['tokens'].apply(
            lambda tokens: [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        )
    # Drop the raw text and move 'tokens' to the front, keeping every other
    # column that the input actually had.
    df_cleaned = df_cleaned.drop(columns=['tweets'])
    other_columns = [column for column in df_cleaned.columns if column != 'tokens']
    return df_cleaned[['tokens'] + other_columns]

In [None]:
# Strip tags, tokenise, and run every optional normalisation step on the
# training tweets.
df_train = tokenize_and_preprocess(
    remove_tags(df_train),
    stop_words=True,
    stemming=True,
    lemmatization=True,
)

In [None]:
# Show the tokenised training frame as the cell output.
df_train

## Compute feature vectors from GloVe embeddings

In [None]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup table.
# Each line of the file is a word followed by its whitespace-separated coefficients.
embeddings_index = {}

with open("data/glove.6B.100d.txt", "r", encoding="utf-8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
def toVector(tokens, embeddings=None, dim=None):
    """Average the embedding vectors of the tokens that have one.

    Tokens missing from the lookup table are ignored. When no token has an
    embedding (including an empty token list) a zero vector is returned, so
    every input maps to a vector of the same length.

    Args:
        tokens: Iterable of token strings.
        embeddings: Mapping from token to 1-D vector. Defaults to the
            notebook-level ``embeddings_index`` table.
        dim: Length of the zero vector used as fallback. Defaults to the length
            of the vectors stored in ``embeddings`` (100 if the table is empty),
            so the fallback always matches the table instead of a fixed size.

    Returns:
        A 1-D numpy array: the element-wise mean of the found vectors, or
        ``np.zeros(dim)`` when none was found.
    """
    if embeddings is None:
        embeddings = embeddings_index

    found = []
    for token in tokens:
        vector = embeddings.get(token)
        if vector is not None:
            found.append(vector)

    if found:
        return np.average(found, axis=0)

    if dim is None:
        sample = next(iter(embeddings.values()), None)
        dim = len(sample) if sample is not None else 100
    return np.zeros(dim)

In [None]:
# One averaged embedding vector per training tweet. Slow on the full training set.
df_train["vectors"] = df_train["tokens"].apply(toVector)

In [None]:
# (rows, columns) of the training frame before rows with missing values are dropped.
df_train.shape

In [None]:
# Remove, in place, every row that contains a missing value.
df_train.dropna(inplace=True)

In [None]:
# (rows, columns) again, to see how many rows the dropna() removed.
df_train.shape

In [None]:
def train_test_model(model_to_use,x_train,y_train,x_test):
    """Fit the selected classifier on the training data and predict on the test data.

    Args:
        model_to_use: Which classifier to build. One of
            ``"LogisticRegression"`` (``LogisticRegression(max_iter=4000)``),
            ``"SVM"`` (``svm.LinearSVC()`` with default settings), or
            ``"NeuralNetwork"`` (``MLPClassifier`` with the lbfgs solver,
            hidden layers of 5 and 2 units, and ``random_state=1``).
        x_train: Training feature vectors, one row per sample.
        y_train: Training labels, one per row of ``x_train``.
        x_test: Feature vectors to predict labels for.

    Returns:
        The output of the fitted model's ``predict(x_test)``.

    Raises:
        ValueError: If ``model_to_use`` is not one of the supported names.
            Without this check an unknown name would silently yield ``None``.
    """
    if model_to_use == "LogisticRegression":
        model = LogisticRegression(max_iter=4000)
    elif model_to_use == "SVM":
        # Use linear SVM for optimization (text classification is often linear)
        model = svm.LinearSVC()
    elif model_to_use == "NeuralNetwork":
        model = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1,max_iter=4000)
    else:
        raise ValueError(
            f"Unknown model {model_to_use!r}; expected 'LogisticRegression', 'SVM' or 'NeuralNetwork'"
        )

    model.fit(x_train, y_train)
    return model.predict(x_test)

In [None]:
# Labels: plain Python list with one sentiment label per training tweet.
y_train = df_train["label"].tolist()
# Features: the per-tweet embedding vectors gathered into a single numpy array.
x_train = np.array(df_train["vectors"].tolist())

In [None]:
# Parse the test file, where each line has the form "<id>,<tweet>". Only the
# first comma separates the id from the text, so commas inside the tweet survive.
tweets = []
ids = []
# Explicit UTF-8 (the encoding used for the GloVe file as well) so decoding does
# not depend on the platform's default encoding.
with open("data/twitter-datasets/test_data.txt", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the unpacking below
        # `tweet_id` rather than `id`, so the builtin id() is not shadowed.
        tweet_id, tweet = line.split(',', 1)

        tweets.append(tweet)
        ids.append(tweet_id)

df_test = pd.DataFrame({'id': ids, 'tweets': tweets})

In [None]:
#df_test = pd.read_csv("data/twitter-datasets/test_data.txt",delimiter="\t", header=None, names = ['tweets'])

In [None]:
# Preprocess the test tweets with exactly the same options as the training
# tweets (stop-word removal, stemming, lemmatization). The classifier is fitted
# on vectors built from tokens normalised this way, so the test vectors must be
# built from equally normalised tokens.
df_test = tokenize_and_preprocess(
    remove_tags(df_test),
    stop_words=True,
    stemming=True,
    lemmatization=True,
)

In [None]:
# Show the tokenised test frame as the cell output.
df_test

In [None]:
# One averaged embedding vector per test tweet, built the same way as for training.
df_test["vectors"] = df_test["tokens"].apply(toVector)

In [None]:
# Show the test frame again, now including the 'vectors' column.
df_test

In [None]:
# Gather the test vectors into a single numpy array, the same way x_train is
# built, so both feature matrices have the same type.
x_test = np.array(df_test["vectors"].tolist())
# Show only the dimensions; echoing every vector would flood the cell output.
x_test.shape

In [None]:
#predictions = train_test_model("LogisticRegression",x_train,y_train,x_test)

In [None]:
# Fit the neural-network variant on the training vectors and predict one label
# per test tweet.
predictions = train_test_model(
    model_to_use="NeuralNetwork",
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
)

In [None]:
#predictions = train_test_model("SVM",x_train,y_train,x_test)

In [None]:
# Write the submission file: a header row followed by one "Id,Prediction" row
# per prediction. Ids are simply 1..N in prediction order.
ids = list(range(1, len(predictions) + 1))
with open("data/submission.csv", 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=['Id', 'Prediction'])
    writer.writeheader()
    # Cast to built-in int so the values are written as plain integers.
    writer.writerows(
        {'Id': int(row_id), 'Prediction': int(label)}
        for row_id, label in zip(ids, predictions)
    )