In [None]:
import pandas as pd
import numpy as np 
import re
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec 
import multiprocessing

# SVM imports
from sklearn import svm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import csv

# K-NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

# NN imports
from sklearn.neural_network import MLPClassifier

In [None]:
import nltk

# Fetch every NLTK resource this notebook relies on in one place, so a fresh
# environment survives Restart & Run All:
#   - wordnet            -> WordNetLemmatizer
#   - punkt / punkt_tab  -> word_tokenize (newer NLTK releases look up punkt_tab)
#   - stopwords          -> stopwords.words('english')
# NOTE(review): on NLTK releases that do not ship 'punkt_tab', nltk.download is
# expected to report the unknown id and return False rather than raise - confirm
# against the installed version.
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Download the NLTK package with id 'omw-1.4'.
# NOTE(review): presumably required by the WordNet lemmatizer on recent NLTK
# releases - confirm against the installed version.
nltk.download('omw-1.4')

## Load the dataset as pandas DataFrames

In [None]:
def _load_tweets(path, label):
    """Read one tweet per line from `path` and tag every row with `label`.

    Parameters
    ----------
    path : str
        Text file holding one tweet per line.
    label : int
        Value written to the 'label' column of every row.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'tweets' and 'label'.
    """
    tweets = pd.read_csv(
        path,
        sep="\t",
        names=['tweets'],
        # Tweets are free text: a double quote must not be interpreted as a CSV
        # quote character, otherwise several lines can be merged into one field.
        quoting=csv.QUOTE_NONE,
        # Keep tweets such as "nan", "null" or "n/a" as strings instead of
        # letting pandas turn them into float NaN (which would break re.sub later).
        keep_default_na=False,
    )
    tweets['label'] = label
    return tweets


df_train_pos = _load_tweets("twitter-datasets/train_pos.txt", 1)
df_train_neg = _load_tweets("twitter-datasets/train_neg.txt", -1)
# ignore_index gives the combined frame unique row labels 0..n-1 instead of
# repeating the labels of the positive and negative frames.
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
print(df_train_pos.shape)
print(df_train_neg.shape)
print(df_train.shape)

In [None]:
# Sanity check: show the first rows of the combined training frame
# (columns 'tweets' and 'label').
df_train.head()

## Remove tags and URLs

In [None]:
def remove_tags(df):
    """Return a copy of `df` with every `<...>` span removed from the 'tweets' column.

    Each tweet has all non-greedy `<...>` matches deleted and is then stripped
    of leading and trailing whitespace. The input frame is left untouched.
    """
    tag_pattern = re.compile(r'<.*?>')

    def strip_markup(text):
        # Delete the tags first, then trim the whitespace they leave at the edges.
        return tag_pattern.sub('', text).strip()

    result = df.copy()
    result['tweets'] = result['tweets'].apply(strip_markup)
    return result

In [None]:
# Preview only the first rows of the tag-stripped frame; the result is not stored,
# so rendering the whole training set here would only flood the notebook output.
remove_tags(df_train).head()

## Removing stop words + stemming + lemmatization

In [None]:
def tokenize_and_preprocess(df, stop_words = False, stemming = False, lemmatization = False):
    """Tokenize the 'tweets' column and optionally normalise the tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tweets' column of strings. It is not modified.
    stop_words : bool
        When true, drop tokens whose lowercase form is an English stop word.
    stemming : bool
        When true, replace every token with its Porter stem.
    lemmatization : bool
        When true, replace every token with its WordNet lemma.

    Returns
    -------
    pandas.DataFrame
        New frame whose first column is 'tokens' (a list of tokens per row),
        followed by every other input column except 'tweets'. A training frame
        with columns ['tweets', 'label'] therefore still yields
        ['tokens', 'label']; a frame without 'label' (e.g. the test set) keeps
        its own columns, such as 'id', and no all-NaN 'label' column is added.
    """
    tokens = df['tweets'].apply(word_tokenize)
    # remove stop words
    if stop_words:
        # A frozenset gives constant-time membership tests; the flag itself is
        # left untouched instead of being rebound to the word list.
        english_stop_words = frozenset(stopwords.words('english'))
        tokens = tokens.apply(
            lambda toks: [tok for tok in toks if tok.lower() not in english_stop_words]
        )
    # stemming
    if stemming:
        stemmer = PorterStemmer()
        tokens = tokens.apply(lambda toks: [stemmer.stem(tok) for tok in toks])
    # lemmatization
    if lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = tokens.apply(lambda toks: [lemmatizer.lemmatize(tok) for tok in toks])
    # Drop the raw tweets, keep the remaining columns, and put 'tokens' first.
    carried_columns = [col for col in df.columns if col not in ('tweets', 'tokens')]
    df_cleaned = df[carried_columns].copy()
    df_cleaned.insert(0, 'tokens', tokens)
    return df_cleaned

In [None]:
# Per-tweet token lists of the tag-stripped training set (no stop-word removal,
# stemming or lemmatization); this is the corpus Word2Vec is trained on.
tokens = tokenize_and_preprocess(
    remove_tags(df_train),
    stop_words=False,
    stemming=False,
    lemmatization=False,
)['tokens'].tolist()

#### Train and load the Word2Vec model

In [None]:
# Number of CPU cores reported by the OS; read by construct_word2vec_model
# (workers=cores-1) to size the Word2Vec training worker pool.
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
# Display the detected core count.
cores

In [None]:
def construct_word2vec_model(tokens, size, model_name):
    """Train a Word2Vec model on `tokens` and save it to `model_name`.

    Parameters
    ----------
    tokens : list of list of str
        One token list per tweet.
    size : int
        Dimensionality of the learned word vectors.
    model_name : str
        Path the trained model is saved to.

    Returns
    -------
    None
        The model is only written to disk; reload it with Word2Vec.load.
    """
    # Leave one core free for the rest of the system, but never request zero
    # workers (cores - 1 is 0 on a single-core machine).
    n_workers = max(1, cores - 1)
    w2v = Word2Vec(tokens, min_count = 5, window = 10, vector_size = size, workers=n_workers)
    # Honour the caller-supplied path instead of a hard-coded file name.
    w2v.save(model_name)

construct_word2vec_model(tokens, 100, 'word2vec.model')

In [None]:
# Reload the model that construct_word2vec_model saved to 'word2vec.model';
# construct_vector reads this notebook-level `model`.
model = Word2Vec.load('word2vec.model')

In [None]:
# Keep a handle on the model's word-vector store (`model.wv`) and print its repr
# as a quick check that the model loaded.
word_vectors = model.wv
print(word_vectors)

In [None]:
def construct_vector(tokens, w2v_model=None):
    """Average the word vectors of `tokens` into a single embedding.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet. Tokens missing from the model's vocabulary are
        skipped.
    w2v_model : optional
        Object exposing a `.wv` word-vector store (supporting `in`, indexing
        and `vector_size`). Defaults to the notebook-level `model`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the known tokens' vectors, or an all-zero vector
        of the model's dimensionality when no token is in the vocabulary.
    """
    if w2v_model is None:
        # Fall back to the Word2Vec model loaded at notebook level.
        w2v_model = model
    keyed_vectors = w2v_model.wv
    embeddings = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not embeddings:
        # Size the fallback from the model so it always matches the real
        # embeddings, whatever vector_size the model was trained with.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(embeddings, axis=0)

## Test with word2vec

In [None]:
# Strip the tags, then tokenize with the default options (no stop-word removal,
# stemming or lemmatization), and preview the first rows.
df_cleaned_tweets = tokenize_and_preprocess(remove_tags(df_train))
df_cleaned_tweets.head()

## Constructing the vector embeddings

In [None]:
# One averaged Word2Vec embedding per tweet, computed from its token list.
df_cleaned_tweets['vectors'] = df_cleaned_tweets['tokens'].apply(construct_vector)

In [None]:
# Preview the first rows only instead of rendering the entire training frame
# with its per-tweet embedding vectors.
df_cleaned_tweets.head()

## Test models

In [None]:
def train_test_model(model_to_use,x_train,y_train,x_test):
    """Fit the requested classifier on the training data and predict the test data.

    Parameters
    ----------
    model_to_use : str
        One of "LogisticRegression", "SVM" or "NeuralNetwork".
    x_train, y_train
        Training features and labels passed to the estimator's `fit`.
    x_test
        Features passed to the fitted estimator's `predict`.

    Returns
    -------
    The value returned by the fitted estimator's `predict(x_test)`.

    Raises
    ------
    ValueError
        If `model_to_use` is not a supported model name (previously the
        function silently returned None in that case).
    """
    # Factories are wrapped in lambdas so only the requested estimator is built.
    estimator_factories = {
        "LogisticRegression": lambda: LogisticRegression(max_iter=4000),
        # Use linear SVM for optimization (text classification is often linear)
        "SVM": lambda: svm.LinearSVC(),
        "NeuralNetwork": lambda: MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(5, 2),
            random_state=1,
            max_iter=4000,
        ),
    }

    if model_to_use not in estimator_factories:
        supported = ", ".join(sorted(estimator_factories))
        raise ValueError(
            f"Unsupported model {model_to_use!r}; expected one of: {supported}"
        )

    estimator = estimator_factories[model_to_use]()
    estimator.fit(x_train, y_train)
    return estimator.predict(x_test)

In [None]:
# Features: stack the per-tweet embeddings into a single array.
x_train = np.asarray(df_cleaned_tweets["vectors"].to_list())
# Targets: the labels of the training frame, in the same row order.
y_train = df_train["label"].to_list()

In [None]:
tweets = []
ids = []
# Explicit encoding: pandas reads the training files as UTF-8 by default, so the
# test file is decoded the same way instead of with the platform default.
# NOTE(review): assumes test_data.txt is UTF-8 like the training files - confirm.
with open("twitter-datasets/test_data.txt", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which cannot be unpacked.
        if not line.strip():
            continue
        # Each line is "<id>,<tweet>"; split on the first comma only because the
        # tweet text may itself contain commas. `tweet_id` avoids shadowing the
        # builtin `id` for the rest of the notebook.
        tweet_id, tweet = line.split(',', 1)

        tweets.append(tweet)
        ids.append(tweet_id)

# Build the frame once the file is closed.
df_test = pd.DataFrame({'id': ids, 'tweets': tweets})

In [None]:
# Strip the tags and tokenize the test tweets with the default options.
# NOTE(review): this rebinds `df_test` to a frame without the 'tweets' column, so
# re-running this cell without re-running the loading cell above fails.
df_test = tokenize_and_preprocess(remove_tags(df_test))

In [None]:
# Sanity check: show the first rows of the tokenized test frame.
df_test.head()

In [None]:
# One averaged Word2Vec embedding per test tweet, computed from its token list.
df_test["vectors"] = df_test["tokens"].apply(construct_vector)

In [None]:
# Preview the first rows only instead of rendering the entire test frame with
# its per-tweet embedding vectors.
df_test.head()

In [None]:
# Stack the per-tweet embeddings into one array, mirroring how x_train is built.
x_test = np.array(df_test["vectors"].tolist())
# Show the dimensions instead of echoing every vector into the notebook output.
x_test.shape

In [None]:
# Fit the MLP classifier on the training embeddings and label the test tweets.
predictions = train_test_model(
    model_to_use="NeuralNetwork",
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
)

In [None]:
# Submission ids are assigned sequentially, starting at 1, in prediction order.
ids = list(range(1, len(predictions) + 1))
fieldnames = ['Id', 'Prediction']
with open("submission.csv", 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    # Cast both fields to plain ints so they are written as integer literals.
    writer.writerows(
        {'Id': int(row_id), 'Prediction': int(label)}
        for row_id, label in zip(ids, predictions)
    )