# Models Evaluation

In [56]:
from embeddings import *
from tools import *
import pandas as pd
import numpy as np
from nltk.tokenize import TweetTokenizer
from gensim import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import gensim.downloader as api

# Data input and output paths
# Raw inputs: positive / negative training tweets and the unlabeled test tweets.
POS_TRAIN_PATH = '../data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = '../data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = '../data/twitter-datasets/test_data.txt'
# Destination handed to create_csv_submission in the Predict cell.
OUTPUT_PATH = 'predictions_out.csv'
# Pickled token lists read back by the embedding cells (binary pickle despite the .txt suffix).
TOKENS_PATH = "../saved_gen_files/all_tokens.txt"
# Where the trained Word2Vec / FastText models are saved.
W2V_MODEL_PATH = "../saved_gen_files/w2v.model"
FastText_MODEL_PATH = "../saved_gen_files/fasttext.model"

# Pickled array of per-tweet training vectors (binary pickle despite the .txt suffix).
FULL_TRAIN_TWEET_VECTORS = "../saved_gen_files/train_tweet_vectors.txt"

## Import data

In [3]:
# Load the raw positive and negative training tweets.
pos_ids, pos_text_train = load_csv_test_data(POS_TRAIN_PATH)
neg_ids, neg_text_train = load_csv_test_data(NEG_TRAIN_PATH)

# Flatten both classes into one array: positives first, then negatives.
full_dataset = np.concatenate((pos_text_train, neg_text_train), axis=None)

# Labels mirror that order: +1 per positive tweet, -1 per negative tweet.
# The negative count must come from neg_text_train; using the positive count
# misaligns labels and tweets whenever the two classes differ in size.
full_labels = np.concatenate((np.ones(len(pos_text_train)), -np.ones(len(neg_text_train))), axis=None)

assert len(full_dataset) == len(full_labels), "every tweet needs exactly one label"

## Tokenization

In [None]:
# Tokenizer used for the tweets below; going by the flag names it drops case,
# shortens elongated character runs and strips @handles.
tknzr = TweetTokenizer(
    preserve_case=False,
    reduce_len=True,
    strip_handles=True,
)
# Disabled alternative: tokenize the raw tweets directly, skipping the cleaning below.
#all_tokens = [tknzr.tokenize(tweet) for tweet in full_dataset]

import re # for regular expressions
from nltk.stem.porter import *
from nltk import stem
import nltk

def stemHelper(words):
    """Lemmatize every word of ``words`` with the module-level ``stemmer``.

    ``stemmer`` must be defined before this function is called; it only needs
    a ``lemmatize(word)`` method. Returns a new list with one result per input
    word, in the same order.
    """
    return [stemmer.lemmatize(token) for token in words]

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches replaced by ''.

    The substitution is done with ``pattern`` itself. Re-using each matched
    substring as a new regex (the previous approach) misbehaves as soon as a
    match contains metacharacters such as ``(``, ``+`` or ``.``, and fails
    outright for patterns with several capture groups.
    """
    return re.sub(pattern, '', input_txt)

# Assemble one labelled frame of raw tweets: negatives (-1) first, then positives (+1).
# NOTE(review): despite its name, `pos_train_df` holds both classes. Its row order
# (neg, pos) and the rows dropped below mean the `all_tokens` built here do NOT line
# up with `full_labels` (pos, neg, nothing dropped); `pos_train_df['Label']` holds the
# labels that match these tokens.
pos_train_df = pd.DataFrame({
    'Label': np.concatenate([-np.ones(len(neg_text_train)), np.ones(len(pos_text_train))]),
    'Text': np.concatenate([neg_text_train, pos_text_train]),
})

# Strip placeholder tags of the form <word>; raw string so that \w reaches the
# regex engine without an invalid-escape warning.
pos_train_df['clean_text'] = np.vectorize(remove_pattern)(pos_train_df['Text'], r"<[\w]*>")

# Replace everything except letters and '#' with a space. regex=True is explicit
# because recent pandas versions treat the pattern as a literal string by default.
pos_train_df['clean_text'] = pos_train_df['clean_text'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Keep only words longer than three characters.
pos_train_df['clean_text'] = pos_train_df['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Drop tweets left empty by the cleaning. Assign the results back instead of
# calling replace(inplace=True) on a column selection, which may act on a copy.
pos_train_df['clean_text'] = pos_train_df['clean_text'].replace('', np.nan)
pos_train_df = pos_train_df.dropna(subset=['clean_text'])

tokenized_tweet = [tknzr.tokenize(tweet) for tweet in pos_train_df['clean_text']] #tokenizing
stemmer = stem.wordnet.WordNetLemmatizer()

# `tokenized_tweet` is a plain list (no .apply), so lemmatize with a comprehension.
all_tokens = [stemHelper(tokens) for tokens in tokenized_tweet] # lemmatizing



# Save 
#with open(TOKENS_PATH, "wb") as fp:   #Pickling
#    pickle.dump(all_tokens, fp)

In [None]:
# Show only the first few token lists; echoing the whole corpus would flood the output.
all_tokens[:5]

## Word Embedding
### Word2Vec

In [73]:
# Read the pickled token lists back from disk (the file must already exist).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
with open(TOKENS_PATH, "rb") as tokens_file:
    all_tokens = pickle.load(tokens_file)

# Fit Word2Vec embeddings on the tokenized tweets and persist the trained model.
model = models.Word2Vec(all_tokens, min_count=2, size=50, window=10, iter=10, workers=10)
model.save(W2V_MODEL_PATH)

### GloVe

In [60]:
# Pretrained alternative: load the "glove-twitter-50" vectors through gensim's downloader API (`api`).
# NOTE(review): this rebinds `model`, replacing whatever an earlier embedding cell assigned;
# later cells read `model.wv` - confirm the object returned here exposes `.wv`.
model = api.load("glove-twitter-50")



### FastText

In [55]:
# Read the pickled token lists back from disk (the file must already exist).
# NOTE: pickle.load can execute code embedded in the file; only load trusted files.
with open(TOKENS_PATH, "rb") as tokens_file:
    all_tokens = pickle.load(tokens_file)

# Fit FastText embeddings on the tokenized tweets and persist the trained model.
model = models.FastText(all_tokens, min_count=2, size=50, window=10, iter=10, workers=10)

model.save(FastText_MODEL_PATH)

## Feature Engineering

In [74]:
def generateTweetVector(word_dic, words):
    """Collapse a tokenized tweet into one fixed-size vector.

    Parameters
    ----------
    word_dic : object
        Embedding lookup exposing ``vector_size``, a ``vocab`` container and
        ``word_dic[word]`` indexing (this notebook passes ``model.wv``).
    words : sequence of str
        Tokens of a single tweet.

    Returns
    -------
    numpy.ndarray
        Vector of length ``word_dic.vector_size``: the sum of the embeddings
        of the in-vocabulary words divided by the total number of words, so
        out-of-vocabulary words count in the denominator but add nothing to
        the sum. An empty tweet yields the all-zero vector.
    """
    vector = np.zeros(word_dic.vector_size)
    num_words = len(words)
    if num_words == 0:
        # Dividing by zero here used to produce a NaN vector, which
        # poisons every downstream fit/predict call.
        return vector
    for word in words:
        if word in word_dic.vocab:
            vector += word_dic[word]
    vector /= num_words
    return vector

# One averaged embedding per tokenized tweet, collected into a single array.
all_tweets_vectors = np.array([
    generateTweetVector(model.wv, tweet_tokens)
    for tweet_tokens in all_tokens
])

# Persist the vectors as a pickle.
with open(FULL_TRAIN_TWEET_VECTORS, "wb") as vectors_file:
    pickle.dump(all_tweets_vectors, vectors_file)

## Model Selection

## Test

In [75]:
# Hold out part of the data for evaluation. The fixed random_state makes the split,
# and therefore the metrics reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(all_tweets_vectors, full_labels, random_state=42)

In [76]:
# Fit a ridge regressor on the +1 / -1 targets (fit returns the estimator itself).
clf = linear_model.Ridge(alpha=0.1).fit(X_train, y_train)
#clf = linear_model.LogisticRegression()

# Score the held-out split; predict_labels is an imported helper that turns the
# raw regression outputs into labels (presumably by thresholding - confirm).
predictions = clf.predict(X_test)
print(classification_report(y_test, predict_labels(predictions)))

              precision    recall  f1-score   support

        -1.0       0.74      0.69      0.71    312419
         1.0       0.71      0.75      0.73    312581

    accuracy                           0.72    625000
   macro avg       0.72      0.72      0.72    625000
weighted avg       0.72      0.72      0.72    625000



## Predict

In [39]:
# Train the submission classifier on every labelled tweet vector.
# NOTE(review): this is an SGDClassifier, not the Ridge model evaluated in the
# Test section - confirm the switch of estimator is intended.
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(all_tweets_vectors, full_labels)

# Load the data to predict
test_ids, test_x = load_csv_test_data(DATA_TEST_PATH, has_ID=True)

# Tokenize it
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
test_tokens = [tknzr.tokenize(tweet) for tweet in test_x]

# Generate vector representation.
# Stored under its own name: reusing `all_tweets_vectors` here clobbered the
# training features, so re-running this cell fitted the classifier on test
# vectors paired with the training labels.
test_tweet_vectors = np.array([generateTweetVector(model.wv, words) for words in test_tokens])

# Predict
predictions = clf.predict(test_tweet_vectors)

# Save predictions
create_csv_submission(test_ids, predictions, OUTPUT_PATH)