In [None]:
from embeddings import *
from preprocessing import *
from models import *
from fasttext_model import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

## Load & Pre-process the training datasets

In [None]:
# Load the training and test frames with the project helper load_datasets
# (star-imported above). NOTE(review): full=True presumably selects the full
# dataset rather than a sample — confirm against the helper.
df_train, df_test = load_datasets(full=True)

In [None]:
# Clean the training tweets. Stop-word removal, stemming and lemmatization are
# switched off; all remaining preprocessing flags are switched on.
df_train_cleaned = tokenize_and_preprocess(
    df_train,
    stop_words=False,
    stemming=False,
    lemmatization=False,
    unslang_bool=True,
    remove_tags_bool=True,
    unelongate_bool=True,
    uncase_bool=True,
    smiley_to_word_bool=True,
)
# Copy the labels from the raw training frame onto the cleaned frame.
df_train_cleaned["label"] = df_train["label"]
# Apply exactly the same flag configuration to the test tweets.
df_test_cleaned = tokenize_and_preprocess(
    df_test,
    stop_words=False,
    stemming=False,
    lemmatization=False,
    unslang_bool=True,
    remove_tags_bool=True,
    unelongate_bool=True,
    uncase_bool=True,
    smiley_to_word_bool=True,
)

In [None]:
# Re-encode the labels as binary integers: 1 stays 1, every other value becomes 0.
df_train_cleaned["label"] = (df_train_cleaned["label"] == 1).astype("int64")

# Glove

In [None]:
# Compute GloVe-based tweet representations for the cleaned train and test
# frames. The result is read below through its "vectors" and "label" columns.
df_train_vecs = tweet2vec(df_train_cleaned, "glove")
df_test_vecs = tweet2vec(df_test_cleaned, "glove")

In [None]:
# Hold out 40% of the labelled vectors for evaluation, with a fixed seed
# (random_state=0) so the split is reproducible.
# TODO: consider cross-validation instead of a single split.
x_train, x_test, y_train, y_test = train_test_split(df_train_vecs["vectors"], df_train_vecs["label"], test_size=0.4, random_state=0)

### Logistic Regression

In [None]:
# Logistic regression on the GloVe split. train_test_model is a project helper;
# presumably it fits on (x_train, y_train) and predicts for x_test — TODO confirm.
preds = train_test_model("LogisticRegression", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split; the trailing number is the score the author recorded.
accuracy_score(y_test,preds)  #0.748

In [None]:
# Same helper call, now with all labelled vectors as training data and the
# competition test vectors as the prediction input.
preds_submission = train_test_model("LogisticRegression", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1)

In [None]:
# Hand the -1/1 predictions to the project helper write_submission (presumably
# writes the named CSV — TODO confirm). Trailing numbers are the author's recorded scores.
write_submission(preds_submission,"submission_logreg_glove_.csv") #0.729 0.729

### SVM

In [None]:
# SVM on the GloVe split (same helper and argument order as the logistic regression call).
preds = train_test_model("SVM", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split; the trailing number is the score the author recorded.
accuracy_score(y_test,preds) #0.74

In [None]:
# SVM with all labelled GloVe vectors as training data, predicting for the competition test vectors.
preds_submission = train_test_model("SVM", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1) 

In [None]:
# Display the thresholded predictions as a quick sanity check.
preds_submission

In [None]:
# Pass the -1/1 predictions to write_submission; trailing numbers are the author's recorded scores.
write_submission(preds_submission,"submission_svm_glove.csv") #0.729 0.730

### NN

In [None]:
# Neural network on the GloVe split.
# NOTE(review): this call uses the key "MLPClassifier" and unpacks three return
# values, while the other neural-network calls in this notebook use
# "NeuralNetwork" and a single return value. Verify against train_test_model
# which key and return arity are correct.
preds, loss, best_params = train_test_model("MLPClassifier", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split. The comments below record the score the
# author obtained for two hyper-parameter configurations.
accuracy_score(y_test,preds) 
#solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1,max_iter=4000 
#0.768 

#solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 4), random_state=1,max_iter=4000 
#0.780

In [None]:
# Neural network with all labelled GloVe vectors as training data, predicting for the competition test vectors.
# NOTE(review): the validation cell above uses the key "MLPClassifier" with
# three return values, while this call uses "NeuralNetwork" with one — confirm
# both are accepted by train_test_model.
preds_submission = train_test_model("NeuralNetwork", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1)

In [None]:
# Pass the -1/1 predictions to write_submission. The comments below record the
# scores the author noted for two hyper-parameter configurations.
write_submission(preds_submission,"submission_nn_glove.csv")
#solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1,max_iter=4000 
#0.730 0.734

#solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 4), random_state=1,max_iter=4000 
#0.763 0.769

# Fast-text

In [None]:
# Build the fastText training frame from the cleaned tweets. The cells below
# read its "tweets" and "label" columns and compare labels to "__label__1".
df_train_fasttext = fasttext_label(df_train_cleaned)

In [None]:
# 60/40 split with a fixed seed, then re-join tweets and labels column-wise so
# each side is a single frame for the fastText helper.
x_train, x_test, y_train, y_test = train_test_split(df_train_fasttext["tweets"], df_train_fasttext["label"], test_size=0.4, random_state=0)
train = pd.concat([x_train,y_train],axis=1)
test = pd.concat([x_test,y_test],axis=1)

In [None]:
# Fit fastText on the train frame and predict the held-out frame
# (train_test_fasttext_model is a project helper — behaviour assumed from its name).
preds = train_test_fasttext_model(train,test)

In [None]:
# Translate the fastText label strings into numeric labels:
# "__label__1" becomes 1, anything else becomes -1.
true_labels = [1 if label == "__label__1" else -1 for label in y_test]

In [None]:
# Held-out accuracy of fastText; the comments record the author's scores for
# different preprocessing configurations.
accuracy_score(true_labels,preds) 
# 0.837 with no pre-process
# 0.82 with no stop words, no stemming, no lemmatization
# 0.67 with everything

In [None]:
# fastText with the full labelled frame as training data, predicting for the competition test tweets.
# NOTE(review): the training frame went through fasttext_label but
# df_test_cleaned is passed as-is — confirm the helper accepts an unlabelled
# cleaned frame as its second argument.
preds = train_test_fasttext_model(df_train_fasttext,df_test_cleaned)

In [None]:
# Pass the fastText predictions to write_submission. No thresholding is applied
# here, unlike in the embedding-based sections.
write_submission(preds,"submission_fasttext.csv")

# Word2Vec

In [None]:
# Compute Word2Vec-based tweet representations. This overwrites the GloVe
# frames held in df_train_vecs / df_test_vecs.
df_train_vecs = tweet2vec(df_train_cleaned, "word2vec")
df_test_vecs = tweet2vec(df_test_cleaned, "word2vec")

In [None]:
# Hold out 40% of the labelled Word2Vec vectors for evaluation, with a fixed
# seed (random_state=0). TODO: consider cross-validation instead of a single split.
x_train, x_test, y_train, y_test = train_test_split(df_train_vecs["vectors"], df_train_vecs["label"], test_size=0.4, random_state=0)

### Logistic Regression

In [None]:
# Logistic regression on the Word2Vec split.
preds = train_test_model("LogisticRegression", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split; the trailing number is the score the author recorded.
# NOTE(review): identical to the score noted for GloVe — confirm it was not copied over.
accuracy_score(y_test,preds) #0.748

In [None]:
# Logistic regression with all labelled Word2Vec vectors as training data, predicting for the competition test vectors.
preds_submission = train_test_model("LogisticRegression", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Display the raw predictions before they are thresholded to -1/1.
preds_submission

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1)

In [None]:
# Pass the -1/1 predictions to write_submission under the given file name.
write_submission(preds_submission,"submission_logreg_word2vec.csv")

### SVM

In [None]:
# SVM on the Word2Vec split.
preds = train_test_model("SVM", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split; the trailing number is the score the author recorded.
accuracy_score(y_test,preds) #0.749

In [None]:
# SVM with all labelled Word2Vec vectors as training data, predicting for the competition test vectors.
preds_submission = train_test_model("SVM", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1)

In [None]:
# Pass the -1/1 predictions to write_submission under the given file name.
write_submission(preds_submission,"submission_svm_word2vec.csv")

### NN

In [None]:
# Neural network on the Word2Vec split.
# NOTE(review): the GloVe section calls this helper with "MLPClassifier" and
# unpacks three return values; here the key is "NeuralNetwork" and a single
# value is expected. Verify against train_test_model.
preds = train_test_model("NeuralNetwork", x_train.tolist(),y_train.tolist(),x_test.tolist())

In [None]:
# Accuracy on the held-out split (no score was recorded for this configuration).
accuracy_score(y_test,preds)

In [None]:
# Neural network with all labelled Word2Vec vectors as training data, predicting for the competition test vectors.
preds_submission = train_test_model("NeuralNetwork", df_train_vecs["vectors"].tolist(),df_train_vecs["label"].tolist(),df_test_vecs["vectors"].tolist())

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds_submission = np.where(preds_submission < 0.5, -1, 1)

In [None]:
# Pass the -1/1 predictions to write_submission under the given file name.
write_submission(preds_submission,"submission_nn_word2vec.csv")

# BERT

In [None]:
import tensorflow as tf
# The TensorFlow BERT base class is exported as TFBertModel; the previous
# spelling "TBertModel" does not exist and made this whole import fail.
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification

# Lower-casing tokenizer for the "bert-base-uncased" checkpoint, the same
# checkpoint name used when the classifier is loaded.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case=True)

In [None]:
def mask_inputs_bert(tweets,max_len):
    """Encode tweets into BERT input ids and attention masks.

    Relies on the notebook-level ``tokenizer``. Each tweet is padded or
    truncated to exactly ``max_len`` tokens (special tokens included), so the
    per-tweet lists always stack into rectangular tensors.

    Args:
        tweets: iterable of tweet strings.
        max_len: target sequence length in tokens.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of TensorFlow tensors with one
        row per tweet.
    """
    input_ids = []
    attention_masks = []
    for tweet in tweets:
        encoded_dict = tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            # Cast so a numpy scalar (e.g. from Series.max()) is accepted.
            max_length=int(max_len),
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding="max_length",
            # Explicit truncation keeps over-long tweets from producing rows
            # longer than max_len, which would break tensor conversion.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
    # Return tensors
    input_ids = tf.convert_to_tensor(input_ids)
    attention_masks = tf.convert_to_tensor(attention_masks)
    return input_ids, attention_masks

In [None]:
# BERT is fed the raw (un-preprocessed) tweets: 60/40 split of the original
# training frame with a fixed seed.
x_train,x_test,y_train,y_test = train_test_split(df_train["tweets"], df_train["label"], test_size=0.4, random_state=0)

In [None]:
# Use the longest tweet (in characters) as the sequence length, capped at 512:
# that is the maximum number of positions "bert-base-uncased" supports. The
# int() cast turns the numpy scalar from .max() into a plain Python int.
max_len = min(int(df_train.tweets.str.len().max()), 512)
train_inp,train_masks = mask_inputs_bert(x_train,max_len)
test_inp,test_masks = mask_inputs_bert(x_test,max_len)
# SparseCategoricalCrossentropy with two classes needs class indices 0/1.
# The raw labels are re-encoded the same way as for the cleaned frame
# (1 stays 1, anything else becomes 0); this is a no-op if they are already 0/1.
train_label = tf.convert_to_tensor((y_train == 1).astype("int32").to_numpy())
test_label = tf.convert_to_tensor((y_test == 1).astype("int32").to_numpy())

In [None]:
# Load the pretrained "bert-base-uncased" checkpoint with a 2-class
# sequence-classification head.
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)

In [None]:
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
# epsilon was written as the invalid literal `1e-0.8` (a syntax error); 1e-08 is intended.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

# Pass the `metric` object defined above (the name `metrics` was never defined).
bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

In [None]:
# Fine-tune for 4 epochs, validating on the held-out split. The mask tensors
# are named train_masks / test_masks where they are created; the previous
# singular names were undefined.
history = bert_model.fit([train_inp,train_masks],train_label,batch_size=32,epochs=4,validation_data=([test_inp,test_masks],test_label))

In [None]:
# Predict from the encoded ids and masks; the model cannot consume the raw
# text Series x_test.
# NOTE(review): the result is the raw model output (logits), not class labels —
# take the argmax over the class axis before scoring; confirm the output
# structure for the installed transformers version.
preds = bert_model.predict([test_inp,test_masks])

# Recurrent Neural Networks

In [None]:
# The RNN is trained on the raw tweets and raw labels of the training frame.
x = df_train["tweets"]
y = df_train["label"]

In [None]:
# Keras text tokenizer with default settings.
token = Tokenizer()

In [None]:
# Build the vocabulary from the training tweets, then convert each tweet to a
# sequence of integer word indices.
token.fit_on_texts(x)
seq = token.texts_to_sequences(x)

In [None]:
# Pad/truncate every sequence to a fixed length of 200 indices.
pad_seq = pad_sequences(seq,maxlen=200)

In [None]:
# +1 so that the largest word index is a valid row of the embedding matrix
# (the matrix built below is indexed directly by the word_index values).
vocab_size = len(token.word_index)+1

In [None]:
# Read the pretrained GloVe Twitter vectors into a {word: float32 vector} dict.
# Each line is "<word> <v1> <v2> ...", separated by single spaces.
embedding_vector = {}
# The context manager closes the file even if parsing fails part-way
# (the handle was previously left open).
with open('data/glove.twitter.27B.200d.txt',"r",encoding="utf8") as f:
    for line in tqdm(f):
        value = line.split(' ')
        word = value[0]
        coef = np.array(value[1:],dtype = 'float32')
        embedding_vector[word] = coef

In [None]:
# One 200-dimensional row per vocabulary index. Rows of words that have no
# pretrained vector stay all-zero.
embedding_matrix = np.zeros(shape=(vocab_size, 200))
for word, i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is None:
        # Word is not in the pretrained vocabulary: keep the zero row.
        continue
    embedding_matrix[i] = embedding_value

In [None]:
# Convert the competition test tweets with the tokenizer fitted on the
# training tweets, then pad/truncate to the same length (200) as the training sequences.
x_test = token.texts_to_sequences(df_test['tweets'])
test_seq = pad_sequences(x_test, maxlen=200)

In [None]:
# Train the LSTM variant on the padded training sequences and predict for the
# padded test sequences (train_test_rnn is a project helper — behaviour assumed
# from its name and arguments).
# NOTE(review): `y` holds the raw labels from df_train, not the 0/1 re-encoded
# ones — confirm the helper handles that label encoding.
preds = train_test_rnn("RNN LSTM", embedding_matrix, vocab_size, pad_seq, y, test_seq)

In [None]:
# Map predictions below 0.5 to -1 and everything else to 1.
preds  = np.where(preds < 0.5, -1, 1)

In [None]:
# Pass the -1/1 predictions to write_submission under the given file name.
write_submission(preds,"submission_rnn_lstm.csv")