In [7]:
!pip install gensim



In [64]:
from baseline import *
from embeddings import * 
from preprocessing import * 
from submission import *
from helpers import *

In [28]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
# Root folder of the project data; the preprocessing resources live in a
# sub-folder of it.
PATH_DATA = './data/'
PATH_PREPROCESSING = f'{PATH_DATA}preprocessing/'

# Preprocessing

In [11]:
# Load the positive and negative training tweets from PATH_DATA.
# small_dataset=1 presumably selects the reduced dataset rather than the full
# one — TODO confirm against load_tweets.
train_pos, train_neg = load_tweets(PATH_DATA, small_dataset=1)

In [12]:
# Prepare the resources used by the preprocessing step.
# Slang: build the slang dictionary, hand it to slang_tuple_to_json (which
# presumably writes slang.json), then read slang.json back.
lines = slang_dict_to_tuple(PATH_PREPROCESSING)
slang_dict = slang_tuple_to_dict(lines)
slang_tuple_to_json(slang_dict, PATH_PREPROCESSING)
# Read through a context manager so the file handle is always closed
# (a bare open(...).read() leaves it open until garbage collection).
with open(PATH_PREPROCESSING + 'slang.json', 'r') as slang_file:
    slang_list = json.load(slang_file)
# Stopwords vocabulary
nltk.download('stopwords')
# Tokenizer that keeps only runs of word characters, i.e. removes punctuation
tokenizer = nltk.RegexpTokenizer(r"\w+")
# Remove words with low occurrences
# https://www.pythonprogramming.in/find-frequency-of-each-word-from-a-text-file-using-nltk.html
nltk.download('webtext')

file saved


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Younes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\Younes\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

In [18]:
# Count the words of the positive and negative training dataframes and save
# each count to its own text file.
# Make sure these files were produced from the right dataset (full or small)
# before later cells use them.
word_occ_tofile(
    PATH_PREPROCESSING,
    file_name='count_word_pos.txt',
    df=train_pos,
)
word_occ_tofile(
    PATH_PREPROCESSING,
    file_name='count_word_neg.txt',
    df=train_neg,
)

In [19]:
# Load the remaining resources required by the preprocessing step:
# the contraction list, an English Snowball stemmer and the stopword list.
contraction_list = load_contractions(PATH_PREPROCESSING)
stemmer = SnowballStemmer("english")
all_stopwords = load_stopwords()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Younes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Build one word filter per class from the count files found in
# PATH_PREPROCESSING (presumably the words that occur rarely — see
# low_occuring_words for the exact threshold).
filter_words_pos = low_occuring_words(
    PATH_PREPROCESSING, file_name='count_word_pos.txt')
filter_words_neg = low_occuring_words(
    PATH_PREPROCESSING, file_name='count_word_neg.txt')

In [21]:
# Combine the positive and negative filters into the single filter used by the
# preprocessing (assumes both operands concatenate with `+`, e.g. lists —
# TODO confirm; duplicates are not removed here).
filter_words_all = filter_words_pos + filter_words_neg

In [23]:
# Read the test tweets as one fixed-width 'Tweet' column per line; the column
# spans the half-open character interval [0, 280), so longer lines are cut.
test_data = pd.read_fwf(
    PATH_DATA + 'twitter-datasets/test_data.txt',
    header=None,
    names=['Tweet'],
    colspecs=[(0, 280)],
)

In [24]:
# Run the same preprocessing on the three dataframes. The resources are shared,
# so they are grouped once and unpacked positionally in every call.
# NOTE: each name is rebound to its preprocessed version, so re-running this
# cell would preprocess already-preprocessed data.
_preprocessing_resources = (
    contraction_list,
    stemmer,
    filter_words_all,
    slang_list,
    all_stopwords,
    tokenizer,
)
train_pos = preprocessing(train_pos, *_preprocessing_resources)
train_neg = preprocessing(train_neg, *_preprocessing_resources)
test_data = preprocessing(test_data, *_preprocessing_resources)

100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 869951.11it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 433156.26it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 418624.97it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 901244.98it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 162955.39it/s]
100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 13956.82it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 272132.27it/s]
100%|██████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 423971.56it/s]
100%|███████████████████████████████████

# Embeddings

## GloVe

In [55]:
# Load the pretrained GloVe Twitter vectors. The path is built from PATH_DATA,
# like the other data files, so relocating the data folder needs one change.
glove_model = load_glove_model(
    path_pretrained_embeddings=PATH_DATA + 'twitter-datasets/glove-twitter-25.gz')

In [56]:
# Convert each preprocessed dataframe into GloVe-based features with the
# loaded model (the exact per-tweet aggregation is defined in df_to_GloVe).
train_pos_embeddings = df_to_GloVe(train_pos, model1=glove_model)
train_neg_embeddings = df_to_GloVe(train_neg, model1=glove_model)
test_data_embeddings = df_to_GloVe(test_data, model1=glove_model)

100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [00:03<00:00, 30648.51it/s]
100%|███████████████████████████████████████████████████████████████████████| 100000/100000 [00:03<00:00, 26543.98it/s]
100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 26470.17it/s]


In [57]:
# The return values are discarded, so clean_cols presumably modifies each
# embeddings dataframe in place — TODO confirm.
clean_cols(train_pos_embeddings)
clean_cols(train_neg_embeddings)
clean_cols(test_data_embeddings)

In [58]:
# Build the features x and labels y from both classes, then standardize x.
# mean_x/std_x are None here (presumably making the helper compute them from
# x); the returned means and stds are kept for scaling the test set.
x, y = df_to_numpy_features(train_pos_embeddings, train_neg_embeddings)
x, means, stds = standardize_cols(x, mean_x=None, std_x=None)

In [59]:
# Standardize the test features with the means/stds returned for the training
# features, so both sets share the same scaling. The returned mean/std are
# not needed again and are discarded.
test_d = test_data_embeddings.to_numpy(copy=True)
test_x, _, _ = standardize_cols(test_d, means, stds)

## TF-IDF

In [42]:
# TF-IDF features over both training classes, limited to at most 10000
# features and to unigrams (ngram_range=(1, 1)).
vectors, vectorizer = tf_idf_embedding(
    train_pos,
    train_neg,
    max_features=10000,
    ngram_range=(1, 1),
)

In [43]:
# NOTE: this rebinds x and y, replacing any features assigned to them by the
# GloVe cells; the model cells below use whichever embedding cell ran last.
x, y = add_label_tfidf(vectors)

In [45]:
# Display the summary of the current feature matrix.
x

<200000x9121 sparse matrix of type '<class 'numpy.float64'>'
	with 1076769 stored elements in Compressed Sparse Row format>

# Baseline models

## Naive Bayes

In [None]:
# Cross-validate naive Bayes with 5 splits. glove=False presumably tells the
# helper that x holds TF-IDF rather than GloVe features — TODO confirm.
naive_bayes_cv(x, y, splits=5, glove=False)

In [49]:
# Build the naive Bayes model from the current features, then score it on
# those same features and labels (so this is not a held-out score).
# This cell was last run with the tf-idf embeddings.
gnb = naive_bayes(x, y, glove=False)
gnb.score(x, y)

0.750245

## Logistic regression

In [None]:
# Cross-validated search for logistic regression (5 splits) over two solvers,
# the l2 penalty and five values of C.
logistic_regression_cv(
    x,
    y,
    solvers=['lbfgs', 'saga'],
    penalty=['l2'],
    c_values=[100, 10, 1.0, 0.1, 0.01],
    splits=5,
)

In [52]:
# Build the logistic regression model (saga solver, l2 penalty, C=1.0) from
# the current features, then score it on those same features and labels.
# This cell was last run with the tf-idf embeddings.
lr = logistic_regression(
    x, y,
    solver='saga',
    penalty='l2',
    c_value=1.0,
)
lr.score(x, y)

0.78546

## MLP

In [None]:
# Cross-validated search for the MLP (5 splits) over the adam solver, three
# learning rates, three activations and two iteration budgets.
mlp_cv(
    x,
    y,
    solvers=['adam'],
    lrs=[0.1, 0.01, 0.001],
    activations=['tanh', 'relu', 'logistic'],
    max_iters=[10, 50],
    splits=5,
)

In [67]:
# Build the MLP (adam, lr=0.001, tanh, 50 iterations) from the current
# features, then score it on those same features and labels.
# This cell was last run with the glove embeddings.
mlp1 = mlp(
    x, y,
    solver='adam',
    lr=0.001,
    act='tanh',
    max_iters=50,
)
mlp1.score(x, y)

0.640955