# Exploration Notebook v4 - Jacopo

Writing this concurrently as v3 and v3_side

Lets go back and work on word embedding.

We have a god grasp of the whole problem and confidence that with the right amount of tweaking around we can pass the 0.9 treshold, nonetheless while we train a robust classifier we can also try to improve the word embedding here in this notebook. More precisely, we can leveage huge models like BERT and try to use them to improve our word embedding.

In [12]:
# helpers
def build_feature_matrix(df, vocab, embeddings, mode='avg'):
    X = np.zeros((df.shape[0], embeddings.shape[1]))
    for i, tweet in enumerate(df['tweet']):
        words = tweet.split()
        for word in words:
            if word in vocab:
                X[i] += embeddings[vocab[word]]
        if mode == 'avg':
            X[i] /= len(words)
        elif mode == 'sum':
            pass
        else:
            raise ValueError('Unknown mode: {}'.format(mode))
    return X
    
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'], on_bad_lines='skip')
    df_train_neg['label'] = -1
    df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train   

def load_test_data():
    # Load test data: id, tweet for each row
    data_path = 'data/twitter-datasets/'
    df_test = pd.read_csv(data_path + 'test_data.txt', header=None, names=['line'], sep='\t')
    # Extract id and tweet, limit split by 1 so we don't split the tweet (this is v0, at least we keep it intact)
    df_test['id'] = df_test['line'].apply(lambda x: x.split(',',1)[0]) 
    df_test['tweet'] = df_test['line'].apply(lambda x: x.split(',',1)[1])
    df_test = df_test.drop('line', axis=1)
    return df_test

def predict_test_data(X_test, classifier, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = classifier.predict(X_test)
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x == 0 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test
    
def predict_test_data_pipeline(df_test, pipe, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = pipe.predict(df_test['tweet'])
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x == 0 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test

def train_test(clf, X_train, y_train, X_eval=None, y_eval=None, cv=None):
    from sklearn.metrics import accuracy_score, f1_score
    if X_eval is None:
        from sklearn.model_selection import train_test_split
        X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2)
    if cv is not None:
        from sklearn.model_selection import cross_val_score
        scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True)
        print('Cross validation Accuracy Scores: ', scores)
        print('Cross validation mean score: ', scores.mean())
        print('Cross validation std score: ', scores.std())
        clf.fit(X_train, y_train)
        return clf
    clf.fit(X_train, y_train)
    print('Training set size: ', X_train.shape, ' Evaluation set size: ', X_eval.shape)
    print('Metrics on evaluation set: ')
    test(clf, X_eval, y_eval)
    print('Metrics on training set to check overfitting/triviality of model: ')
    test(clf, X_train, y_train)
    from sklearn.metrics import confusion_matrix
    print('Confusion matrix: ')
    print(confusion_matrix(y_eval, clf.predict(X_eval)))
    return clf

def test(clf, X_eval, y_eval):
    from sklearn.metrics import accuracy_score, f1_score
    y_pred = clf.predict(X_eval)
    print('Accuracy: ', accuracy_score(y_eval, y_pred))
    print('F1 score: ', f1_score(y_eval, y_pred))
    return y_pred

In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC

In [4]:
# XGBoost
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


  from pandas import MultiIndex, Int64Index


In [21]:
# Load data, lets work on partial data for now, we'll come back to this later
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
# Load vectorization and classifier obtained from the previous notebook as a reference
with open('data/out/trained/tfidf_vectorizer-linSVC-pipeline-v2_4.pickle', 'rb') as f:
    pipe = pickle.load(f)

# we know that this achieves acc: 0.848	f1: 0.850 and runs in less than 3min on full data from scratch
# let's train a linear SVC on the partial data and see how it performs so we can compare and iterate
svm = LinearSVC()
vec_pipe = pipe.steps[0][1]
svm_pipe = pipe.steps[1][1]
print(vec_pipe, svm_pipe)

# check svm_pipe performance on partial data, don't even modify the vectorizer
X_train, X_eval, y_train, y_eval = train_test_split(df_train['tweet'], df_train['label'], test_size=0.2)
X_train = vec_pipe.transform(X_train)
X_eval = vec_pipe.transform(X_eval)


Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4)) LinearSVC()


In [22]:
# straight up xgboost
xgb = XGBClassifier()
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)





KeyboardInterrupt: 

In [23]:
# that's neat, fast and not overfitting, let's try to improve it
# naive bayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = train_test(nb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 3018685)  Evaluation set size:  (39394, 3018685)
Metrics on evaluation set: 
Accuracy:  0.8108341371782505
F1 score:  0.8196515004840271
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.930896837081789
F1 score:  0.9329127415886785
Confusion matrix: 
[[15008  4973]
 [ 2479 16934]]


In [13]:
# honestly bayes is fucking king
# whatever, let's keep xgboost and try to improve it just for fun at this point
# aparently xgboost is not advised when data is overparametrized but ok
xgb = XGBClassifier(
    max_depth=10,
    alpha=0,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)
    



Training set size:  (157576, 3018685)  Evaluation set size:  (39394, 3018685)
Metrics on evaluation set: 
Accuracy:  0.8019495354622531
F1 score:  0.8114730330562537
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8461821597197543
F1 score:  0.8539703578744426
Confusion matrix: 
[[14801  5054]
 [ 2748 16791]]


In [14]:
xgb = XGBClassifier(
    max_depth=10,
    alpha=0.1,
    use_label_encoder=False,
    eval_metric='error',
    subsample=0.5,
    verbosity=2,
    num_parallel_tree=1,
    binary='logistic',

)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Parameters: { "binary" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[00:26:20] INFO: /var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_eek2t0c4ro/croots/recipe/xgboost-split_1659548960591/work/src/tree/updater_prune.cc:101: tree pruning end, 360 extra nodes, 0 pruned nodes, max_depth=10
[00:26:21] INFO: /var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_eek2t0c4ro/croots/recipe/xgboost-split_1659548960591/work/src/tree/updater_prune.cc:101: tree pruning end, 388 extra nodes, 0 pruned nodes, max_depth=10
[00:26:22] INFO: /var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_eek2t0c4ro/croots/recipe/xgboost-split_1659548960591/work/src/tree/updater_prune.cc:101: tree pruning end, 314 extra nodes, 0 pruned nodes, max_depth=10
[00:26:22] INFO: /

In [15]:
xgb = XGBClassifier(
    max_depth=10,
    alpha=0.1,
    use_label_encoder=False,
    eval_metric='error',
    subsample=0.2,
    verbosity=0,
    num_parallel_tree=10,
    objective='binary:logistic',

)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 3018685)  Evaluation set size:  (39394, 3018685)
Metrics on evaluation set: 
Accuracy:  0.8049956846220236
F1 score:  0.8127985183741105
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8289841092552165
F1 score:  0.8357950667836599
Confusion matrix: 
[[15035  4820]
 [ 2862 16677]]


In [4]:
# Ok, let's try to improve the vectorizer
# let's first use again the whole thing on full data
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
X_train, X_eval, y_train, y_eval = train_test_split(df_train['tweet'], df_train['label'], test_size=0.2)
X_train_vec = vec_pipe.transform(X_train)
X_eval_vec = vec_pipe.transform(X_eval)
nb = MultinomialNB()
nb = train_test(nb, X_train, y_train, X_eval, y_eval)
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


NameError: name 'vec_pipe' is not defined

In [17]:
# at least now we know bayes is king but still a bit less complex
# than linear SVC,
# and we also know linear svc is overfitting but not as much as it could be
# let's try with xgboost
xgb = XGBClassifier()
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)



Training set size:  (1966636, 3018685)  Evaluation set size:  (491659, 3018685)
Metrics on evaluation set: 
Accuracy:  0.7926164272392044
F1 score:  0.8062163127846071
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7955818972092447
F1 score:  0.8090283253875811
Confusion matrix: 
[[177596  70222]
 [ 31740 212101]]


In [18]:
xgb = XGBClassifier(
    max_depth=10,
    alpha=0.1,
    use_label_encoder=False,
    eval_metric='error',
    subsample=0.2,
    verbosity=0,
    objective='binary:hinge',
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (1966636, 3018685)  Evaluation set size:  (491659, 3018685)
Metrics on evaluation set: 
Accuracy:  0.7765768550967236
F1 score:  0.7981469979676514
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7829135640759144
F1 score:  0.8038922967104545
Confusion matrix: 
[[164636  83182]
 [ 26666 217175]]


In [20]:
xgb = XGBClassifier(
    max_depth=6,
    alpha=0,
    use_label_encoder=False,
    eval_metric='logloss',
    subsample=0.5,
    verbosity=0,
    objective='binary:logistic',
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

In [5]:
import gensim
#word_vectors = gensim.models.KeyedVectors.load_word2vec_format('data/glove/glove.twitter.27B.200d.txt', binary=False)
#word_vectors.save('data/glove_custom/glove.twitter.27B.200d.txt')

In [6]:


v0_embeddings = np.load('data/glove_custom/embeddings.npy')
v4_embeddings = np.load('data/glove_custom/glove.twitter.27B.200d.txt.vectors.npy')
print(v0_embeddings.shape)
print(v4_embeddings.shape)


(101298, 150)
(1193517, 200)


In [7]:
# fasttext
from gensim.models import FastText
#fasttexxt = FastText()
#fasttexxt.build_vocab(corpus_iterable=df_train['tweet'])
#fasttexxt.train(corpus_iterable=df_train['tweet'], total_examples=fasttexxt.corpus_count, epochs=10)

In [11]:
#fasttexxt.save('data/fasttext/fasttext.model')

glove = gensim.models.KeyedVectors.load_word2vec_format('data/glove/glove.twitter.27B.200d.txt', binary=False)
print(glove.most_similar('love'))

[('you', 0.8460860252380371), ('much', 0.7890047430992126), ('always', 0.7601684331893921), ('know', 0.7598055005073547), ('my', 0.7519950270652771), ('and', 0.7513090372085571), ('loves', 0.7512385249137878), ('life', 0.7443934679031372), ('it', 0.7426838874816895), ("n't", 0.7408117055892944)]


  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


In [12]:
print(glove.most_similar('hate'))
print(glove.most_similar('happy'))
print(glove.most_similar(':)'))
print(glove.most_similar(':('))

[('people', 0.7922294735908508), ("n't", 0.7866110801696777), ('why', 0.7847471237182617), ('fuck', 0.7806913256645203), ('dont', 0.770850419998169), ('when', 0.7697439193725586), ('swear', 0.7657259702682495), ('really', 0.7627530694007874), ('stupid', 0.7585508227348328), ('seriously', 0.7504260540008545)]
[('birthday', 0.8998554944992065), ('day', 0.8070886731147766), ('bday', 0.7734537720680237), ('wish', 0.7571845650672913), ('merry', 0.726503849029541), ('love', 0.7250142693519592), ('year', 0.7109401226043701), ('you', 0.7065563797950745), ('hope', 0.7004973292350769), ('thank', 0.6997925043106079)]


KeyError: "Key ':)' not present in vocabulary"

In [13]:
# augment glove with n-grams
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# add bigrams and trigrams to docs (only ones that appear 5 times or more).
bigram = Phrases(df_train['tweet'], min_count=5)
trigram = Phrases(bigram[df_train['tweet']], min_count=10)
quadrigram = Phrases(trigram[bigram[df_train['tweet']]], min_count=100)

print(trigram[bigram['i love you']])

['i', ' ', 'l', 'o', 'v', 'e', ' ', 'y', 'o', 'u']


In [15]:
print(bigram['i love you'])
print(trigram['i love you baby'])


['i', ' ', 'l', 'o', 'v', 'e', ' ', 'y', 'o', 'u']
['i', ' ', 'l', 'o', 'v', 'e', ' ', 'y', 'o', 'u', ' ', 'b', 'a', 'b', 'y']


In [16]:
print(bigram[df_train['tweet'][0]])

['<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15', 'vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>']


In [17]:
print(bigram['i_love_you'])

['i', '_', 'l', 'o', 'v', 'e', '_', 'y', 'o', 'u']


# Skipping a step but trying directly to combine word embeddings with tfidf vectorizer
idea: word embedding + tfidf vectorizer on top to have a phrase embedding/vector

In [29]:
# first lets settle on an embedding and make sure we can use it
# let's try again the glove one, but at this point training a fasttext model on our data
# seems less of a hussle than trying to make the glove one work
import gensim
from gensim.models import FastText
# HERE we include grams and subword windows -- actualllly gemsim does not support this but the original faceboook fasttext does
# whatever, with this word embedding we are still retaiing context and a certain 'meaning' of the words
# the real context intended as phrases (tweet) will be recovered hopefully with a combination of word embeddings in tf-idf that we'll try
# we can there use n-grams on the word embeddings :)
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
# fasttext = FastText(
#     corpus_file=None, # df_train['tweet'] ?
#     vector_size=20, 
#     window=5, 
#     epochs=10,
#     min_count=5,
#     sg=0, # sg ({1, 0}, optional) – Training algorithm: skip-gram if sg=1, otherwise CBOW.
#     negative=0, # negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
#     cbow_mean=0, # cbow_mean ({1,0}, optional) – If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
#     min_n=None, # min_n (int, optional) – Min length of char ngrams to be used for training word representations.
#     max_n=None, # max_n (int, optional) – Max length of char ngrams to be used for training word representations. Set min_n to be greater than max_n to avoid char ngrams being used.
#     word_ngrams=1, #  (int, optional) – In Facebook’s FastText, “max length of word ngram” - but gensim only supports the default of 1 (regular unigram word handling).
# )

fasttext = FastText(vector_size=20, window=5, min_count=5, sg=0, negative=0, cbow_mean=0, word_ngrams=1)
fasttext.build_vocab(corpus_iterable=df_train['tweet'])

#fasttext.save('data/fasttext/fasttext_20.model')
print(fasttext.wv.most_similar('love'))

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
[('“', 0.5666552186012268), ('h', 0.3975861072540283), ('>', 0.3211832642555237), (':', 0.2996082007884979), ('x', 0.29522085189819336), ('{', 0.28220418095588684), ('}', 0.2754124701023102), ('r', 0.25764912366867065), ('(', 0.2549425959587097), ('^', 0.23890458047389984)]


In [31]:
# noisy as hell but at least we got it to work
fasttext.train(corpus_iterable=df_train['tweet'], total_examples=fasttext.corpus_count, epochs=10)
print(fasttext.wv.most_similar('love'))

[('“', 0.5666552186012268), ('h', 0.3975861072540283), ('>', 0.3211832642555237), (':', 0.2996082007884979), ('x', 0.29522085189819336), ('{', 0.28220418095588684), ('}', 0.2754124701023102), ('r', 0.25764912366867065), ('(', 0.2549425959587097), ('^', 0.23890458047389984)]


In [32]:
# so absolutely nothing changed??
print(fasttext.wv.most_similar('hate'))

[('2', 0.41747480630874634), ('{', 0.33628779649734497), ('®', 0.32545962929725647), ('r', 0.2515483796596527), (';', 0.24647396802902222), ('0', 0.23436297476291656), ('w', 0.2108132392168045), ('3', 0.19439783692359924), ('~', 0.19127322733402252), ('“', 0.18923039734363556)]


In [34]:
fasttext = FastText(
    min_count = 5,
    window = 5,
    vector_size = 20,
    min_n=3,
)
fasttext.build_vocab(corpus_iterable=df_train['tweet'])
print(fasttext.max_vocab_size)
print(fasttext.wv.most_similar('love'))
fasttext.train(corpus_iterable=df_train['tweet'], total_examples=fasttext.corpus_count, epochs=10)
print(fasttext.wv.most_similar('love'))

None
[('“', 0.5666552186012268), ('h', 0.3975861072540283), ('>', 0.3211832642555237), (':', 0.2996082007884979), ('x', 0.29522085189819336), ('{', 0.28220418095588684), ('}', 0.2754124701023102), ('r', 0.25764912366867065), ('(', 0.2549425959587097), ('^', 0.23890458047389984)]
[('&', 0.5322758555412292), (' ', 0.4602575898170471), (',', 0.456903874874115), ('z', 0.4248853325843811), ('y', 0.42446547746658325), ("'", 0.41426488757133484), ('g', 0.3792472779750824), ('j', 0.3564785122871399), ('d', 0.3537110388278961), ('n', 0.3388735055923462)]


In [47]:
# ok so, clearly we need to build a better vocab anyway
# lets print a few words and see what we get
print(fasttext.wv.index_to_key[:10])
print(fasttext.wv.index_to_key[-10:])
print(fasttext.wv.vectors_ngrams.shape)
print(fasttext.wv.vectors.shape) # we have 75 words in the vocab?
print(fasttext.wv.get_mean_vector('love'))
print(fasttext.wv.key_to_index['love'])
print(fasttext.wv.key_to_index['hate'])



[' ', 'e', 'o', 't', 'a', 's', 'i', 'r', 'n', 'l']
['@', '\\', '`', '}', '{', '“', '®', '’', '\x13', '×']
(2000000, 20)
(75, 20)
[ 0.03919867 -0.25482583  0.21448834  0.22237524  0.12658215  0.17024063
  0.27242953  0.08672871  0.062164    0.08897415  0.01590549 -0.10961515
 -0.0808576   0.10215715  0.26773533  0.09722336 -0.32588804  0.15129927
  0.1893925  -0.18371277]


KeyError: 'love'

In [49]:
# tokenize
from nltk.tokenize import TweetTokenizer
#tokenized = TweetTokenizer().tokenize(df_train['tweet'][])
#print(tokenized)
#fasttext.build_vocab(corpus_iterable=df_train['tweet'].apply(lambda x: x.split(' ')))

TypeError: expected string or buffer

In [52]:
df_train['tweet']
print(df_train.info())
df_train['tweet'] = df_train['tweet'].astype(str)
print(df_train.info())


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2458295 entries, 0 to 1239639
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   tweet   object
 1   label   int64 
dtypes: int64(1), object(1)
memory usage: 88.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2458295 entries, 0 to 1239639
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   tweet   object
 1   label   int64 
dtypes: int64(1), object(1)
memory usage: 88.5+ MB
None


In [54]:
from nltk.tokenize import TweetTokenizer
tokenized = TweetTokenizer().tokenize(df_train['tweet'].iloc[0])
print(tokenized)

['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']


In [55]:
tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)
df_train['tweet'] = df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))
print(df_train['tweet'].iloc[0])

['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']


In [57]:
# let's go back to fasttext
fasttext = FastText(
    min_count = 5,
    window = 5,
    vector_size = 20,
    min_n=3,
)
fasttext.build_vocab(corpus_iterable=df_train['tweet'])
print(fasttext.wv.vectors_ngrams.shape)
print(fasttext.wv.vectors.shape) 

(2000000, 20)
(101317, 20)


In [70]:
print(fasttext.corpus_count)
print(fasttext.corpus_total_words)
#fasttext.train(corpus_iterable=df_train['tweet'], total_examples=fasttext.corpus_count, epochs=10)
print(fasttext.wv.most_similar('love'))

2458295
39285957
[("m'love", 0.9681973457336426), ('i.love', 0.9674108624458313), ('jlove', 0.9663768410682678), ('1love', 0.9662020802497864), ('lovvve', 0.9587792158126831), ('iilove', 0.9561951160430908), ('llove', 0.9544649124145508), ('lovve', 0.9524462819099426), ('looove', 0.9511415362358093), ('loveeed', 0.9445867538452148)]


In [59]:
# now we are talking
print(fasttext.wv.most_similar('hate'))


[('ihate', 0.960334837436676), ('#nohate', 0.9171327352523804), ('dislike', 0.9118636250495911), ('hatee', 0.911791205406189), ('#ihate', 0.9037554860115051), ('haterz', 0.8958828449249268), ('haterzz', 0.8894347548484802), ('#dislike', 0.886340320110321), ('hateee', 0.8822832703590393), ('lovehate', 0.87751305103302)]


In [67]:
print(fasttext.wv.index_to_key[:10])
print(fasttext.wv.index_to_key[-10:])
print(fasttext.wv.most_similar(':)')) # these shouldn't be too acurate, afterall they should not be present
print(fasttext.wv.most_similar(':('))
print(fasttext.wv.most_similar(':-)')) # but this should, and is!
print(fasttext.wv.most_similar(':-('))

['<user>', '!', 'i', 'the', ',', '.', 'to', 'you', '(', '<url>']
['tapers', 'penuh', '19001', 'boaters', 'frazzle', '#422', 'beantown', 'farida', 'christiana', 'freeney']
[("8':", 0.8468958735466003), ('#nss28', 0.8268639445304871), ('3q', 0.8250125646591187), ('6u', 0.8229467272758484), ('[=', 0.8187487721443176), ('_7', 0.8165414929389954), ('n_n', 0.8092526793479919), ('’', 0.8038870096206665), (':{', 0.7867059707641602), ('pz', 0.7862887382507324)]
[('9j', 0.9037322402000427), (':*)', 0.8949961066246033), ("8':", 0.8809409737586975), ('p0', 0.8714581727981567), ('6u', 0.8621630072593689), ('p:', 0.862082302570343), ('pq', 0.8602790236473083), ('r51e', 0.8591496348381042), ('7v', 0.8559269905090332), ('{:', 0.854961633682251)]
[(':-p', 0.8968167901039124), (':-\\', 0.8957145810127258), (':-|', 0.8839971423149109), (':-@', 0.8579592704772949), (':-[', 0.8515523076057434), (':-d', 0.8502748608589172), ('eeyah', 0.8284174203872681), ('eeeh', 0.8256227374076843), ('aahw', 0.802894055843

In [68]:
# Great, let's now expand this to include the ngrams
print(fasttext.wv.most_similar('I love you'))

[('weloveyou', 0.9377278089523315), ('loveyou', 0.9368027448654175), ('iloveyoussosomuch', 0.9240853190422058), ('#istillloveyou', 0.9231034517288208), ('loveyousomuch', 0.9222390651702881), ('loveeyou', 0.9198563694953918), ('iloveyou', 0.9183106422424316), ('iloveyousomuch', 0.9162368774414062), ('#iloveyousomuch', 0.9134881496429443), ('iloveyouguys', 0.9127840399742126)]


In [73]:

fasttext = FastText(
    min_count = 5,
    window = 5,
    vector_size = 100,
    min_n=3, 
    max_n=6,
    sg=0, #cbow
)
fasttext.build_vocab(corpus_iterable=df_train['tweet'])
fasttext.train(corpus_iterable=df_train['tweet'], total_examples=fasttext.corpus_count, epochs=10)
print(fasttext.wv.most_similar('love'))
print(fasttext.wv.most_similar("I don't love you"))
print(fasttext.wv.most_similar('I love you'))
print(fasttext.wv.most_similar('I hate wanting you so much'))
print(fasttext.wv.most_similar('I hate loving you so much <3'))



[("m'love", 0.9359018206596375), ('1love', 0.9302533864974976), ('loveeed', 0.9280291795730591), ('jlove', 0.9263998866081238), ('i.love', 0.9255800247192383), ('loveed', 0.9235557317733765), ('lovehate', 0.9079862833023071), ('llove', 0.898972749710083), ('loveu', 0.8952510356903076), ('#1love', 0.8940567970275879)]
[('loveeyou', 0.8116405010223389), ('loveyou', 0.8003208041191101), ('#youloveme', 0.7956029176712036), ("idon't", 0.7909086346626282), ('me.you', 0.7894436120986938), ('too.you', 0.7875821590423584), ("don'ts", 0.7851004004478455), ("don't", 0.7849775552749634), ('4you', 0.783201277256012), ("d'you", 0.782477617263794)]
[('loveyou', 0.8963254690170288), ('loveeyou', 0.893829345703125), ('weloveyou', 0.8708831071853638), ('#istillloveyou', 0.858667254447937), ('iloveyou', 0.8488462567329407), ('loveyouu', 0.8444809317588806), ('me.you', 0.8437607288360596), ('#iloveyou', 0.8336917161941528), ('#weloveyou', 0.83107590675354), ('loveyouliam', 0.8277154564857483)]
[('wanting'

In [1]:
# save
#fasttext.save('data/fasttext/fasttext_100.model')
# load
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_100.model')
# let's use this with simple

In [6]:
import numpy as np
import pandas as pd
# let's test  this, later we will try to combine it to expand to ngrams for words
# for now we do a simple average of the word vectors in the tweet
def get_tweet_vector(tweet, vectorizer, dim=100):
    tweet_vector = np.zeros(dim)
    vec = vectorizer
    for word in tweet:
        tweet_vector += vec.get_vector(word)
    tweet_vector /= len(tweet)
    return tweet_vector

def bld_feature_matrix(df, vectorizer, dim=100, tokenizer=None):
    from nltk.tokenize import TweetTokenizer
    if tokenizer is None:
        tokenizer = TweetTokenizer(
            preserve_case=True,
            reduce_len=True,
        )
    df['tweet'] = df['tweet'].apply(lambda x: tokenizer.tokenize(x))
    X = np.zeros((df.shape[0], dim))
    for i, tweet in enumerate(df['tweet']):
        X[i] = get_tweet_vector(tweet, vectorizer, dim)
    return X

df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
X_train = bld_feature_matrix(df_train, fasttext.wv, dim=100)

print(X_train.shape)
print(X_train[0])

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
(2458295, 100)
[ 0.97016688  0.20003064 -4.03440614  0.45517573 -0.28249551  0.35427921
  1.8911416   0.20286558  0.63157106 -2.87167854 -3.39766136  1.92036216
 -1.14403208  3.09840214  0.55869466 -1.10047139 -1.8481549   0.70465623
  0.27437835  1.03170613 -2.53220977 -1.50201934 -0.24134134 -0.37451546
 -0.80676103 -1.66658795 -2.42787469 -0.95312001  2.69939766 -1.00389883
 -1.17416897 -0.82312816 -0.32662999  0.2079889   1.88435278 -0.76844877
 -0.02880284  2.11518678 -0.04662244  0.1312566   2.15882026  0.29642264
 -1.70965011 -0.59289174  0.21062202  2.45253244 -1.8116421  -0.15546217
  1.31620138 -0.63307249 -0.22296252  1.92281728 -0.52396292  0.18152309
  0.40592332 -1.27735976  1.10320234  0.878496   -0.15603699 -1.65937777
  0.06945011 -0.87143017 -1.3542627   0.80353592  0.74737335 -0.04481394
 -2.40723957  2.40495597  0.0172052  -2.15293321  0.04629602 -2.45589297
  2.59419857  

In [7]:
# now usual split and train and eval
X_train, X_eval, y_train, y_eval = train_test_split(X_train, df_train['label'], test_size=0.2)
print(X_train.shape)
print(X_eval.shape)
print(y_train.shape)
print(y_eval.shape)

(1966636, 100)
(491659, 100)
(1966636,)
(491659,)


In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
nb = GaussianNB()
nb = train_test(nb, X_train, y_train, X_eval, y_eval)
linsvc = LinearSVC()
#linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (1966636, 100)  Evaluation set size:  (491659, 100)
Metrics on evaluation set: 
Accuracy:  0.6260741692921313
F1 score:  0.6828372885800447
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6255956872547843
F1 score:  0.682921703751583
Confusion matrix: 
[[109911 138399]
 [ 45445 197904]]


In [9]:
# linear svc usually is very fast but it's having problems with scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_eval_scaled = scaler.transform(X_eval)
linsvc = LinearSVC()
# still not working in reasonable time
#linsvc = train_test(linsvc, X_train_scaled, y_train, X_eval_scaled, y_eval)

In [15]:
print(X_train_scaled.shape)
print(X_eval_scaled.shape)
print(X_train_scaled[0])
# map to -1, 1
y_train_1 = y_train.apply(lambda x: -1 if x == 0 else 1)
y_eval_1 = y_eval.apply(lambda x: -1 if x == 0 else 1)


(1966636, 100)
(491659, 100)
[-0.30796533 -0.41258597 -1.46436788  1.65222853 -0.53770074 -0.09499411
  0.71472956 -1.10555175  0.56355063 -0.78662993 -0.16649955  1.2987345
 -0.81053144  0.66438871  0.22367088 -1.16044089 -0.8451649   0.57563337
  1.67690981  0.24567452 -0.46736754  0.58348306  0.74724952  1.26649956
 -0.1019182  -0.7953835  -0.22728432  0.4571953  -0.21853914 -0.78806941
  0.33144348 -0.29990412 -0.26121149 -0.65861397 -0.22771421 -0.30817866
 -1.65371003 -0.52256453  0.3242577   0.48413643 -0.23213804 -0.89732991
 -0.10198301  1.19259531  0.83455859  0.90904943  0.40933952 -1.22993366
  0.24259977  0.99966029 -2.03361107  0.54586261  1.51129443  1.11514888
  1.38356981 -0.31188875  0.35826367  0.28131472 -0.34615435 -0.53410088
  0.06221348  0.67561666 -0.01961437 -0.27392226 -1.49538253 -0.62687008
 -0.70066072  0.33942782  0.08098166 -0.14730184 -0.51601347  0.65913079
 -0.19285041  0.61625509  1.26309728 -0.47064715 -0.1737989   0.00784422
 -0.15227795 -0.3568840

In [16]:
nb = GaussianNB()
nb = train_test(nb, X_train_scaled, y_train_1, X_eval_scaled, y_eval_1)

Training set size:  (1966636, 100)  Evaluation set size:  (491659, 100)
Metrics on evaluation set: 
Accuracy:  0.6260741692921313
F1 score:  0.6828372885800447
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6255956872547843
F1 score:  0.682921703751583
Confusion matrix: 
[[109911 138399]
 [ 45445 197904]]


In [17]:
# as of now this is definitely worse than the LSI on tfidf
# let's build a better feature matrix which includes the ngrams
# instead of doing it retroactively we should go back and extract phrases
# we then join bi, tri and quadgrams from the phrases
# and feed them back to the fasttext model with something like _ to separate the words
# we can then actually expect vectors that should be close if they are similar, as phrases

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
df_train['tweet'] = df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))
phrases = Phrases(df_train['tweet'], min_count=5, threshold=10)
bigram = Phraser(phrases)
print(bigram[df_train['tweet'][0]])
print(bigram[df_train['tweet'][1]])

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
<gensim.interfaces.TransformedCorpus object at 0x16c043df0>
<gensim.interfaces.TransformedCorpus object at 0x16c041960>


In [23]:
print(df_train['tweet'][0])
# revert them back to strings

df_train['tweet'] = df_train['tweet'].apply(lambda x: ' '.join(x))
print(df_train['tweet'][0])

# now, let's build the phrases
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
tweets = df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))
phrases = Phrases(tweets, min_count=5, threshold=10)
bigram = Phraser(phrases)
trigram = Phrases(bigram[tweets], min_count=5, threshold=10)
trigram = Phraser(trigram)
quadgram = Phrases(trigram[bigram[tweets]], min_count=5, threshold=10)
quadgram = Phraser(quadgram)

# let's join the various ngrams
print(tweets[0])
print(' '.join(bigram[tweets[0]]))
print(' '.join(trigram[bigram[tweets[0]]]))
print(' '.join(quadgram[trigram[bigram[tweets[0]]]]))

0    <user> i dunno justin read my mention or not ....
0    vinco tresorpack 6 ( difficulty 10 of 10 objec...
Name: tweet, dtype: object
0    < u s e r >   i   d u n n o   j u s t i n   r ...
0    v i n c o   t r e s o r p a c k   6   (   d i ...
Name: tweet, dtype: object
Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
0    [<user>, i, dunno, justin, read, my, mention, ...
0    [vinco, tresorpack, 6, (, difficulty, 10, of, ...
Name: tweet, dtype: object


TypeError: sequence item 0: expected str instance, list found

In [30]:
tweets_ = tweets.apply(lambda x: ' '.join(x))
print(' '.join(bigram[tweets_[0]]))
print(' '.join(trigram[bigram[tweets_[0]]]))
print(' '.join(quadgram[trigram[bigram[tweets_[0]]]]))
print(bigram[tweets_[0]])
print(len(bigram[tweets_[0]]))
print(trigram[bigram[tweets_[0]]])
print(len(trigram[bigram[tweets_[0]]]))
print(quadgram[trigram[bigram[tweets_[0]]]])
print(len(quadgram[trigram[bigram[tweets_[0]]]]))

print(tweets[0])
print(len(tweets[0]))
print(bigram[tweets[0]])
print(len(bigram[tweets[0]]))
print(trigram[bigram[tweets[0]]])
print(len(trigram[bigram[tweets[0]]]))

<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15 vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15 vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15 vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>
['<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15', 'vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo .

In [32]:
bigram = Phrases(tweets, min_count=5, threshold=10)
print(bigram[tweets[0]])
print(len(bigram[tweets[0]]))
txts = [bigram[tweet] for tweet in tweets]
print(txts[0])
print(len(txts[0]))

<gensim.interfaces.TransformedCorpus object at 0x2d7ca63b0>
2
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
26


In [38]:
print(df_train['tweet'][0])
print(tweets[0])
print(txts[0]) # ok nice finnally, we see 'god_knows'
print(len(txts[0]))
print(len(tweets[0]))

0    <user> i dunno justin read my mention or not ....
0    vinco tresorpack 6 ( difficulty 10 of 10 objec...
Name: tweet, dtype: object
0    [<user>, i, dunno, justin, read, my, mention, ...
0    [vinco, tresorpack, 6, (, difficulty, 10, of, ...
Name: tweet, dtype: object
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
26
2


In [39]:
#unigram = [w for w in df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))]
bigram_ = Phrases(df_train['tweet'])
print(bigram_[df_train['tweet'][0]])

['<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15', 'vinco tresorpack 6 ( difficulty 10 of 10 object : disassemble and reassemble the wooden pieces this beautiful wo ... <url>']


In [41]:
bigram.corpus_word_count
bigram.vocab

{'<user>': 1605564,
 'i': 993728,
 '<user>_i': 130014,
 'dunno': 1585,
 'i_dunno': 908,
 'justin': 7326,
 'dunno_justin': 3,
 'read': 9545,
 'justin_read': 4,
 'my': 401454,
 'read_my': 977,
 'mention': 3158,
 'my_mention': 135,
 'or': 48578,
 'mention_or': 42,
 'not': 123274,
 'or_not': 1552,
 '.': 741613,
 'not_.': 992,
 'only': 42126,
 '._only': 807,
 'only_justin': 20,
 'and': 456724,
 'justin_and': 876,
 'god': 16044,
 'and_god': 260,
 'knows': 4329,
 'god_knows': 194,
 'about': 64390,
 'knows_about': 57,
 'that': 201425,
 'about_that': 2167,
 ',': 745845,
 'that_,': 2344,
 'but': 157940,
 ',_but': 28853,
 'but_i': 31660,
 'hope': 36913,
 'i_hope': 14351,
 'you': 601241,
 'hope_you': 8696,
 'will': 82101,
 'you_will': 5997,
 'follow': 92157,
 'will_follow': 1840,
 'me': 342534,
 'follow_me': 33612,
 '#believe': 1057,
 'me_#believe': 39,
 '15': 5627,
 '#believe_15': 2,
 'because': 26275,
 'your': 138981,
 'because_your': 423,
 'logic': 795,
 'your_logic': 6,
 'is': 311182,
 'logic_

In [42]:
bigram_.corpus_word_count

189894680

In [43]:
bigram_.vocab

{'<': 2220023,
 'u': 6352276,
 '<_u': 2132461,
 's': 9423600,
 'u_s': 2132379,
 'e': 15722332,
 's_e': 2352706,
 'r': 9004658,
 'e_r': 3202920,
 '>': 2177463,
 'r_>': 1605608,
 ' ': 36855599,
 '>_ ': 1635158,
 'i': 9326672,
 ' _i': 2503335,
 'i_ ': 1116302,
 'd': 4491249,
 ' _d': 978049,
 'd_u': 71340,
 'n': 8105634,
 'u_n': 319824,
 'n_n': 186712,
 'o': 11239805,
 'n_o': 612469,
 'o_ ': 1638143,
 'j': 386471,
 ' _j': 285612,
 'j_u': 195989,
 't': 11052088,
 's_t': 1047468,
 't_i': 716456,
 'i_n': 2146456,
 'n_ ': 1769823,
 ' _r': 662124,
 'r_e': 1420409,
 'a': 10585680,
 'e_a': 927614,
 'a_d': 401356,
 'd_ ': 2034462,
 'm': 4222840,
 ' _m': 1730630,
 'y': 3869942,
 'm_y': 443504,
 'y_ ': 2049492,
 'm_e': 1168077,
 'e_n': 907948,
 'n_t': 589392,
 'i_o': 292769,
 'o_n': 1394011,
 ' _o': 1234596,
 'o_r': 1100030,
 'r_ ': 1550022,
 ' _n': 809412,
 'o_t': 484825,
 't_ ': 3343784,
 '.': 2618227,
 ' _.': 1391286,
 '._ ': 1175769,
 'l': 6697549,
 'n_l': 66339,
 'l_y': 368754,
 ' _a': 2376356,

# start here - tokenization and word embedding

In [5]:
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)
from gensim.models import Phrases
tkns = df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))
bigram = Phrases(tkns)
bigram.save('data/grams/bigram.model')
#ok so we want the first version, bigram from tokenized tweets
# we should then add it to the tokenized tweets
print(bigram[tkns[0]])
print(bigram[tkns[1]])

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
<gensim.interfaces.TransformedCorpus object at 0x371c87220>
<gensim.interfaces.TransformedCorpus object at 0x371c731c0>


In [10]:
print(bigram.corpus_word_count)
#print(bigram.vocab) - same here
tkns_bigram = [bigram[tweet] for tweet in tkns]
#print(tkns_bigram[0])
#print(len(tkns_bigram[0])) - same, huge output!
#print(tweets) - I mitakenly left this running, its been printing for 50min lol

len(tkns_bigram[0])
len(tkns_bigram)
len(tkns_bigram[1])

39285957


2458295

In [16]:
print(len(tkns_bigram[1]))
print(len(tkns_bigram[0]))
print(len(tkns_bigram[2]))
print(len(tkns_bigram))
print(bigram[tkns[0]])
print(tkns_bigram[0])
print(tkns[0])

21
26
13
2458295
<gensim.interfaces.TransformedCorpus object at 0x32a43c040>
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
0    [<user>, i, dunno, justin, read, my, mention, ...
0    [vinco, tresorpack, 6, (, difficulty, 10, of, ...
Name: tweet, dtype: object


In [32]:
print(tkns[0])
print(tkns_bigram[0])
print(tkns[1])
print(tkns_bigram[1])
print(len(tkns[0]))
print(len(tkns_bigram[0]))
print(len(tkns[1]))
print(len(tkns_bigram[1]))
print(df_train['tweet'][0])
print(df_train['tweet'][1])

# why are they twice per tweet? are we loding them badly?
df_train.info

0    [<user>, i, dunno, justin, read, my, mention, ...
0    [vinco, tresorpack, 6, (, difficulty, 10, of, ...
Name: tweet, dtype: object
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
1    [because, your, logic, is, so, dumb, ,, i, won...
1    [glad, i, dot, have, taks, tomorrow, !, !, #th...
Name: tweet, dtype: object
['because', 'your', 'logic', 'is', 'so', 'dumb', ',', 'i', "won't", 'even', 'crop', 'out', 'your', 'name', 'or', 'your', 'photo', '.', 'tsk', '.', '<url>']
2
26
2
21
0    <user> i dunno justin read my mention or not ....
0    vinco tresorpack 6 ( difficulty 10 of 10 objec...
Name: tweet, dtype: object
1    because your logic is so dumb , i won't even c...
1    glad i dot have taks tomorrow ! ! #thankful #s...
Name: tweet, dtype: object


<bound method DataFrame.info of                                                      tweet  label
0        <user> i dunno justin read my mention or not ....      1
1        because your logic is so dumb , i won't even c...      1
2         <user> just put casper in a box !  looved the...      1
3        <user> <user> thanks sir > > don't trip lil ma...      1
4        visiting my brother tmr is the bestest birthda...      1
...                                                    ...    ...
1239635  im so sorry ! <user> & to <user> & <user> u gu...      0
1239636                i can't find food coloring anywhere      0
1239637  <user> same here ! ! but tort ! ! wonder why y...      0
1239638  keyless entry remote fob clicker for 2005 buic...      0
1239639  <user> yeap . doctor don't know what's wrong w...      0

[2458295 rows x 2 columns]>

In [36]:
print(df_train.head())
print(df_train['tweet'][0])
# I think we might be loading the data in a weird way.


                                               tweet  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2   <user> just put casper in a box !  looved the...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
0    <user> i dunno justin read my mention or not ....
0    vinco tresorpack 6 ( difficulty 10 of 10 objec...
Name: tweet, dtype: object


In [40]:
# I added 'ignore index=True' in the append function, oh my god we were
# mixing the tweets, half of the phrase was from positive, half from negative. Jesus.
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
print(df_train.head())
print(df_train['tweet'][0])
# holy, indeed we were literally mixing them up in the worst way possible


Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
                                               tweet  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2   <user> just put casper in a box !  looved the...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15


In [41]:
tkns = df_train['tweet'].apply(lambda x: tokenizer.tokenize(x))
bigram = Phrases(tkns)
bigram.save('data/grams/bigram.model')
tkns_bigram = [bigram[tweet] for tweet in tkns]
print(tkns_bigram[0])
print(bigram[tkns[0]])
print(tkns[0])
print(df_train['tweet'][0])

['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god_knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
['<user>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#believe', '15']
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15


In [43]:
trigram = Phrases(tkns_bigram)
trigram.save('data/grams/trigram.model')
tkns_trigram = [trigram[tweet] for tweet in tkns_bigram]
quadgram = Phrases(tkns_trigram)
quadgram.save('data/grams/quadgram.model')
tkns_quadgram = [quadgram[tweet] for tweet in tkns_trigram]
print(trigram.corpus_word_count)
print(len(trigram.vocab))
print(quadgram.corpus_word_count)
print(len(quadgram.vocab))

37007683
6280053
35728076
6563097


In [82]:
# in theory they really should include the bigram and trigram and unigram as well
# at this point, so we should have something similar to the (1,4) ngram in vectorizer of sklearn
# let's now use this new vocabulary to train a fasttext model, with many more features like 300
# or even 500 (maybe now let's keep it at 300) or even test with just 25 - 100
# this should be a very dense matrix, so we should be able to use the same model as before
# and now it would actually make sense to average the vectors of each word
# we can also use the vectors of the phrases up to quadgrams since we should have them
# if indeed they are considered worthy of being bigrams and trigrams etc
print(len(tkns_quadgram))
print(len(tkns_quadgram[0]))
print(len(tkns_quadgram[1]))
print(len(tkns_quadgram[2]))
print(len(tkns_trigram[0]))
print(len(tkns_trigram[1]))
print(len(tkns_trigram[2]))
print(len(tkns_bigram[0]))
print(len(tkns_bigram[1]))
print(len(tkns_bigram[2]))
print(len(tkns[0]))
print(len(tkns[1]))
print(len(tkns[2]))
# let's save the bigram, trigram and quadgram models
from gensim.models import FastText
fasttext = FastText(
    vector_size=100, 
    window=5, 
    min_n=3,
    max_n=6,
    sg=0,
    )

# at this stage i want to make sure the tri and quadgram are actually working
# so i want to see if they are actually in the vocabulary

#let's query the vocabulary
print(trigram.vocab.get('i_love'))
print(trigram.vocab.get('i_love_you'))
print(trigram.vocab.get('only_god_knows')) # gotcha!
# ok seems like they are working, let's now train the model
# we can actually export the phrases at each stage!
#print(len(trigram.export_phrases()))
#print(len(quadgram.export_phrases()))

# let's save theses phrases to dataframe to then export them to csv
# dct_bigrams = dict(bigram.export_phrases())
# dct_trigrams = dict(trigram.export_phrases())
# dct_quadgrams = dict(quadgram.export_phrases())

# why are they not working? they are empty
# dct_bigrams = dict()
# dct_trigrams = dict()
# dct_quadgrams = dict()
dct_bigrams = bigram.export_phrases()
dct_trigrams = trigram.export_phrases()
dct_quadgrams = quadgram.export_phrases()

# dct_bigrams = {k:[v] for k,v in bigram.export_phrases().items()}
# dct_trigrams = {k:[v] for k,v in trigram.export_phrases().items()}
# dct_quadgrams = {k:[v] for k,v in quadgram.export_phrases().items()}
print(dct_bigrams.get('i_love'))
print(dct_trigrams.get('i_love_you'))
print(dct_quadgrams.get('only_god_knows'))
print(dct_trigrams.get('only_god_knows'))
print(dct_quadgrams.get('i_love'))

df_bigrams = pd.DataFrame(dct_bigrams.items(), columns=['bigram', 'score'])
df_trigrams = pd.DataFrame(dct_trigrams.items(), columns=['trigram', 'score'])
df_quadgrams = pd.DataFrame(dct_quadgrams.items(), columns=['quadgram', 'score'])
print(df_bigrams.head())
print(df_trigrams.head())
print(df_quadgrams.head())
# incredible I can't put the dictionary into a dataframe nor in a dictionary?????

dictionary = {'halo': 1, 'halo2': 2, 'halo3': 3}
df = pd.DataFrame(dictionary.items(), columns=['w', 'v'])
print(df.head())

#df_bigram = pd.DataFrame(bigram.export_phrases(), columns=['bigram', 'score'])
#df_trigram = pd.DataFrame(trigram.export_phrases(), columns=['trigram', 'score'])

# finally! let's save them in csv
df_bigrams.to_csv('data/grams/bigrams_phrases.csv', index=False, sep= ' ')
df_trigrams.to_csv('data/grams/trigrams_phrases.csv', index=False, sep= ' ')
df_quadgrams.to_csv('data/grams/quadgrams_phrases.csv', index=False, sep= ' ')



2458295
26
21
13
26
21
13
26
21
13
27
21
13
50621
None
38
None
None
None
None
None
          bigram      score
0      god_knows  15.648932
1            >_>  48.607685
2       lil_mama  22.311112
3       ya_thang  10.728545
4  birthday_gift  12.794833
         trigram      score
0      god_knows  69.396746
1            >_>  82.314324
2       lil_mama  36.256913
3       ya_thang  19.838160
4  birthday_gift  49.708501
          quadgram      score
0        god_knows  43.859900
1              >_>  44.844841
2         lil_mama  40.265468
3    birthday_gift  49.967300
4  becoming_famous  46.053912
       w  v
0   halo  1
1  halo2  2
2  halo3  3


# idea for v5:
The big remaining annoying thing to this moment is the following:
tfidf does not givemmuch semantic meaning, but appllying it with ngrams it does, furthermore the big 'problems' with having this huge matrix are actually nullified by the optimizations of numpy and the various libraries we use. We won't really have to deal with a 8digit feature vector since it's a huge sparse matrix. Usually the non-zero values will be a few tens, this including 1 to 4 grams!

Embeddings might be useful for more difficult tasks, but here we are really trying to divide the space in binary, not much. Furthermore everything falls the second we try to just 'average' the vectors over the sentences.

Idea: let's leverage both methods. We have the luxury of having enough data and nowadays there are very complex models that can learn to ignore the zero-values in various features. Let's encode each word and the various grams with embeddings that are trained on the dataset such that themselves they even retain notion of similarities. Instead of alining them in the same space or 'squashing them' by averaging for every tweet, let's reduce the dimensionality and add them side by side.
We leverage that twitter has a maximum number of characters per tweet, and more simply we can find an optimal balance. We recover the property of sparce matrices while maintaining the extra meaning given by the embeddings.

The remaining element would be to normalize based on our own dataset. That is, our embeddings might have learned that 'love' is similar to 'iloveuyou', but we could waight them exactly in the same way that we would go from BOW to tfidf.


It's fundamentally a different dimension than before:
- before we had a very large, sparse vector, with features = vocabulary
- now we have embeddings to encode each word in vocabulary, and features = embeddings of each word in the tweet

By weighting the embeddings we can recover some of the fundamental info carried by the tfidf: each data point carries the info about its own words in comparison to the other words in the dataset. This is a very powerful property.

The model will hopefully be able to learn that 'first 20 features = first word of the tweet' and so on.
Technically it should have all the info to know which word it is as well as 'the meaning' of the word, 'its context' and so on, given that the embeddings are trained to capture this info.

Let's see if this makes sense. the only final problem would be that tfidf can allow itself to just throw extra features for n-grams, while here dimensionality will be a problem so we can try different strategies. We built a model of embeddings that doesn't just randomly pick all grams, but decides which are worth bundling and which not. This handles word-grams. Using Fasttext we handle semantic meaning of grams inside the words themselves!

Tying everything together we should finally have all the info necessary to split the ata in a meaningful way.

In [86]:
# let's put it together with the grams and fasttext
from gensim.models import FastText
fasttext = FastText(
    vector_size=100,
    window=5,
    min_n=3,
    max_n=10,
    sg=0,
)
fasttext.build_vocab(corpus_iterable=tkns_quadgram)
fasttext.train(corpus_iterable=tkns_quadgram, total_examples=fasttext.corpus_count, epochs=fasttext.epochs)
print(fasttext.wv.most_similar('i_love'))
print(fasttext.wv.most_similar('i_love_you'))



[('#1love', 0.960010290145874), ('jlove', 0.9505100250244141), ('llove', 0.9482436776161194), ('i.love', 0.9428744912147522), ("m'love", 0.9425817728042603), ('1love', 0.941887617111206), ('#alllove', 0.9286666512489319), ('yveslove', 0.9282400012016296), ('#mylove', 0.9260808825492859), ('#bfflove', 0.91941899061203)]
[('?_love_youuu_xxx', 0.9408364295959473), ('love_you.let', 0.9395909309387207), ('love_youuu_xxx', 0.9277811646461487), ('love_your_icon', 0.9266449213027954), ('love_youuu', 0.9185110330581665), ('ya_love_your_#britishmahomies', 0.9147262573242188), ('love_yooou', 0.9137768745422363), ('love_youte_amo.belieber_forever', 0.9066047072410583), ("you_love_niall's_laugh", 0.9031429290771484), ('looove_youuu', 0.8993458151817322)]


In [87]:
fasttext.save('data/fasttext/fasttext_100_4grams.model')

In [88]:
fasttext = FastText(
    vector_size=20,
    window=5,
    min_n=3,
    max_n=10,
    sg=0,
)
fasttext.build_vocab(corpus_iterable=tkns_quadgram)
fasttext.train(corpus_iterable=tkns_quadgram, total_examples=fasttext.corpus_count, epochs=fasttext.epochs)
print(fasttext.wv.most_similar('i_love'))

fasttext.save('data/fasttext/fasttext_20_4grams.model')

[('#1love', 0.9798915982246399), ('jlove', 0.9722762107849121), ('llove', 0.966873824596405), ("m'love", 0.9620018601417542), ('te_amooo_love', 0.9604809880256653), ('smilers_love', 0.9565942883491516), ('i.love', 0.9565492272377014), ('1love', 0.9565232396125793), ('#alllove', 0.9555274844169617), ('#mylove', 0.9511960744857788)]


In [89]:
fasttext = FastText(
    vector_size=300,
    window=5,
    min_n=3,
    max_n=10,
    sg=0,
)
fasttext.build_vocab(corpus_iterable=tkns_quadgram)
fasttext.train(corpus_iterable=tkns_quadgram, total_examples=fasttext.corpus_count, epochs=fasttext.epochs)
print(fasttext.wv.most_similar('i_love'))

fasttext.save('data/fasttext/fasttext_300_4grams.model')

[('#1love', 0.9569266438484192), ('jlove', 0.9501928091049194), ('llove', 0.9441788792610168), ("m'love", 0.9438387155532837), ('i.love', 0.9435019493103027), ('1love', 0.9431397318840027), ('#mylove', 0.9317726492881775), ('#alllove', 0.9172578454017639), ('#bfflove', 0.9172359108924866), ('yveslove', 0.9163663983345032)]


In [9]:
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_20_4grams.model')
df_train = load_train_data()
# Great, let's see what we can do with the fasttext model
# we could even do some crazy approssimation like entire tweets 
print(fasttext.wv.get_sentence_vector(df_train['tweet'][0]).shape)
print(df_train['tweet'][0])
 # oh ok there's a built in method?? nice

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
(20,)
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15


In [10]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)

df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x)).apply(lambda x: ' '.join(x))
print(df_train['tweet'][0])
X_train_vec = np.zeros((df_train.shape[0], 20))
for i, tweet in enumerate(df_train['tweet']):
    X_train_vec[i] = fasttext.wv.get_sentence_vector(tweet)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_train_vec, df_train['label'], test_size=0.2)

<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15


In [11]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
#nb = train_test(nb, X_train, y_train, X_eval, y_eval)
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)
# multinomialNB can't handle negative 


Training set size:  (1966636, 20)  Evaluation set size:  (491659, 20)
Metrics on evaluation set: 
Accuracy:  0.645256163316445
F1 score:  0.6456151187527557
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6451626025354972
F1 score:  0.6441819768959666
Confusion matrix: 
[[158374  88739]
 [ 85674 158872]]


In [13]:
# uhuhuh now its time for trees and forests
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
y_train = y_train.apply(lambda x: -1 if x == 0 else x)
y_eval = y_eval.apply(lambda x: -1 if x == 0 else x)

lil_tree = DecisionTreeClassifier(
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
lil_tree = train_test(lil_tree, X_train, y_train, X_eval, y_eval)

Training set size:  (1966636, 20)  Evaluation set size:  (491659, 20)
Metrics on evaluation set: 
Accuracy:  0.6264931588763757
F1 score:  0.6233761972148733
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9985635369229486
F1 score:  0.9985499770306095
Confusion matrix: 
[[156045  91068]
 [ 92570 151976]]


In [14]:
# noice, lets boost this lil tree
xgb = XGBClassifier()
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)



Training set size:  (1966636, 20)  Evaluation set size:  (491659, 20)
Metrics on evaluation set: 
Accuracy:  0.6717847125751791
F1 score:  0.6801488951787455
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6794490693753191
F1 score:  0.6865292097928699
Confusion matrix: 
[[158716  88397]
 [ 72973 171573]]


In [15]:
print(xgb.get_params())

{'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.300000012, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 100, 'n_jobs': 10, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None}


In [16]:

y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=10,
    n_estimators=100,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (1966636, 20)  Evaluation set size:  (491659, 20)
Metrics on evaluation set: 
Accuracy:  0.6824384380231013
F1 score:  0.6918017512968914
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7420539438920064
F1 score:  0.7484386700057029
Confusion matrix: 
[[160295  86818]
 [ 69314 175232]]


In [17]:
#honestly since we are using 20 feautures this is prretty nice
# lets work on small dataset to iterate faster
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x)).apply(lambda x: ' '.join(x))
print(df_train['tweet'][0])
X_train_vec = np.zeros((df_train.shape[0], 300))
for i, tweet in enumerate(df_train['tweet']):
    X_train_vec[i] = fasttext.wv.get_sentence_vector(tweet)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_train_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15


In [18]:
y_train = y_train.apply(lambda x: -1 if x != 1 else x)
y_eval = y_eval.apply(lambda x: -1 if x != 1 else x)
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 300)  Evaluation set size:  (39394, 300)
Metrics on evaluation set: 
Accuracy:  0.6849774077270651
F1 score:  0.6978182526541347
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6815949129309031
F1 score:  0.6958714456304971
Confusion matrix: 
[[12655  7225]
 [ 5185 14329]]


In [20]:
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=15,
    n_estimators=100,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 300)  Evaluation set size:  (39394, 300)
Metrics on evaluation set: 
Accuracy:  0.7133065949129309
F1 score:  0.7238225656575537
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9999175001269229
F1 score:  0.9999170807314754
Confusion matrix: 
[[13300  6580]
 [ 4714 14800]]


In [21]:
xgb = XGBClassifier(
    max_depth=10,
    n_estimators=150,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 300)  Evaluation set size:  (39394, 300)
Metrics on evaluation set: 
Accuracy:  0.7096512159212063
F1 score:  0.718677750996114
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9999048078387571
F1 score:  0.9999043263619142
Confusion matrix: 
[[13346  6534]
 [ 4904 14610]]


In [22]:
# I think we are reachin the limit on the info that a 300 dim vector can give us
# let's see if the strategy mentioned could work
df_train['tweet'].apply(lambda x: len(x.split(' '))).describe()

count    196970.000000
mean         15.957329
std          14.567898
min           1.000000
25%          10.000000
50%          15.000000
75%          22.000000
max        1737.000000
Name: tweet, dtype: float64

In [24]:
# where the hell is a 1737 words tweet??
df_train_eplore = df_train.copy()
df_train_eplore['tweet'] = df_train_eplore['tweet'].apply(lambda x: tknzr.tokenize(x))
df_train_eplore['tweet_len'] = df_train_eplore['tweet'].apply(lambda x: len(x))
df_train_eplore

Unnamed: 0,tweet,label,tweet_len
0,"[<user>, i, dunno, justin, read, my, mention, ...",1,27
1,"[because, your, logic, is, so, dumb, ,, i, won...",1,21
2,"[<user>, just, put, casper, in, a, box, !, loo...",1,13
3,"[<user>, <user>, thanks, sir, >, >, don't, tri...",1,17
4,"[visiting, my, brother, tmr, is, the, bestest,...",1,13
...,...,...,...
196965,"[can't, wait, to, fake, tan, tonight, !, hate,...",-1,10
196966,"[<user>, darling, i, lost, my, internet, conne...",-1,33
196967,"[kanguru, defender, basic, 4, gb, usb, 2.0, fl...",-1,23
196968,"[rizan, is, sad, now]",-1,4


In [26]:
df_train_eplore.describe()

Unnamed: 0,label,tweet_len
count,196970.0,196970.0
mean,-0.00592,15.952135
std,0.999985,14.561441
min,-1.0,1.0
25%,-1.0,10.0
50%,-1.0,15.0
75%,1.0,22.0
max,1.0,1737.0


In [28]:
# let's see outliers
lon_tweets = df_train_eplore[df_train_eplore['tweet_len'] > 100]
lon_tweets

Unnamed: 0,tweet,label,tweet_len
492,"[<user>, happy, #thongthursday, tweetybirds, <...",1,215
866,"[<user>, babe, came, over, to, comfort, me, sh...",1,165
1084,"[<user>, today, is, national, stalking, awaren...",1,118
1379,"[<user>, looks, like, chicken, night, ., so, v...",1,600
6198,"[<user>, rt, if, you, get, it, <url>, this, is...",1,172
...,...,...,...
180407,"[<user>, what, up, bubbles, !, miss, ko, na, s...",-1,205
183478,"[<user>, <url>, -, italy, probing, 51, mn, usd...",-1,159
183609,"[<user>, recovery, money, redirected, to, supe...",-1,491
188539,"[<user>, directionator, .., rt, when, you, see...",-1,108


In [29]:
lon_tweets.describe()

Unnamed: 0,label,tweet_len
count,110.0,110.0
mean,0.2,431.136364
std,0.98428,334.756586
min,-1.0,102.0
25%,-1.0,177.25
50%,1.0,327.0
75%,1.0,570.5
max,1.0,1737.0


In [31]:
lon_tweets.sort_values(by='tweet_len', ascending=False).head(10)

Unnamed: 0,tweet,label,tweet_len
16066,"[<user>, me, &, my, mama, <url>, her, thick, a...",1,1737
87657,"[<user>, now, following, ), follow, back, .., ...",1,1628
48645,"[<user>, <url>, damn, that's, a, sexc, ass, pi...",1,1527
30139,"[<user>, #ff, to, the, amazing, <user>, becky,...",1,1300
90586,"[<user>, u, cab, come, get, it, from, me, ill,...",1,1156
63673,"[<user>, <user>, pretty, white, girl, tried, t...",1,1083
86731,"[<user>, hello, ,, ust, ., i, could, get, used...",1,1031
27204,"[<user>, ahhahaahhaha, loools, i, put, my, pho...",1,1025
163499,"[<user>, that's, a, wrap, !, end, of, an, era,...",-1,977
81963,"[<user>, i, been, thinking, bout, yah, .., <ur...",1,922


In [32]:
lon_tweets['tweet'] = lon_tweets['tweet'].apply(lambda x: ' '.join(x))
lon_tweets.sort_values(by='tweet_len', ascending=False).to_csv('data/out/lon_tweets.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lon_tweets['tweet'] = lon_tweets['tweet'].apply(lambda x: ' '.join(x))


In [33]:
# lets see if theres more
df_train_eplore['tweet'] = df_train_eplore['tweet'].apply(lambda x: ' '.join(x))
lon_tweets = df_train_eplore[df_train_eplore['tweet_len'] > 50].sort_values(by='tweet_len', ascending=False)
lon_tweets.describe()

Unnamed: 0,label,tweet_len
count,138.0,138.0
mean,0.086957,356.644928
std,0.999841,333.40834
min,-1.0,51.0
25%,-1.0,114.25
50%,1.0,223.5
75%,1.0,527.5
max,1.0,1737.0


In [34]:
# great, i say we just drop all tweeets longer than 40 words
lon_tweets = df_train_eplore[df_train_eplore['tweet_len'] > 40].sort_values(by='tweet_len', ascending=False)
lon_tweets.describe()

Unnamed: 0,label,tweet_len
count,222.0,222.0
mean,0.207207,237.761261
std,0.980508,303.703984
min,-1.0,41.0
25%,-1.0,42.0
50%,1.0,95.0
75%,1.0,323.25
max,1.0,1737.0


In [35]:
df_train_eplore[df_train_eplore['tweet_len'] <= 40].describe()

Unnamed: 0,label,tweet_len
count,196748.0,196748.0
mean,-0.00616,15.701857
std,0.999984,7.286423
min,-1.0,1.0
25%,-1.0,10.0
50%,-1.0,15.0
75%,1.0,22.0
max,1.0,40.0


In [39]:
lon_tweets = df_train_eplore[df_train_eplore['tweet_len'] > 40].sort_values(by='tweet_len', ascending=False)
lon_tweets.describe()

Unnamed: 0,label,tweet_len
count,222.0,222.0
mean,0.207207,237.761261
std,0.980508,303.703984
min,-1.0,41.0
25%,-1.0,42.0
50%,1.0,95.0
75%,1.0,323.25
max,1.0,1737.0


In [40]:
# 40 words is a good cut off point
# lets see if we can get a better model with this
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x)).apply(lambda x: ' '.join(x))
df_train = df_train[df_train['tweet'].apply(lambda x: len(x.split(' '))) <= 40]
X_train_vec = np.zeros((df_train.shape[0], 300))
for i, tweet in enumerate(df_train['tweet']):
    X_train_vec[i] = fasttext.wv.get_sentence_vector(tweet)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_train_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)


In [41]:
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157398, 300)  Evaluation set size:  (39350, 300)
Metrics on evaluation set: 
Accuracy:  0.6818297331639136
F1 score:  0.6966760344994669
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6811458849540655
F1 score:  0.6942743836692923
Confusion matrix: 
[[12452  7323]
 [ 5197 14378]]


In [43]:
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=6,
    n_estimators=150,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157398, 300)  Evaluation set size:  (39350, 300)
Metrics on evaluation set: 
Accuracy:  0.7122236340533672
F1 score:  0.7228992316351002
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8413258110014105
F1 score:  0.8459293897014823
Confusion matrix: 
[[13255  6520]
 [ 4804 14771]]


In [45]:
# and now lets try to use the strategy mentioned
# we will use the first 30 words of the tweet to predict the label
# lets encode them with the 20 dim fasttext model to check if it works - 30*20 = 600 dim with lots of sparse vectors
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
X_eval_vec = np.zeros((df_train.shape[0], 600))
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_20_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j*20:j*20+20] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


In [46]:
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 600)  Evaluation set size:  (39394, 600)
Metrics on evaluation set: 
Accuracy:  0.5950144692085089
F1 score:  0.49313762866946254
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.5935104330608723
F1 score:  0.49343192692474996
Confusion matrix: 
[[15679  4236]
 [11718  7761]]




In [47]:
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=6,
    n_estimators=150,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 600)  Evaluation set size:  (39394, 600)
Metrics on evaluation set: 
Accuracy:  0.7784941869320201
F1 score:  0.7861169665179666
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8534421485505407
F1 score:  0.859684298786045
Confusion matrix: 
[[14632  5283]
 [ 3443 16036]]


In [48]:
# ok so it makes sense!!!! next: 
# actually tokenize ngrams after the tokenization of words, 
# increaees the dim of the vector to 100 per word

df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,

)

df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
# recover the ngrams we saved and on which we trained the fasttext model
# we saved a phrase object with the ngrams
from gensim.models.phrases import Phrases
quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
X_eval_vec = np.zeros((df_train.shape[0], 600))
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_20_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j*20:j*20+20] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


In [49]:
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=6,
    n_estimators=150,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 600)  Evaluation set size:  (39394, 600)
Metrics on evaluation set: 
Accuracy:  0.7742549626846728
F1 score:  0.7827531452302431
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8566596436005484
F1 score:  0.861798268424756
Confusion matrix: 
[[14480  5342]
 [ 3551 16021]]


In [50]:
# and now augment to 100 dim * 30 words = 3000 dim
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,

)

df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
# recover the ngrams we saved and on which we trained the fasttext model
# we saved a phrase object with the ngrams
from gensim.models.phrases import Phrases
quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
X_eval_vec = np.zeros((df_train.shape[0], 3000))
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_100_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j*100:j*100+100] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


In [51]:
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    max_depth=6,
    n_estimators=150,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 3000)  Evaluation set size:  (39394, 3000)
Metrics on evaluation set: 
Accuracy:  0.792049550692999
F1 score:  0.79853425802961
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8927057419911661
F1 score:  0.8953301924755612
Confusion matrix: 
[[14967  4802]
 [ 3390 16235]]


In [52]:
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)



Training set size:  (157576, 3000)  Evaluation set size:  (39394, 3000)
Metrics on evaluation set: 
Accuracy:  0.6909173985886176
F1 score:  0.6634790779945829
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6966796974158501
F1 score:  0.6684241196547993
Confusion matrix: 
[[15215  4554]
 [ 7622 12003]]


In [53]:
linsvc = LinearSVC(
    C=0.01,
)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)



Training set size:  (157576, 3000)  Evaluation set size:  (39394, 3000)
Metrics on evaluation set: 
Accuracy:  0.7721226582728334
F1 score:  0.7859612312534275
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7827587957556988
F1 score:  0.7951798578369194
Confusion matrix: 
[[13935  5834]
 [ 3143 16482]]


So, out strategy makes sense but we need to implement the last pieces: adding the tfidf info to the embeddings, and then training a model on top of this.

This should translate in adding a weight to each embedding that we add to the feature vector.

In [54]:
lil_tree = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)
lil_tree = train_test(lil_tree, X_train, y_train, X_eval, y_eval)

Training set size:  (157576, 3000)  Evaluation set size:  (39394, 3000)
Metrics on evaluation set: 
Accuracy:  0.6724120424430117
F1 score:  0.6697714885232477
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9999365385591714
F1 score:  0.9999361242766074
Confusion matrix: 
[[13402  6367]
 [ 6538 13087]]


In [None]:
# and now augment to 100 dim * 30 words = 3000 dim
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,

)

df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
# recover the ngrams we saved and on which we trained the fasttext model
# we saved a phrase object with the ngrams
from gensim.models.phrases import Phrases
# tfidf = TfidfVectorizer()
# from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
# tfidf = TfidfVectorizer(
#     ngram_range=(1, 1),
#     min_df=5,
#     tokenizer=lambda x: quadgrams[tknzr.tokenize(x)],
# )
# # compute tfidf values, want to use them as weights for the fasttext vectors
# tfidf.fit(df_train['tweet'])
# # check api again of this to make sure we have access, we are concucting a potion here

quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
df_train = df_train[df_train['tweet'].apply(lambda x: len(x.split(' '))) <= 40] # clean outliers already
X_eval_vec = np.zeros((df_train.shape[0], 3000))
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_100_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j*100:j*100+100] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)