In [38]:
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'], on_bad_lines='skip')
    df_train_neg['label'] = -1
    df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train   

def load_test_data():
    # Load test data: id, tweet for each row
    data_path = 'data/twitter-datasets/'
    df_test = pd.read_csv(data_path + 'test_data.txt', header=None, names=['line'], sep='\t')
    # Extract id and tweet, limit split by 1 so we don't split the tweet (this is v0, at least we keep it intact)
    df_test['id'] = df_test['line'].apply(lambda x: x.split(',',1)[0]) 
    df_test['tweet'] = df_test['line'].apply(lambda x: x.split(',',1)[1])
    df_test = df_test.drop('line', axis=1)
    return df_test

def predict_test_data(X_test, classifier, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = classifier.predict(X_test)
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x != 1 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test
    
def predict_test_data_pipeline(df_test, pipe, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = pipe.predict(df_test['tweet'])
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x != 1 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test

def train_test(clf, X_train, y_train, X_eval=None, y_eval=None, cv=None):
    from sklearn.metrics import accuracy_score, f1_score
    if X_eval is None:
        from sklearn.model_selection import train_test_split
        X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.2)
    if cv is not None:
        from sklearn.model_selection import cross_val_score
        scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1, shuffle=True)
        print('Cross validation Accuracy Scores: ', scores)
        print('Cross validation mean score: ', scores.mean())
        print('Cross validation std score: ', scores.std())
        clf.fit(X_train, y_train)
        return clf
    clf.fit(X_train, y_train)
    print('Training set size: ', X_train.shape, ' Evaluation set size: ', X_eval.shape)
    print('Metrics on evaluation set: ')
    test(clf, X_eval, y_eval)
    print('Metrics on training set to check overfitting/triviality of model: ')
    test(clf, X_train, y_train)
    from sklearn.metrics import confusion_matrix
    print('Confusion matrix: ')
    print(confusion_matrix(y_eval, clf.predict(X_eval)))
    return clf

def test(clf, X_eval, y_eval):
    from sklearn.metrics import accuracy_score, f1_score
    y_pred = clf.predict(X_eval)
    print('Accuracy: ', accuracy_score(y_eval, y_pred))
    print('F1 score: ', f1_score(y_eval, y_pred))
    return y_pred

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [5]:
# and now augment to 300 dim * 30 words = 9000 dim
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,

)

df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
# recover the ngrams we saved and on which we trained the fasttext model
# we saved a phrase object with the ngrams
from gensim.models.phrases import Phrases
# tfidf = TfidfVectorizer()
# from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
# tfidf = TfidfVectorizer(
#     ngram_range=(1, 1),
#     min_df=5,
#     tokenizer=lambda x: quadgrams[tknzr.tokenize(x)],
# )
# # compute tfidf values, want to use them as weights for the fasttext vectors
# tfidf.fit(df_train['tweet'])
# # check api again of this to make sure we have access, we are concucting a potion here

quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
df_train = df_train[df_train['tweet'].apply(lambda x: len(x)) <= 40] # clean outliers already
X_eval_vec = np.zeros((df_train.shape[0], 9000))
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j*300:j*300+300] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000


In [7]:
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

linsvc = LinearSVC(
    C=0.001,
)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)



Training set size:  (157443, 9000)  Evaluation set size:  (39361, 9000)
Metrics on evaluation set: 
Accuracy:  0.8016310561215416
F1 score:  0.8069524798496761
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8295065515773963
F1 score:  0.8336916452402341
Confusion matrix: 
[[15234  4506]
 [ 3302 16319]]


In [8]:
lil_tree = DecisionTreeClassifier()
lil_tree = train_test(lil_tree, X_train, y_train, X_eval, y_eval)

Training set size:  (157443, 9000)  Evaluation set size:  (39361, 9000)
Metrics on evaluation set: 
Accuracy:  0.6769136962983664
F1 score:  0.6735296382820323
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9999428364550981
F1 score:  0.9999424511954166
Confusion matrix: 
[[13526  6214]
 [ 6503 13118]]


In [9]:
xgb = xgb.XGBClassifier()
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)



Training set size:  (157443, 9000)  Evaluation set size:  (39361, 9000)
Metrics on evaluation set: 
Accuracy:  0.7870734991489038
F1 score:  0.7948196930007099
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8679395082664837
F1 score:  0.8717224189627728
Confusion matrix: 
[[14747  4993]
 [ 3388 16233]]


In [20]:
# Let's check how 'bad' the embeddings are and see what's the problem, let's also explore a bit what's going on and what the embeddings look like
print(X_train.shape)
print(X_eval.shape)
X_train[0]

(157443, 9000)
(39361, 9000)


array([-0.14510264,  0.21560919, -1.00325012, ...,  0.        ,
        0.        ,  0.        ])

In [23]:
# I think the fundamental idea is good: pick the first 30 words of the tweet (or less, padd with zeros the rest), 
# and then take the fasttext embedding of each word and concatenate them to get a 9000 dim vector, then
# train a classifier on that, hoping it will realize that the features are actually the words of the tweet, each of 300 dim.

# But why should the classifier make extra work to understand this substructure? Can't we just tell the model: each feature is a 300dim vector?
# What if we wanted to add other features? Like a tfidf score for each word? Or a sentiment score that we provide?
# There must be a way, why are we still working with a 2d matrix? We should be working with a 3d tensor, where the third dimension is the 300 dim vector of the word.

# Let's try to do this, let's try to make a 3d tensor, where the third dimension is the 300 dim vector of the word.
print(np.zeros((df_train.shape[0], 30, 300)).shape)
print(np.zeros((df_train.shape[0], 30, 300))[0].shape)


(196804, 30, 300)
(30, 300)


In [4]:
# 3D Tensors of shape (num_samples, num_words, num_features)
df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True, # we can play around with this too, but for now we keep everything as with embeddings
    reduce_len=True,

)

# words
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))

# ngrams
from gensim.models.phrases import Phrases
quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])

# clean outliers with too many words
df_train = df_train[df_train['tweet'].apply(lambda x: len(x)) <= 40]

# create 3d tensor
X_eval_vec = np.zeros((df_train.shape[0], 30, 300))

# load fasttext model x embeddings
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')

# fill tensor with embeddings
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    for j, word in enumerate(tweet[:30]):
        X_eval_vec[i][j] = fasttext.wv.get_vector(word)
    iter += 1
    if iter % 10000 == 0:
        print(iter)

# create train and eval sets
from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_eval_vec, df_train['label'], test_size=0.2)
print('Train 3d tensor: ', X_train.shape)
print('Eval 3d tensor: ', X_eval.shape)


Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
Train 3d tensor:  (157443, 30, 300)
Eval 3d tensor:  (39361, 30, 300)


In [26]:
# train model
from sklearn.svm import LinearSVC
linsvc = LinearSVC(
    C=0.001,
)
#linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval) # so, very simply, this doesn't work with 3d tensors...

In [28]:
# What if we have 30 different trees, each of which is trained on a different word of the tweet? 
# We concuct a voting on the 30 trees, and the tree that wins the most votes wins the prediction.
# is there a way of doing this in sklearn?
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# create 30 trees
trees = []
for i in range(30):
    trees.append(DecisionTreeClassifier(
        max_depth=15,
        max_features=1,
        min_samples_leaf=1,
        min_samples_split=2,
        splitter='best',
    ))

# create voting classifier
voting = VotingClassifier(
    estimators=[('tree'+str(i), trees[i]) for i in range(30)],
    voting='soft',
    n_jobs=-1,
    verbose=True,
)

# train voting classifier
#voting = train_test(voting, X_train, y_train, X_eval, y_eval) # we need to train each tree on a different word of the tweet


# We are going into Neural Network territory now, this still does not work with 3d tensors, we can do this by hand, start to play around with interaction terms etc etc
# let's first do a digression and compare this to averaging tho...


In [4]:
# Afterall, we don't need to immediately increase complexity like this, we can think of better ways to average the embeddings first.
# simple averaging:
X_train_avg = np.mean(X_train, axis=1)
X_eval_avg = np.mean(X_eval, axis=1)
print(X_train_avg.shape)

(157443, 300)


In [6]:
from sklearn.svm import LinearSVC
linsvc = LinearSVC(
    C=0.001,
)
linsvc = train_test(linsvc, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.7932471227865145
F1 score:  0.8040075140889168
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7897334273356071
F1 score:  0.800251006136376
Confusion matrix: 
[[14531  5212]
 [ 2926 16692]]


In [31]:
lil_tree = DecisionTreeClassifier()
lil_tree = train_test(lil_tree, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.6861868346840782
F1 score:  0.682337208106162
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9999237819401308
F1 score:  0.999923414683958
Confusion matrix: 
[[13743  6146]
 [ 6206 13266]]


In [32]:
from sklearn.linear_model import LogisticRegression # long time no see buddy, please don't give me problems
logreg = LogisticRegression(
    n_jobs=-1, # ooooooooooo didnt know u were chill like that bro
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.7992429054139885
F1 score:  0.8058381247235736
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7993559573941046
F1 score:  0.8064717702411291
Confusion matrix: 
[[15061  4828]
 [ 3074 16398]]


In [34]:
# gotta admit logreg is pretty good
logreg = LogisticRegression(
    n_jobs=-1, 
    warm_start=True,
    max_iter=1000,
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.8011229389497218
F1 score:  0.8076752985111296
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8012868149107931
F1 score:  0.8081508008535897
Confusion matrix: 
[[15096  4793]
 [ 3035 16437]]


In [35]:
logreg = LogisticRegression(
    n_jobs=-1,
    warm_start=True,
    max_iter=1000,
    C=0.01,
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.7934503696552425
F1 score:  0.8020742039146947
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.790438444389398
F1 score:  0.7997304974870713
Confusion matrix: 
[[14758  5131]
 [ 2999 16473]]


In [36]:
logreg = LogisticRegression(
    n_jobs=-1,
    warm_start=False,
    max_iter=1000,
    C=0.01,
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.7934503696552425
F1 score:  0.8020742039146947
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.790438444389398
F1 score:  0.7997304974870713
Confusion matrix: 
[[14758  5131]
 [ 2999 16473]]


In [39]:
logreg = LogisticRegression(
    n_jobs=-1,
    warm_start=False,
    max_iter=1000,
    C=0.1,
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.8006910393536749
F1 score:  0.8076498712762045
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.799921241338135
F1 score:  0.8072779331061528
Confusion matrix: 
[[15046  4843]
 [ 3002 16470]]


In [40]:
# need more sophisticated logreg, we could augment the data with interaction terms
logreg = LogisticRegression(
    n_jobs=-1,
    warm_start=False,
    max_iter=1000,
    C=1,
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
)
logreg = train_test(logreg, X_train_avg, y_train, X_eval_avg, y_eval)

Training set size:  (157443, 300)  Evaluation set size:  (39361, 300)
Metrics on evaluation set: 
Accuracy:  0.8011737506669038
F1 score:  0.8077055383556933
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8012741119008149
F1 score:  0.8081267707553996
Confusion matrix: 
[[15099  4790]
 [ 3036 16436]]


In [7]:
# augment data with interaction terms
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_train_avg_aug = poly.fit_transform(X_train_avg)
X_eval_avg_aug = poly.fit_transform(X_eval_avg)
print(X_train_avg_aug.shape)

(157443, 45451)


In [42]:
logreg = LogisticRegression(
    n_jobs=-1,
    warm_start=False,
    max_iter=1000,
    C=0.1,
    penalty='elasticnet',
    l1_ratio=0.5,
    solver='saga',
)
#logreg = train_test(logreg, X_train_avg_aug, y_train, X_eval_avg_aug, y_eval) #humongous

In [9]:
# back to linsvc which might benefit from interaction terms, bt fundamentally we are not really adding any information here...
linsvc = LinearSVC(
    C=0.001,
)
#linsvc = train_test(linsvc, X_train_avg_aug, y_train, X_eval_avg_aug, y_eval) # hopefully not too humongous - nope, 'error value too large to convert to npy_int32' after 8+min

In [3]:
# let's improve the averaging. initial idea: parse out stopwords. We can calculate them as the top ones that are in both classes.
#import nltk
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english')) # tweets are a bit more specific than english, but let's just go with it
#print(len(stop_words))

df_train = load_train_data(path_pos='data/twitter-datasets/train_pos.txt', path_neg='data/twitter-datasets/train_neg.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True, # we can play around with this too, but for now we keep everything as with embeddings
    reduce_len=True,

)

# words
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Tokenized tweets: ", df_train['tweet_len'].describe(), end=' ')

# ngrams
from gensim.models.phrases import Phrases
quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Quadgrammed tweets: ", df_train['tweet_len'].describe())

# identify stopwords as the top ones that are in both classes, which basically translates in the onoes that give us less information
from collections import Counter
df_train_pos = df_train[df_train['label'] == 1]
df_train_neg = df_train[df_train['label'] != 1]
c_pos = Counter([word for tweet in df_train_pos['tweet'] for word in tweet])
c_neg = Counter([word for tweet in df_train_neg['tweet'] for word in tweet])
c_all = Counter([word for tweet in df_train['tweet'] for word in tweet])
# basically a cooccurence matrix of the top 1000 words in both classes, we want the count to be high in both classes
# actually we can do this by taking the most common words in one class and then check if the are in the other class's common words as well
# and take the ones that are roughly present in both classes in equal amounts so they are both very common and very uninformative for classification
# if this calculation is correct we have: say w has total count 100, we filter it out if there are more than 45 occurrances in pos AND in neg. 
# So the max unbalance is 45/55 = 0.82, which is the max ratio of occurrances in pos to occurrances in neg. after that we filter out words that are too common in one class
stop_words = [word for word, count in c_all.most_common(1000) if count*0.45 < c_pos[word] and count*0.45 < c_neg[word]]

print("Stop Words:", len(stop_words),
"Number of long tweets: ", df_train[df_train['tweet_len'] > 40].shape[0], 
"Number of short tweets (1-2 quadgrams): ", df_train[df_train['tweet_len'] < 3].shape[0] 
)

# clean outliers with too many words
print("Removing long tweets...")
df_train = df_train[df_train['tweet_len'] <= 40]
 # Consider creating a separate model for the ones with 1-3 words and train on those separately, we could inspect the results later

# remove stopwords
print("Removing stopwords...")
df_train['tweet'] = df_train['tweet'].apply(lambda x: [word for word in x if word not in stop_words]) # turns out there's some tweets that are just stopwords, so we need to filter those out

df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Number of null tweets: ", df_train[df_train['tweet_len'] == 0].shape[0])
print("Removing null tweets...")
df_train = df_train[df_train['tweet_len'] >= 1] 
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))

# remove words that are too rare --- we kinda did with fasttext and they should not have a vector anyway so let's skip here
#rare_stop_words = [word for word, count in c_all.most_common() if count < 3] # very slow with most_common, would be better to just have access to the least_common but ok
#df_train['tweet'] = df_train['tweet'].apply(lambda x: [word for word in x if word not in rare_stop_words])
#print("Rare words: ", len(rare_stop_words))

df_train_pos = df_train[df_train['label'] == 1]
df_train_neg = df_train[df_train['label'] != 1]
print("Final tweets: ", df_train['tweet_len'].describe(), "Final tweets, pos: ", df_train_pos['tweet_len'].describe(), "Final tweets, neg: ", df_train_neg['tweet_len'].describe())


# create 2d array of word vectors
X_vec = np.zeros((df_train.shape[0], 300))

from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    #print(tweet)
    #print(fasttext.wv.get_mean_vector(tweet))
    try: 
        vc = fasttext.wv.get_mean_vector(tweet)
    except ValueError:
        print('ValueError at index ', i, ' tweet:')
        print(tweet)
        continue
    X_vec[i] = vc # we could join and call get_phrase_vector, but lets try like this so maybe we use it for a weighting scheme later

    iter += 1
    if iter % 25000 == 0:
        print('Tweets processed: ', iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_vec, df_train['label'], test_size=0.2)

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)
Tokenized tweets:  count    196970.000000
mean         15.952135
std          14.561441
min           1.000000
25%          10.000000
50%          15.000000
75%          22.000000
max        1737.000000
Name: tweet_len, dtype: float64 Quadgrammed tweets:  count    196970.000000
mean         14.971011
std          13.791920
min           1.000000
25%           9.000000
50%          14.000000
75%          20.000000
max        1660.000000
Name: tweet_len, dtype: float64
Stop Words: 248 Number of long tweets:  166 Number of short tweets (1-2 quadgrams):  559
Removing long tweets...
Removing stopwords...
Number of null tweets:  20
Removing null tweets...
Final tweets:  count    196784.000000
mean         10.781786
std           5.427897
min           1.000000
25%           6.000000
50%          10.000000
75%          15.000000
max          35.000000
Name: tweet_len, dtype: float64 Final tweets, pos:  c

In [33]:
# let's now train
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(
    C=0.1,
    n_jobs=-1,
    solver='saga',
    max_iter=1000,
    verbose=0,
    penalty='elasticnet',
    l1_ratio=0.5,
)
logreg = train_test(logreg, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.776227862896054
F1 score:  0.7805765254005033
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7794469817756802
F1 score:  0.7849094006504569
Confusion matrix: 
[[14885  5020]
 [ 3787 15665]]


In [34]:
from sklearn.svm import LinearSVC
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.7995782198846457
F1 score:  0.8036638789326961
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8034136456897483
F1 score:  0.8083359137920356
Confusion matrix: 
[[15325  4580]
 [ 3308 16144]]


In [35]:
from sklearn.tree import DecisionTreeClassifier
lil_tree = DecisionTreeClassifier()
lil_tree = train_test(lil_tree, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.6986558934878166
F1 score:  0.694156480478622
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9992377419375329
F1 score:  0.9992341175119032
Confusion matrix: 
[[14038  5867]
 [ 5993 13459]]


In [37]:
from xgboost import XGBClassifier
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.8099702721243998
F1 score:  0.8135981855793435
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8827456535410063
F1 score:  0.8851230668699629
Confusion matrix: 
[[15556  4349]
 [ 3130 16322]]


In [38]:
logreg = LogisticRegression(max_iter=1000, n_jobs=-1)
logreg = train_test(logreg, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.7939121376121148
F1 score:  0.7975741845316828
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7968391699009699
F1 score:  0.8014452535712291
Confusion matrix: 
[[15267  4638]
 [ 3473 15979]]


In [None]:
# glove embeddings
from gensim.models import KeyedVectors
glove = KeyedVectors.load_word2vec_format('data/glove/glove.twitter.27B.200d.txt', binary=False)
# let's create a smaller version of the glove embeddings conatining only the words we need
# we can take the vocabulary we need directly form the fasttext model --- lets not complicate now, let's just use the whole thing for the moment 
# from gensim.models import FastText
# fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')
# glove_vocab = fasttext.wv.vocab.keys()
# glove.build_vocab(glove_vocab, update=True)
# glove.vocab = {k: v for k, v in glove.vocab.items() if k in glove_vocab}
# glove.save('data/glove/glove.twitter.27B.200d_small.model')
# glove = KeyedVectors.load('data/glove/glove.twitter.27B.200d_small.model')




In [6]:
# let's now use te glove embeddings since we are just averaging
X_vec_glove = np.zeros((df_train.shape[0], 200))
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    #print(tweet)
    #print(fasttext.wv.get_mean_vector(tweet))
    try: 
        vc = glove.get_mean_vector(tweet)
    except ValueError:
        print('ValueError at index ', i, ' tweet:')
        print(tweet)
        continue
    X_vec_glove[i] = vc # we could join and call get_phrase_vector, but lets try like this so maybe we use it for a weighting scheme later

    iter += 1
    if iter % 25000 == 0:
        print('Tweets processed: ', iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_vec_glove, df_train['label'], test_size=0.2)


Tweets processed:  25000
Tweets processed:  50000
Tweets processed:  75000
Tweets processed:  100000
Tweets processed:  125000
Tweets processed:  150000
Tweets processed:  175000


In [7]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(n_jobs=-1, max_iter=1000)
logreg = train_test(logreg, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 200)  Evaluation set size:  (39357, 200)
Metrics on evaluation set: 
Accuracy:  0.7624056711639606
F1 score:  0.7700140190363756
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7599522318280854
F1 score:  0.7649466324981963
Confusion matrix: 
[[14352  5294]
 [ 4057 15654]]


In [8]:
from sklearn.svm import LinearSVC
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 200)  Evaluation set size:  (39357, 200)
Metrics on evaluation set: 
Accuracy:  0.7635744594354245
F1 score:  0.7726661943270382
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7615466216087456
F1 score:  0.7679784413224469
Confusion matrix: 
[[14239  5407]
 [ 3898 15813]]


In [9]:
from sklearn.tree import DecisionTreeClassifier
lil_tree = DecisionTreeClassifier()
lil_tree = train_test(lil_tree, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 200)  Evaluation set size:  (39357, 200)
Metrics on evaluation set: 
Accuracy:  0.6786594506695124
F1 score:  0.6770676403748436
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9929364086211386
F1 score:  0.9928781862431152
Confusion matrix: 
[[13452  6194]
 [ 6453 13258]]


In [10]:
from xgboost import XGBClassifier
y_train = y_train.apply(lambda x: 0 if x != 1 else x)
y_eval = y_eval.apply(lambda x: 0 if x != 1 else x)
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    use_label_encoder=False,
)
xgb = train_test(xgb, X_train, y_train, X_eval, y_eval)

  from pandas import MultiIndex, Int64Index


Training set size:  (157427, 200)  Evaluation set size:  (39357, 200)
Metrics on evaluation set: 
Accuracy:  0.7833168178468888
F1 score:  0.7903431999213294
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8556156186677
F1 score:  0.8587409110682991
Confusion matrix: 
[[14755  4891]
 [ 3637 16074]]


In [12]:
# Ok so, no matter how we do it we just have less info than the tfidf.
# at this point we can try to do interactions from the embeddings or a weighted average
# let's try a weighted average by loading the tfidf we saved before in the pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
#pipe = pickle.load(open('data/out/trained/tfidf_vectorizer-linSVC-pipeline-v2_4.pickle', 'rb'))

with open('data/out/trained/tfidf_vectorizer-linSVC-pipeline-v2_4.pickle', 'rb') as f:
    pipe = pickle.load(f)

# we know that this achieves acc: 0.848	f1: 0.850 and runs in less than 3min on full data from scratch
# let's train a linear SVC on the partial data and see how it performs so we can compare and iterate
vec_pipe = pipe.steps[0][1]
svm_pipe = pipe.steps[1][1]
print(vec_pipe, svm_pipe)


TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4)) LinearSVC()


In [35]:
# let's first check the output of the tfidf vectorizer
tweet0_vecs = vec_pipe.transform(df_train['tweet'][0])
print(tweet0_vecs.shape)
print(tweet0_vecs.toarray())
print(vec_pipe.get_feature_names()[0:10])
print(vec_pipe.get_feature_names()[100:110])
print(vec_pipe.transform(df_train['tweet'][0]).toarray().nonzero()[1])
print(vec_pipe.transform(df_train['tweet'][0]).toarray().nonzero()[0]) 
print(vec_pipe.transform(df_train['tweet'][0]).toarray().nonzero()[1].shape)
print(vec_pipe.transform(df_train['tweet'][0]).toarray().nonzero()[0].shape)

# so we can indeed use these as weights for the glove embeddings

(15, 3018685)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['00', '00 00', '00 00 00', '00 00 am', '00 00 am kst', '00 00 battery', '00 00 battery biz', '00 00 happy', '00 00 in', '00 000']
['00 bid end', '00 bid end date', '00 bids', '00 bids end', '00 bids end date', '00 black', '00 black box', '00 black box url', '00 bst', '00 bst white']
[2593752  672735 1281430 1497736 1651918 1281430   76098 2264125 1080966
 2944833 2856707  787940  334095   18729]
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14]
(14,)
(14,)


In [41]:
#We should actually re-train a tfidf vectorizer that we are sure has our vocabulary...
# since we now tokenized and cleaned the tweets differently we don't need ngrams anymore

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    vocabulary= None,
    lowercase=False,
    ngram_range=(1,1),
    tokenizer=tknzr.tokenize,
)

X_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))

# import pickle
# with open('data/out/trained/tfidf_vectorizer_cleaned-v5.pickle', 'wb') as f:
#     pickle.dump(tfidf, f)


In [51]:
# let's check inconsistencies between the two vocabularies
# we can count the non-zero elements in the tfidf matrix and compare it to the len of tweets
print(X_tfidf.nonzero()[1].shape)
print(df_train['tweet_len'].sum())

# so we have less words in the tfidf than in the tweets, let's check the difference
print(X_tfidf.nonzero()[1].shape[0] - df_train['tweet_len'].sum())
# in which tweets are these words?
print(df_train['tweet'][df_train['tweet_len'] != X_tfidf.nonzero()[1].shape[0]])

# <user> not in tfidf?
print(df_train['tweet'][df_train['tweet_len'] != X_tfidf.nonzero()[1].shape[0]].apply(lambda x: '<user>' in x).sum())



(1968319,)
2121683
-153364
0         [<user>, dunno, justin, mention, not, justin, ...
1         [your, logic, is, so, dumb, won't, even, crop,...
2         [<user>, put, casper, in, box, !, looved, the,...
3         [<user>, <user>, thanks, sir, >_>, don't, trip...
4         [visiting, tmr, is, the, bestest, birthday_gif...
                                ...                        
196965    [can't_wait, fake_tan, tonight, !, hate_being,...
196966    [<user>, darling, lost, internet_connection, i...
196967    [kanguru, defender, basic, 4_gb, usb_2.0, flas...
196968                                     [rizan, is, sad]
196969                                    [no, ?, yea, mad]
Name: tweet, Length: 196784, dtype: object
101096


In [52]:
tfidf = TfidfVectorizer(
    vocabulary= None,
    lowercase=False,
    ngram_range=(1,4),
    tokenizer=tknzr.tokenize,
    stop_words=stop_words
)

X_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))



In [55]:
# let's check inconsistencies between the two vocabularies
# we can count the non-zero elements in the tfidf matrix and compare it to the len of tweets
print(X_tfidf.nonzero()[1].shape)
print(df_train['tweet_len'].sum())

# so we have less words in the tfidf than in the tweets, let's check the difference
print(X_tfidf.nonzero()[1].shape[0] - df_train['tweet_len'].sum())
# in which tweets are these words?
print(df_train['tweet'][df_train['tweet_len'] > X_tfidf.nonzero()[1].shape[0]]) # we looked for the tweets that have more words than the tfidf entries now

# <user> not in tfidf?
print(df_train['tweet'][df_train['tweet_len'] > X_tfidf.nonzero()[1].shape[0]].apply(lambda x: '<user>' in x).sum())


(7158536,)
2121683
5036853
Series([], Name: tweet, dtype: object)
0


In [59]:

# lets use the tfidf vectorizer values to weight the glove embeddings or the fasttext embeddings
# let's try with the glove embeddings - ok nah lets try with the fasttext embeddings that we are sure are working, else too many keyerrors to fix
from gensim.models import FastText
fasttext = FastText.load('data/fasttext/fasttext_300_4grams.model')

X_vec_fasttext_tfidf = np.zeros((df_train.shape[0], 300))
iter = 0
for i, tweet in enumerate(df_train['tweet']):
    #print(tweet)
    #print(fasttext.wv.get_mean_vector(tweet))
    try: 
        weights = X_tfidf[i].nonzero()[1]
    except ValueError:
        print('ValueError at index ', i, ' tweet:')
        print(tweet)
        continue
    
    for j, word in enumerate(tweet):
        try:
            if len(weights) > j:
                X_vec_fasttext_tfidf[i] += fasttext.wv.get_vector(word) * weights[j]
            else:
                X_vec_fasttext_tfidf[i] += fasttext.wv.get_vector(word) * weights.mean()
        except KeyError:
            print('KeyError at index ', j, ' of tweet: ', i, ' word: ', word)
            print(tweet)
            continue

    X_vec_fasttext_tfidf[i] = X_vec_fasttext_tfidf[i] / len(tweet)
    iter += 1
    if iter % 25000 == 0:
        print('Tweets processed: ', iter)

from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(X_vec_fasttext_tfidf, df_train['label'], test_size=0.2)


Tweets processed:  25000
Tweets processed:  50000
Tweets processed:  75000
Tweets processed:  100000
Tweets processed:  125000
Tweets processed:  150000
Tweets processed:  175000


In [60]:
from sklearn.svm import LinearSVC
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.6673272861244506
F1 score:  0.7391468929930469
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.6737980143177472
F1 score:  0.7445442109190398
Confusion matrix: 
[[ 7714 12157]
 [  936 18550]]




In [62]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(
    n_jobs=-1,
    max_iter=1000,
    C=0.1,
)
logreg = train_test(logreg, X_train, y_train, X_eval, y_eval)

Training set size:  (157427, 300)  Evaluation set size:  (39357, 300)
Metrics on evaluation set: 
Accuracy:  0.7755926518789542
F1 score:  0.7803531459835862
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.779967858118366
F1 score:  0.7847533353632392
Confusion matrix: 
[[14836  5035]
 [ 3797 15689]]


In [63]:
from sklearn.model_selection import train_test_split
X_train_tfidf, X_eval_tfidf, y_train_tfidf, y_eval_tfidf = train_test_split(X_tfidf, df_train['label'], test_size=0.2)
from sklearn.svm import LinearSVC
linsvc_tfidf = LinearSVC()
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.8386055847752624
F1 score:  0.842881171465321
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9984564274235043
F1 score:  0.9984491770427145
Confusion matrix: 
[[15967  3902]
 [ 2450 17038]]


In [64]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = train_test(nb, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.8056508372081206
F1 score:  0.8199854086747782
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9688617581482211
F1 score:  0.9694765812774754
Confusion matrix: 
[[14287  5582]
 [ 2067 17421]]


In [65]:
from sklearn.linear_model import LogisticRegression
logreg_tfidf = LogisticRegression(
    n_jobs=-1,
    max_iter=1000,
)
logreg_tfidf = train_test(logreg_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)


Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.8220900983306655
F1 score:  0.8280789628756628
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.918883037852465
F1 score:  0.920906264322965
Confusion matrix: 
[[15492  4377]
 [ 2625 16863]]


In [66]:
linsvc_tfidf = LinearSVC(
    C=0.1,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.8226999009070813
F1 score:  0.8300784103638045
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.919588126560247
F1 score:  0.9218893653780891
Confusion matrix: 
[[15335  4534]
 [ 2444 17044]]


In [67]:
linsvc_tfidf = LinearSVC(
    C=0.01,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.7725944558782427
F1 score:  0.7835236068111456
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.7990941833357683
F1 score:  0.80975410230499
Confusion matrix: 
[[14210  5659]
 [ 3291 16197]]


In [81]:
linsvc_tfidf = LinearSVC(
    C=0.85,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 3435629)  Evaluation set size:  (39357, 3435629)
Metrics on evaluation set: 
Accuracy:  0.8384023172497904
F1 score:  0.8427999406792228
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9981451720479968
F1 score:  0.9981364239763098
Confusion matrix: 
[[15948  3921]
 [ 2439 17049]]


In [82]:
# honestly with a bit of preprocessing tfidf is king... we could even add the 5-grams and call it a day now that we preprocessed the tweets
# we can literally scrap the fasttext embeddings and just use the tfidf vectorizer
# note that we are working out of the partial dataset here, so we should probably retrain the tfidf vectorizer on the full dataset

tfidf = TfidfVectorizer(
    vocabulary= None,
    lowercase=False,
    ngram_range=(1,5),
    tokenizer=tknzr.tokenize,
    stop_words=None,
)

X_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))

from sklearn.model_selection import train_test_split
X_train_tfidf, X_eval_tfidf, y_train_tfidf, y_eval_tfidf = train_test_split(X_tfidf, df_train['label'], test_size=0.2)


In [85]:
linsvc_tfidf = LinearSVC(
    C=0.85,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 4613061)  Evaluation set size:  (39357, 4613061)
Metrics on evaluation set: 
Accuracy:  0.8403079503010901
F1 score:  0.8465613632479676
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9982976236604902
F1 score:  0.9982850195175018
Confusion matrix: 
[[15734  3917]
 [ 2368 17338]]


In [86]:
tfidf = TfidfVectorizer(
    vocabulary= None,
    lowercase=False,
    ngram_range=(1,6),
    tokenizer=tknzr.tokenize,
    stop_words=None,
)
X_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))
from sklearn.model_selection import train_test_split
X_train_tfidf, X_eval_tfidf, y_train_tfidf, y_eval_tfidf = train_test_split(X_tfidf, df_train['label'], test_size=0.2)

In [92]:
# disgustingly good and fast
linsvc_tfidf = LinearSVC(
    C=0.85,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 5644917)  Evaluation set size:  (39357, 5644917)
Metrics on evaluation set: 
Accuracy:  0.8370556698935386
F1 score:  0.8439165672840557
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9983547930151753
F1 score:  0.9983446142439872
Confusion matrix: 
[[15607  4155]
 [ 2258 17337]]


In [93]:
tfidf = TfidfVectorizer(
    vocabulary= None,
    lowercase=False,
    ngram_range=(1,6),
    tokenizer=tknzr.tokenize,
    stop_words=stop_words,
)
X_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))
from sklearn.model_selection import train_test_split
X_train_tfidf, X_eval_tfidf, y_train_tfidf, y_eval_tfidf = train_test_split(X_tfidf, df_train['label'], test_size=0.2)



In [94]:
linsvc_tfidf = LinearSVC(
    C=0.85,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 5621570)  Evaluation set size:  (39357, 5621570)
Metrics on evaluation set: 
Accuracy:  0.8389613029448383
F1 score:  0.8447557928770881
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.998329384413093
F1 score:  0.998320229419617
Confusion matrix: 
[[15775  4038]
 [ 2300 17244]]


In [96]:
linsvc_tfidf = LinearSVC(
    C=1,
)
linsvc_tfidf = train_test(linsvc_tfidf, X_train_tfidf, y_train_tfidf, X_eval_tfidf, y_eval_tfidf)

Training set size:  (157427, 5621570)  Evaluation set size:  (39357, 5621570)
Metrics on evaluation set: 
Accuracy:  0.8391391620296262
F1 score:  0.8447561364360854
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9985961747349565
F1 score:  0.9985885538744514
Confusion matrix: 
[[15801  4012]
 [ 2319 17225]]


In [5]:
# so let's just do this for the full dataset

df_train = load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt')
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True, # we can play around with this too, but for now we keep everything as with embeddings
    reduce_len=True,
)

# words
df_train['tweet'] = df_train['tweet'].apply(lambda x: tknzr.tokenize(x))
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Tokenized tweets: ", df_train['tweet_len'].describe(), end=' ')

# ngrams
from gensim.models.phrases import Phrases
quadgrams = Phrases.load('data/grams/quadgram.model')
df_train['tweet'] = df_train['tweet'].apply(lambda x: quadgrams[x])
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Quadgrammed tweets: ", df_train['tweet_len'].describe())

# identify stopwords as the top ones that are in both classes, which basically translates in the onoes that give us less information
from collections import Counter
df_train_pos = df_train[df_train['label'] == 1]
df_train_neg = df_train[df_train['label'] != 1]
c_pos = Counter([word for tweet in df_train_pos['tweet'] for word in tweet])
c_neg = Counter([word for tweet in df_train_neg['tweet'] for word in tweet])
c_all = Counter([word for tweet in df_train['tweet'] for word in tweet])
# basically a cooccurence matrix of the top 1000 words in both classes, we want the count to be high in both classes
# actually we can do this by taking the most common words in one class and then check if the are in the other class's common words as well
# and take the ones that are roughly present in both classes in equal amounts so they are both very common and very uninformative for classification
# if this calculation is correct we have: say w has total count 100, we filter it out if there are more than 45 occurrances in pos AND in neg. 
# So the max unbalance is 45/55 = 0.82, which is the max ratio of occurrances in pos to occurrances in neg. after that we filter out words that are too common in one class
stop_words = [word for word, count in c_all.most_common(1000) if count*0.45 < c_pos[word] and count*0.45 < c_neg[word]]

print("Stop Words:", len(stop_words),
"Number of long tweets: ", df_train[df_train['tweet_len'] > 40].shape[0], 
"Number of short tweets (1-2 quadgrams): ", df_train[df_train['tweet_len'] < 3].shape[0] 
)

# clean outliers with too many words
print("Removing long tweets...")
df_train = df_train[df_train['tweet_len'] <= 40]
 # Consider creating a separate model for the ones with 1-3 words and train on those separately, we could inspect the results later

# remove stopwords
print("Removing stopwords...")
df_train['tweet'] = df_train['tweet'].apply(lambda x: [word for word in x if word not in stop_words]) # turns out there's some tweets that are just stopwords, so we need to filter those out

df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))
print("Number of null tweets: ", df_train[df_train['tweet_len'] == 0].shape[0])
print("Removing null tweets...")
df_train = df_train[df_train['tweet_len'] >= 1] 
df_train['tweet_len'] = df_train['tweet'].apply(lambda x: len(x))

# remove words that are too rare --- we kinda did with fasttext and they should not have a vector anyway so let's skip here
#rare_stop_words = [word for word, count in c_all.most_common() if count < 3] # very slow with most_common, would be better to just have access to the least_common but ok
#df_train['tweet'] = df_train['tweet'].apply(lambda x: [word for word in x if word not in rare_stop_words])
#print("Rare words: ", len(rare_stop_words))

df_train_pos = df_train[df_train['label'] == 1]
df_train_neg = df_train[df_train['label'] != 1]
print("Final tweets: ", df_train['tweet_len'].describe(), "Final tweets, pos: ", df_train_pos['tweet_len'].describe(), "Final tweets, neg: ", df_train_neg['tweet_len'].describe())


Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
Tokenized tweets:  count    2.458295e+06
mean     1.598098e+01
std      1.491048e+01
min      1.000000e+00
25%      1.000000e+01
50%      1.500000e+01
75%      2.200000e+01
max      3.304000e+03
Name: tweet_len, dtype: float64 Quadgrammed tweets:  count    2.458295e+06
mean     1.499350e+01
std      1.410102e+01
min      1.000000e+00
25%      9.000000e+00
50%      1.400000e+01
75%      2.000000e+01
max      3.229000e+03
Name: tweet_len, dtype: float64
Stop Words: 256 Number of long tweets:  2461 Number of short tweets (1-2 quadgrams):  12722
Removing long tweets...
Removing stopwords...
Number of null tweets:  276
Removing null tweets...
Final tweets:  count    2.455558e+06
mean     1.071552e+01
std      5.452992e+00
min      1.000000e+00
25%      6.000000e+00
50%      1.000000e+01
75%      1.500000e+01
max      3.900000e+01
Name: tweet_len, dtype: float64 Final tweets, pos:  count    1.21692

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
# add quadgrams on top of tokenizer

from nltk.tokenize import TweetTokenizer
qutknzr = TweetTokenizer(
    preserve_case=True, 
    reduce_len=True,
)

def quadgram_tokenizer(tweet):
    return quadgrams[qutknzr.tokenize(tweet)]

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=quadgram_tokenizer,
    min_df=3,
    stop_words=None,
)
X_train_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))
y_train_tfidf = df_train['label']

# save the tfidf model
import pickle
with open('data/out/trained/tfidf-v5.model', 'wb') as f:
    pickle.dump(tfidf, f)

In [100]:
X_train, X_eval, y_train, y_eval = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2)
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (1964446, 1080613)  Evaluation set size:  (491112, 1080613)
Metrics on evaluation set: 
Accuracy:  0.8524267376891626
F1 score:  0.8534831487931943
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9332279940502309
F1 score:  0.9335508285249673
Confusion matrix: 
[[207548  40799]
 [ 31676 211089]]


In [101]:
# disgustingly good
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = train_test(nb, X_train, y_train, X_eval, y_eval)

Training set size:  (1964446, 1080613)  Evaluation set size:  (491112, 1080613)
Metrics on evaluation set: 
Accuracy:  0.8184385639121016
F1 score:  0.8320610830795425
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8477545323210717
F1 score:  0.8587643831502157
Confusion matrix: 
[[181054  67293]
 [ 21874 220891]]


In [103]:
# let's add grams
from sklearn.feature_extraction.text import TfidfVectorizer
# add quadgrams on top of tokenizer

from nltk.tokenize import TweetTokenizer
qutknzr = TweetTokenizer(
    preserve_case=True, 
    reduce_len=True,
)

def quadgram_tokenizer(tweet):
    return quadgrams[qutknzr.tokenize(tweet)]

tfidf = TfidfVectorizer(
    ngram_range=(1, 5),
    tokenizer=qutknzr.tokenize,
    min_df=3,
    stop_words=None,
)
X_train_tfidf = tfidf.fit_transform(df_train['tweet'].apply(lambda x: ' '.join(x)))
y_train_tfidf = df_train['label']

# save the tfidf model
import pickle
with open('data/out/trained/tfidf-v5-5gram.model', 'wb') as f:
    pickle.dump(tfidf, f)

In [104]:
X_train, X_eval, y_train, y_eval = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2)
linsvc = LinearSVC()
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (1964446, 2798649)  Evaluation set size:  (491112, 2798649)
Metrics on evaluation set: 
Accuracy:  0.8554891755852025
F1 score:  0.8565926638391079
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9582157005079295
F1 score:  0.9581954626826145
Confusion matrix: 
[[208181  39709]
 [ 31262 211960]]


In [105]:

linsvc = LinearSVC(
    C=0.1,
)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (1964446, 2798649)  Evaluation set size:  (491112, 2798649)
Metrics on evaluation set: 
Accuracy:  0.8605389402010132
F1 score:  0.8634769903584151
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8934748015471028
F1 score:  0.8954068333955605
Confusion matrix: 
[[206026  41864]
 [ 26627 216595]]


In [6]:
# load vectorizer
import pickle
with open('data/out/trained/tfidf-v5-5gram.model', 'rb') as f:
    tfidf = pickle.load(f)

X_train_tfidf = tfidf.transform(df_train['tweet'].apply(lambda x: ' '.join(x)))

In [9]:

# grid search
from sklearn.model_selection import GridSearchCV
# random search
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'C': [0.001, 0.005, 0.01, 0.05, 0.01, 0.03, 0.5, 0.8, 0.85, 0.95, 1]
}

from sklearn.svm import LinearSVC
linsvc = LinearSVC()
grid_search = RandomizedSearchCV(linsvc, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, df_train['label'])
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.cv_results_)
print(grid_search)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
{'C': 0.5}
0.8540905974111057
LinearSVC(C=0.5)
{'mean_fit_time': array([18.19162003, 52.87049405, 18.09167592, 16.33018398, 13.88037475,
       28.05644806, 43.33823458, 38.91799521, 12.90611966, 29.49451756]), 'std_fit_time': array([0.87336903, 8.08964656, 0.99546791, 1.76063246, 0.77400606,
       1.33520405, 4.219617  , 4.60410238, 0.50013605, 0.61895113]), 'mean_score_time': array([0.21216003, 0.23504821, 0.42306733, 0.20025937, 0.20223029,
       0.20049866, 0.15699005, 0.16616281, 0.18446406, 0.12592276]), 'std_score_time': array([0.02278829, 0.04378246, 0.33036282, 0.01055319, 0.02842651,
       0.01498232, 0.0024077 , 0.02891647, 0.02576474, 0.01276247]), 'param_C': masked_array(data=[0.01, 1, 0.005, 0.01, 0.03, 0.5, 0.95, 0.85, 0.05, 0.8],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.01

In [11]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

linsvc = LinearSVC(
    C=0.95,
)
X_train, X_eval, y_train, y_eval = train_test_split(X_train_tfidf, df_train['label'], test_size=0.2)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)
linsvc = LinearSVC(
    C=0.5,
)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

linsvc = LinearSVC(
    C=0.1,
)
linsvc = train_test(linsvc, X_train, y_train, X_eval, y_eval)

Training set size:  (1964446, 2798649)  Evaluation set size:  (491112, 2798649)
Metrics on evaluation set: 
Accuracy:  0.8558475459772924
F1 score:  0.8571549635093936
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9568244685779095
F1 score:  0.9568137843947941
Confusion matrix: 
[[207911  39763]
 [ 31032 212406]]
Training set size:  (1964446, 2798649)  Evaluation set size:  (491112, 2798649)
Metrics on evaluation set: 
Accuracy:  0.8603088501197282
F1 score:  0.8621379825894042
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.9388753877683581
F1 score:  0.9391490310548933
Confusion matrix: 
[[207996  39678]
 [ 28926 214512]]
Training set size:  (1964446, 2798649)  Evaluation set size:  (491112, 2798649)
Metrics on evaluation set: 
Accuracy:  0.8603149587059571
F1 score:  0.8634123711545469
Metrics on training set to check overfitting/triviality of model: 
Accuracy:  0.8937512153553725
F1 score:  0.8956330074114965
Confusi

In [16]:
# let's check average length of tweets we missclassified
df_train['pred'] = linsvc.predict(X_train_tfidf)
print(df_train['pred'].head())
print(df_train['label'].head())
print(len(df_train[(df_train['pred'] != df_train['label'])]))
print(df_train[(df_train['pred'] != df_train['label'])]['tweet_len'].describe())
print(df_train[(df_train['pred'] == df_train['label'])]['tweet_len'].describe()) # they are indeed slightly longer

0    1
1   -1
2    1
3    1
4    1
Name: pred, dtype: int64
0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64
277321
count    277321.000000
mean          9.783049
std           4.952792
min           1.000000
25%           6.000000
50%           9.000000
75%          13.000000
max          38.000000
Name: tweet_len, dtype: float64
count    2.178237e+06
mean     1.083424e+01
std      5.502091e+00
min      1.000000e+00
25%      6.000000e+00
50%      1.000000e+01
75%      1.500000e+01
max      3.900000e+01
Name: tweet_len, dtype: float64


In [25]:
print(df_train[(df_train['pred'] != df_train['label'])]['tweet_len'].mean())
print(df_train[(df_train['pred'] == df_train['label'])]['tweet_len'].mean())
print(df_train[(df_train['pred'] != df_train['label'])]['tweet_len'].median())
print(df_train[(df_train['pred'] == df_train['label'])]['tweet_len'].median())
print(df_train[(df_train['pred'] != df_train['label'] & (df_train['label'] == 0))]['tweet_len'].mean())
print(df_train[(df_train['pred'] != df_train['label'] & (df_train['label'] == 1))]['tweet_len'].mean()) # in particular we missclasify positive longer tweets

print(len(df_train[(df_train['pred'] != df_train['label']) & (df_train['label'] == 0)])) # what? it's 0 tweets?
print(len(df_train[(df_train['pred'] != df_train['label']) & (df_train['label'] == 1)]))
print(len(df_train[(df_train['pred'] == df_train['label']) & (df_train['label'] == 0)]))
print(len(df_train[(df_train['pred'] == df_train['label']) & (df_train['label'] == 1)])) # we only classify positive??

9.783049246180418
10.834239341265436
9.0
10.0
10.715522093145427
11.658679041455194
0
104527
0
1112397


In [28]:
print(df_train[df_train['pred'] != 1].head())
print(df_train.head())

                                                tweet  label  tweet_len  pred
1   [your, logic, is, dumb, won't, crop, your, nam...      1         11    -1
10          [<user>, anddd, cheer, #nationals2013, ?]      1          5    -1
11  [we, send, an_invitation, shop, on-line, !, yo...      1         17    -1
23             [<user>, no, doubts, that, ability, !]      1          6    -1
29         [<user>, seriously, it's, vanity, fairest]      1          5    -1
                                               tweet  label  tweet_len  pred
0  [<user>, dunno, justin, read, mention, not, ju...      1         16     1
1  [your, logic, is, dumb, won't, crop, your, nam...      1         11    -1
2  [<user>, just, put, casper, in, box, !, looved...      1         12     1
3  [<user>, <user>, thanks, sir, >_>, don't, trip...      1         15     1
4  [visiting, tmr, is, the, bestest, birthday_gif...      1         10     1


In [36]:
# honestly I'd missclassify them too wtf? Are we sure of the labels?
print(df_train[1:2]['tweet'].apply(lambda x: ' '.join(x)))
print(df_train[1:2]['tweet'].apply(lambda x: ' '.join(x)).values[0]) # honestly this is a very negative tweet, why is it in positive dataset?

# How to detect sarcasm?? ahahahah if this is sentiment analysis it actually performs well, but this is just missclassification!

1    your logic is dumb won't crop your name your t...
Name: tweet, dtype: object
your logic is dumb won't crop your name your tsk <url>


In [40]:
# Final model
linsvc = LinearSVC(
    C=0.1,
)
linsvc.fit(X_train_tfidf, df_train['label'])
# load test data
df_test = load_test_data()

from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(
    preserve_case=True,
    reduce_len=True,
)

# apply same preprocessing to test data
df_test['tweet'] = df_test['tweet'].apply(lambda x: tknzr.tokenize(x))
df_test['tweet'] = df_test['tweet'].apply(lambda x: [w for w in x if w not in stop_words])

# let's check if we have nulls...
print(df_test.isnull().sum())

# apply same vectorizer to test data
X_test_tfidf = tfidf.transform(df_test['tweet'].apply(lambda x: ' '.join(x)))

# make predictions and save them
predict_test_data(X_test_tfidf,linsvc, 'data/out/submission-v5.csv')


id       0
tweet    0
dtype: int64


Unnamed: 0,Id,tweet,Prediction
0,1,"[sea, doo, pro, sea, scooter, (, sports, the, ...",-1
1,2,"[<user>, shucks, well, work, can't, cheer, you...",1
2,3,"[cant, away, from, bug, thats]",-1
3,4,"[<user>, no, ma'am, !, !, !, lol, perfectly, f...",1
4,5,"[whenever, fall, asleep, watching, the, tv, al...",-1
...,...,...,...
9995,9996,"[had, nice, time, w, /, friend, lastnite]",1
9996,9997,"[<user>, no, it's, not, !, please, stop, !]",-1
9997,9998,"[not, without, daughter, (, dvd, two-time, osc...",-1
9998,9999,"[<user>, fun, in, sweetcheeks]",1


In [41]:
#acc: 0.845	 f1 score: 0.848

# so slightly worse than v2..... all that embedding work uff

In [None]:
# next: two different models mased on length of tweets. Embeddings for less than 3-4 words, tfidf for longer tweets maybe?