# Exploration Notebook v2 - Jacopo

**Version**: v2

## I just Realized I've been using the 10% datasets the whole time...

Use glove.twitter.27B as embeddings and/or better preprocessing 

In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt


In [2]:
def build_feature_matrix(df, vocab, embeddings, mode='avg'):
    X = np.zeros((df.shape[0], embeddings.shape[1]))
    for i, tweet in enumerate(df['tweet']):
        words = tweet.split()
        for word in words:
            if word in vocab:
                X[i] += embeddings[vocab[word]]
        if mode == 'avg':
            X[i] /= len(words)
        elif mode == 'sum':
            pass
        else:
            raise ValueError('Unknown mode: {}'.format(mode))
    return X
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'], on_bad_lines='skip')
    df_train_neg['label'] = 0
    df_train = pd.concat([df_train_pos, df_train_neg])
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train   
def load_test_data():
    # Load test data: id, tweet for each row
    data_path = 'data/twitter-datasets/'
    df_test = pd.read_csv(data_path + 'test_data.txt', header=None, names=['line'], sep='\t')
    # Extract id and tweet, limit split by 1 so we don't split the tweet (this is v0, at least we keep it intact)
    df_test['id'] = df_test['line'].apply(lambda x: x.split(',',1)[0]) 
    df_test['tweet'] = df_test['line'].apply(lambda x: x.split(',',1)[1])
    df_test = df_test.drop('line', axis=1)
    return df_test
def predict_test_data(X_test, classifier, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = classifier.predict(X_test)
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x == 0 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test
    
def predict_test_data_pipeline(df_test, pipe, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = pipe.predict(df_test['tweet'])
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x == 0 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test

In [4]:
# Load data, txt as csv
data_path = 'data/twitter-datasets/'
df_train_pos = pd.read_csv(data_path + 'train_pos_full.txt', sep = '\t', names = ['tweet'])
df_train_pos['label'] = 1
df_train_neg = pd.read_csv(data_path + 'train_neg_full.txt', sep = '\t', names = ['tweet'], on_bad_lines='skip')
df_train_neg['label'] = 0
df_train = pd.concat([df_train_pos, df_train_neg])
print('Train set: ', df_train.shape)
print('Train set positives: ', df_train_pos.shape)
print('Train set negatives: ', df_train_neg.shape)

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


In [4]:
df_train = load_train_data()

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


In [5]:
# # vecotizer
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# X_train = vectorizer.fit_transform(df_train['tweet'])
# y_train = df_train['label']

In [6]:
# # classifier, naive bayes
# from sklearn.naive_bayes import MultinomialNB
# classifier = MultinomialNB()
# # k-fold cross validation
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
# print('Cross validation scores: ', scores)
# print('Mean cross validation score: ', np.mean(scores))
# # very fast and good results
# # Cross validation scores:  [0.73153344 0.73257685 0.72950358 0.73167175 0.73085818]
# # Mean cross validation score:  0.7312287581433473

In [7]:
# classifier.fit(X_train, y_train)
# # Load test data: id, tweet for each row
# df_test = load_test_data()
# X_test = vectorizer.transform(df_test['tweet'])
# # Predict test data and save to csv
# df_test = predict_test_data(X_test, classifier, filename='data/out/submission-v2.csv')
# # acc: 0.719 f1: 0.759

In [8]:
# lets now do the same but with n-grams and embeddings
# vectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(ngram_range=(1, 2))
# X_train = vectorizer.fit_transform(df_train['tweet'])
# y_train = df_train['label']


In [3]:
# classifier, naive bayes, same as before, fast and good results
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
# k-fold cross validation
from sklearn.model_selection import cross_val_score
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
# print('Cross validation scores: ', scores)
# print('Mean cross validation score: ', np.mean(scores))
# Cross validation scores:  [0.78193219 0.78204609 0.7791274  0.78229627 0.78140134]
# Mean cross validation score:  0.7813606585051834


In [10]:
# # tri-grams as well
# # vectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(ngram_range=(1, 3))
# X_train = vectorizer.fit_transform(df_train['tweet'])
# y_train = df_train['label']

In [11]:
# # k-fold
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
# print('Cross validation scores: ', scores)
# print('Mean cross validation score: ', np.mean(scores))
# # Cross validation scores:  [0.80305456 0.8033271  0.80066876 0.80417118 0.80257862]
# # Mean cross validation score:  0.8027600430379593

In [12]:
# classifier.fit(X_train, y_train)
# X_test = vectorizer.transform(df_test['tweet'])
# # Predict test data and save to csv
# df_test = predict_test_data(X_test, classifier, filename='data/out/submission-v2_1.csv')
# # acc: 0.799	f1: 0.820
# # nice, starting to get better

In [13]:
# # lets keep same vectorizer but use a different classifier
# # aparently it makes a big problem having thr n grams, let try with 1-2
# # vectorizer
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(ngram_range=(1, 2))
# X_train = vectorizer.fit_transform(df_train['tweet'])
# y_train = df_train['label']
# # classifier, random forest
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(
#     n_estimators=100, 
#     max_depth=10, 
#     n_jobs=-1)
# # cast values, otherwise we get: ValueError: buffer source array is read-only
# # X_train = X_train.toarray()
# # y_train = y_train.toarray()
# # # k-fold cross validation
# # scores = cross_val_score(clf, X_train, y_train, cv=5)
# # print('Cross validation scores: ', scores)
# # print('Mean cross validation score: ', np.mean(scores))


In [26]:
##============ VECTORIZER ================##
# I get varous errors, also note that we are not refining at all the vectorizer
# let's refine if a bit and keep iterating fast with nayve bayes for a new versione
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

## ========== CLASSIFIER ==========##
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

##========== PIPELINE ==========##
#rand search for vectorizer ... how to do it? how to pass the vectorizer to the classifier?
# need to give more info to sklearn, so we need to create a pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('classifier', classifier)]
)
# now we can use the pipeline in the random search, all examples I've seen just add
# a prefix with the name of preprocessor or classifier variable, not sure where it's said
# explicitly but hey, this is python, we can do whatever we want
# found this comment in example code from sklearn about the double underscore:
# "Parameters of pipelines can be set using '__' separated parameter names:"


##========== GRID PARAMS ==========##
grid_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vectorizer__max_features': [1000, 10000, 100000, None], #max number of features
    'vectorizer__min_df': [1, 5, 10, 15], #ignore terms with frequency last than this (nb of docs)
    'vectorizer__max_df': [0.9, 0.95, 1.0], #ignore terms with frequency higher than this (ratio of docs in corpus)
    'vectorizer__binary': [True, False], #if True, all non zero counts are set to 1 (should be better for our case)
    'vectorizer__stop_words': ['english'], #ignore common words, default is english but we can pass a list of words, applies only if analyzer is a 'word'
    'vectorizer__analyzer': ['word'],
    'vectorizer__lowercase': [True, False], #convert all characters to lowercase before tokenizing
    #'classifier__alpha': [1] # else error - Invalid parameter 'vectorizer' for estimator MultinomialNB(). --- fixed, was later
}


##========== SEARCH AND FIT ==========##
rand_search = RandomizedSearchCV(
    pipeline, # I forgot to pass the pipeline here, I was still passing the classifier
    grid_params,
    n_iter=10,
    n_jobs=-1,
    cv=5,
    verbose=2,
)
# stats 
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
from pprint import pprint
from time import time
pprint(grid_params)
t0 = time()

# rand_search.fit(X_train, y_train) # this is not working, we need to obtain these from vectorizer, let's try
# to put just the data in a way it can be used by the pipeline
classifier = rand_search.fit(df_train['tweet'], df_train['label']) # I was not equating this to classifier, had to fit it after again all over

print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % rand_search.best_score_)
print("Best parameters set:")
best_parameters = rand_search.best_estimator_.get_params()
for param_name in sorted(grid_params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name])) 


Performing grid search...
pipeline: ['vectorizer', 'classifier']
parameters:
{'vectorizer__analyzer': ['word'],
 'vectorizer__binary': [True, False],
 'vectorizer__lowercase': [True, False],
 'vectorizer__max_df': [0.9, 0.95, 1.0],
 'vectorizer__max_features': [1000, 10000, 100000, None],
 'vectorizer__min_df': [1, 5, 10, 15],
 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
 'vectorizer__stop_words': ['english']}
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END vectorizer__analyzer=word, vectorizer__binary=False, vectorizer__lowercase=False, vectorizer__max_df=0.95, vectorizer__max_features=None, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english; total time=  21.5s
[CV] END vectorizer__analyzer=word, vectorizer__binary=False, vectorizer__lowercase=False, vectorizer__max_df=0.95, vectorizer__max_features=None, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english; total time=  22.1s
[CV] END vec



[CV] END vectorizer__analyzer=word, vectorizer__binary=True, vectorizer__lowercase=False, vectorizer__max_df=0.95, vectorizer__max_features=1000, vectorizer__min_df=15, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english; total time= 1.1min
[CV] END vectorizer__analyzer=word, vectorizer__binary=True, vectorizer__lowercase=False, vectorizer__max_df=0.95, vectorizer__max_features=1000, vectorizer__min_df=15, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english; total time= 1.1min
[CV] END vectorizer__analyzer=word, vectorizer__binary=True, vectorizer__lowercase=False, vectorizer__max_df=0.95, vectorizer__max_features=1000, vectorizer__min_df=15, vectorizer__ngram_range=(1, 2), vectorizer__stop_words=english; total time= 1.0min
[CV] END vectorizer__analyzer=word, vectorizer__binary=True, vectorizer__lowercase=False, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__min_df=15, vectorizer__ngram_range=(1, 1), vectorizer__stop_words=english; total time=

Not sure what I was doing below this cell but i dont wanna delete it

In [28]:
# pipeline: ['vectorizer', 'classifier']
# parameters:
# {'vectorizer__analyzer': ['word'],
#  'vectorizer__binary': [True, False],
#  'vectorizer__lowercase': [True, False],
#  'vectorizer__max_df': [0.9, 0.95, 1.0],
#  'vectorizer__max_features': [1000, 10000, 100000, None],
#  'vectorizer__min_df': [1, 5, 10, 15],
#  'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
#  'vectorizer__stop_words': ['english']}
# done in 761.731s
# ...
# 	vectorizer__max_features: None
# 	vectorizer__min_df: 10
# 	vectorizer__ngram_range: (1, 3)
# 	vectorizer__stop_words: 'english'

In [30]:
print('Best score: ', rand_search.best_score_)
print('Best params: ', rand_search.best_params_)
# Best score:  0.7657478048810253
# Best params:  {'vectorizer__stop_words': 'english', 
# 'vectorizer__ngram_range': (1, 3), 
# 'vectorizer__min_df': 10, 
# 'vectorizer__max_features': None, 
# 'vectorizer__max_df': 0.95, 
# 'vectorizer__lowercase': True, 
# 'vectorizer__binary': True, 
# 'vectorizer__analyzer': 'word'}

Best score:  0.7657478048810253
Best params:  {'vectorizer__stop_words': 'english', 'vectorizer__ngram_range': (1, 3), 'vectorizer__min_df': 10, 'vectorizer__max_features': None, 'vectorizer__max_df': 0.95, 'vectorizer__lowercase': True, 'vectorizer__binary': True, 'vectorizer__analyzer': 'word'}


In [33]:
# worse than default?
# my guess is the english stop words, let's try without them
classifier = MultinomialNB()
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    min_df=5,
    max_features=None,
    max_df=0.95,
    binary=True,
    stop_words=None,
    lowercase=True,
    analyzer='word',
)
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])
#pipeline.fit(df_train['tweet'], df_train['label'])
#kfold
scores = cross_val_score(pipeline, df_train['tweet'], df_train['label'], cv=5)
print("Scores: ", scores)
print("Mean: ", scores.mean())


Scores:  [0.80647969 0.80734208 0.80398406 0.80755564 0.8056051 ]
Mean:  0.8061933169127382


In [37]:
# 7min
# Scores:  [0.80647969 0.80734208 0.80398406 0.80755564 0.8056051 ]
# Mean:  0.8061933169127382

# seems highest so far. need to explore more with n-grams but strategy seems that
# more features and data is better

# next: try with tfidf specifically
# also note that in all tests I've done so far
# result in BETTER test score than train score. 
# Not sure the exact reason, gotta look into it

In [44]:

# fit, predict, score
df_train = load_train_data()
print(df_train.head())
#pipeline.fit(df_train['tweet'], df_train['label'])
df_test = load_test_data()
print(df_test.head())
print(pipeline.get_params())
predict_test_data_pipeline(df_test, pipeline, filename='data/out/submission-v2_2.csv')

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
                                               tweet  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2   <user> just put casper in a box !  looved the...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
  id                                              tweet
0  1  sea doo pro sea scooter ( sports with the port...
1  2  <user> shucks well i work all week so now i ca...
2  3            i cant stay away from bug thats my baby
3  4  <user> no ma'am ! ! ! lol im perfectly fine an...
4  5  whenever i fall asleep watching the tv , i alw...
{'memory': None, 'steps': [('vectorizer', CountVectorizer(binary=True, max_df=0.95, min_df=5, ngram_range=(1, 3))), ('classifier', MultinomialNB())], 'verbose': False, 'vectorizer': CountVectorizer(binary=True

Unnamed: 0,Id,tweet,Prediction
0,1,sea doo pro sea scooter ( sports with the port...,-1
1,2,<user> shucks well i work all week so now i ca...,1
2,3,i cant stay away from bug thats my baby,1
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...,1
4,5,"whenever i fall asleep watching the tv , i alw...",-1
...,...,...,...
9995,9996,had a nice time w / my friend lastnite,1
9996,9997,<user> no it's not ! please stop !,-1
9997,9998,not without my daughter ( dvd two-time oscar (...,-1
9998,9999,<user> have fun in class sweetcheeks,1


In [45]:
# acc: 0.799	f1: 0.819
# actually I shouldnt be surprised,
# same model, same params a part from binary that doesnt change anything and
# 0.95 max_df which indeed doesnt do much a part from reducing f1 slightly

In [46]:
# what seemed to be woring is increasing n-grams
classifier = MultinomialNB()
vectorizer = CountVectorizer(
    ngram_range=(1, 4),
    min_df=10,
    max_features=None,
    max_df=0.98,
    binary=True,
    stop_words=None,
    lowercase=True,
    analyzer='word',
)
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])
#pipeline.fit(df_train['tweet'], df_train['label'])
#kfold
scores = cross_val_score(pipeline, df_train['tweet'], df_train['label'], cv=5)
print("Scores: ", scores)
print("Mean: ", scores.mean())

Scores:  [0.80573934 0.80637596 0.80270269 0.80606884 0.80438881]
Mean:  0.8050551296732085


In [47]:
# 7min 50s, similar to 1-3 but we removed a lot of 5-10 occurences features
# Scores:  [0.80573934 0.80637596 0.80270269 0.80606884 0.80438881]
# Mean:  0.8050551296732085

In [7]:
# next: better stop words, twitter specific
# also no lowercase
# let's try with tfidf
classifier = MultinomialNB()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=5,
    max_features=None,
    max_df=0.98,
    binary=True,
    stop_words=None,
    lowercase=True,
    analyzer='word',
)
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])
#pipeline.fit(df_train['tweet'], df_train['label'])
# simple split, just to see if it works
from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(df_train['tweet'], df_train['label'], test_size=0.2)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_eval)
from sklearn import metrics
print(metrics.classification_report(y_eval, y_pred))

NameError: name 'X_test' is not defined

In [9]:
# 1min 30s with 4 grams! nice tfidf
# y_pred = pipeline.predict(X_eval)
# from sklearn import metrics
# print(metrics.classification_report(y_eval, y_pred))
#  precision    recall  f1-score   support

#            0       0.88      0.76      0.82    247847
#            1       0.79      0.89      0.84    243812

#     accuracy                           0.83    491659
#    macro avg       0.83      0.83      0.83    491659
# weighted avg       0.83      0.83      0.83    491659


In [11]:
# lets lower min_df to 3 and increase max_df to 1
classifier = MultinomialNB()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=3,
    max_features=None,
    max_df=1.0,
    binary=True,
    stop_words=None,
    lowercase=True,
    analyzer='word',
)
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])
from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(df_train['tweet'], df_train['label'], test_size=0.2)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_eval)
from sklearn import metrics
print(metrics.classification_report(y_eval, y_pred))
# 1min 40s
#               precision    recall  f1-score   support

#            0       0.88      0.76      0.82    247929
#            1       0.79      0.90      0.84    243730

#     accuracy                           0.83    491659
#    macro avg       0.84      0.83      0.83    491659
# weighted avg       0.84      0.83      0.83    491659



              precision    recall  f1-score   support

           0       0.88      0.76      0.82    247929
           1       0.79      0.90      0.84    243730

    accuracy                           0.83    491659
   macro avg       0.84      0.83      0.83    491659
weighted avg       0.84      0.83      0.83    491659



In [12]:
# fit, predict, score
df_train = load_train_data()
print(df_train.head())
pipeline.fit(df_train['tweet'], df_train['label'])
df_test = load_test_data()
print(df_test.head())
print(pipeline.get_params())
predict_test_data_pipeline(df_test, pipeline, filename='data/out/submission-v2_3.csv')

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
                                               tweet  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2   <user> just put casper in a box !  looved the...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
  id                                              tweet
0  1  sea doo pro sea scooter ( sports with the port...
1  2  <user> shucks well i work all week so now i ca...
2  3            i cant stay away from bug thats my baby
3  4  <user> no ma'am ! ! ! lol im perfectly fine an...
4  5  whenever i fall asleep watching the tv , i alw...
{'memory': None, 'steps': [('vectorizer', TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4))), ('classifier', MultinomialNB())], 'verbose': False, 'vectorizer': TfidfVectorizer(binary=True, min_df=3, n

Unnamed: 0,Id,tweet,Prediction
0,1,sea doo pro sea scooter ( sports with the port...,-1
1,2,<user> shucks well i work all week so now i ca...,1
2,3,i cant stay away from bug thats my baby,1
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...,1
4,5,"whenever i fall asleep watching the tv , i alw...",-1
...,...,...,...
9995,9996,had a nice time w / my friend lastnite,1
9996,9997,<user> no it's not ! please stop !,-1
9997,9998,not without my daughter ( dvd two-time oscar (...,-1
9998,9999,<user> have fun in class sweetcheeks,1


In [13]:
# acc: 0.822	f1: 0.834
# 1min51 from scratch
# best so far

In [19]:
# let's save the tfidf vectorizer as a pickle file so we can use it later without having to recompute it
# and we can iter more on classifier
# TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4))
# MultinomialNB()
print(pipeline.steps[0][1])
print(pipeline.steps[1][1])
import pickle
with open('data/out/trained/tfidf_vectorizer-multinomialNB-pipeline-v2_3.pickle', 'wb') as f:
    pickle.dump(pipeline, f)

TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4))
MultinomialNB()


In [20]:
# let's try with a different classifier
# SVM
from sklearn.svm import LinearSVC
classifier = LinearSVC()
# take the same vectorizer as before in the pipeline, take from the pickle file
with open('data/out/trained/tfidf_vectorizer-multinomialNB-pipeline-v2_3.pickle', 'rb') as f:
    pipeline = pickle.load(f)
pipeline.steps[1] = ('classifier', classifier)

# split to check if it works
from sklearn.model_selection import train_test_split
X_train, X_eval, y_train, y_eval = train_test_split(df_train['tweet'], df_train['label'], test_size=0.2)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_eval)
from sklearn import metrics
print(metrics.classification_report(y_eval, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.85      0.86    248065
           1       0.85      0.87      0.86    243594

    accuracy                           0.86    491659
   macro avg       0.86      0.86      0.86    491659
weighted avg       0.86      0.86      0.86    491659



In [21]:
#2min 20s
#               precision    recall  f1-score   support

#            0       0.87      0.85      0.86    248065
#            1       0.85      0.87      0.86    243594

#     accuracy                           0.86    491659
#    macro avg       0.86      0.86      0.86    491659
# weighted avg       0.86      0.86      0.86    491659


In [22]:
# fit, predict, score
df_train = load_train_data()
print(df_train.head())
pipeline.fit(df_train['tweet'], df_train['label'])
df_test = load_test_data()
print(df_test.head())
print(pipeline.get_params())
predict_test_data_pipeline(df_test, pipeline, filename='data/out/submission-v2_4.csv')

# save the pipeline
print(pipeline.steps[0][1])
print(pipeline.steps[1][1])
import pickle
with open('data/out/trained/tfidf_vectorizer-linSVC-pipeline-v2_4.pickle', 'wb') as f:
    pickle.dump(pipeline, f)

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)
                                               tweet  label
0  <user> i dunno justin read my mention or not ....      1
1  because your logic is so dumb , i won't even c...      1
2   <user> just put casper in a box !  looved the...      1
3  <user> <user> thanks sir > > don't trip lil ma...      1
4  visiting my brother tmr is the bestest birthda...      1
  id                                              tweet
0  1  sea doo pro sea scooter ( sports with the port...
1  2  <user> shucks well i work all week so now i ca...
2  3            i cant stay away from bug thats my baby
3  4  <user> no ma'am ! ! ! lol im perfectly fine an...
4  5  whenever i fall asleep watching the tv , i alw...
{'memory': None, 'steps': [('vectorizer', TfidfVectorizer(binary=True, min_df=3, ngram_range=(1, 4))), ('classifier', LinearSVC())], 'verbose': False, 'vectorizer': TfidfVectorizer(binary=True, min_df=3, ngram

In [None]:
# acc: 0.848	f1: 0.850
# 2min 45s from scratch
# best so far, nice
# next in v3: 
# more complex preprocessing with external data, vocab, etc (spaCy, gensim, nltk, etc)
# more complex classifier

In [None]:
# # csv with word next to its embedding of d = 200
# vocab_embeddings = pd.read_csv('data/glove/glove.twitter.27B.25d.txt', sep='\r', index_col=0, names=['line'], nrows=10000)
# vocab_embeddings['word'] = vocab_embeddings.index.str.split(' ', 1).str[0]
# vocab_embeddings['embedding'] = vocab_embeddings.index.str.split(' ', 1).str[1]
# vocab_embeddings['embedding'] = vocab_embeddings['embedding'].apply(lambda x: np.fromstring(x, dtype=float, sep=' '))
# vocab_embeddings = vocab_embeddings.reset_index(drop=True)
# vocab_embeddings.head(20)
# print(vocab_embeddings.head(-20))
# # Build vocabulary
# vocab = {}
# for i, word in enumerate(vocab_embeddings['word']):
#     vocab[word] = i 
# print('Vocabulary size: ', len(vocab))
# # Build embeddings matrix
# embeddings = np.zeros((len(vocab), vocab_embeddings['embedding'][0].shape[0]))
# for i, embedding in enumerate(vocab_embeddings['embedding']):
#     embeddings[i] = embedding
# print('Embeddings shape: ', embeddings.shape)
# # Build feature matrix
# X_train = build_feature_matrix(df_train, vocab, embeddings, mode='avg')
# print('Feature matrix shape: ', X_train.shape)
# # n-grams
# from sklearn.feature_extraction.text import CountVectorizer

# # Build vocabulary 
# vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
# vectorizer.fit(df_train['tweet'])


# # Build feature matrix



In [None]:
# X_train_full = build_feature_matrix(df_train, vocab, embeddings, mode='avg')
# y_train_full = df_train['label'].values

In [None]:
# print('X_train_full shape: ', X_train_full.shape)
# print('y_train_full shape: ', y_train_full.shape)
# print('Embeddings shape: ', embeddings.shape)

In [None]:
# # random forest classifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score

# # let's try a shallower structure 
# clf = RandomForestClassifier(
#     n_estimators=250, 
#     max_depth=5,
#     n_jobs=-1,
#     min_samples_split=15,
#     verbose=2
# )
# scores = cross_val_score(clf, X_train_full, y_train_full, cv=5)
# print('Cross validation scores: ', scores)
# print('Mean cross validation score: ', np.mean(scores))

In [None]:
# # Load test data: id, tweet for each row
# df_test = load_test_data()
# X_test = build_feature_matrix(df_test, vocab, embeddings, mode='avg')

# # pred
# predict_test_data(X_test, clf, filename='data/out/submission-v2.csv')