# Exploration Notebook v1 - Jacopo

**Version**: v1

Next (v2+):
- Better embeddings, GloVe, n-grams, TF-IDF, text preprocessing, etc.
- Include external datasets, use open source models, glove embeddings, etc.
- Smileys = sentiment analysis, expand problem to sentiment analysis, etc.


**Goal**: improve baseline model, feature engineering

**Folder Structure**: 
- `data/` - all data 
- `data/twitter-datasets/` - provided data
- `data/out/` - output data, predictions
- `data/glove_custom/` - tweaked version of the provided GloVe code used to generate the embeddings, plus the generated vocabulary and embeddings

In [17]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from helpers_Jacopo import *
import pickle


In [18]:
def build_feature_matrix(df, vocab, embeddings, mode='avg'):
    """Pool the word embeddings of every tweet into one feature row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tweet' column of strings; tweets are tokenized on
        whitespace.
    vocab : mapping
        Maps a word to its row index in `embeddings`.
    embeddings : numpy.ndarray
        2-D array whose row `vocab[word]` is the vector of `word`.
    mode : {'avg', 'sum'}
        'sum' adds the vectors of all in-vocabulary tokens.
        'avg' divides that sum by the total number of tokens in the tweet
        (out-of-vocabulary tokens are counted in the denominator).

    Returns
    -------
    numpy.ndarray
        Array of shape (number of rows in df, embeddings.shape[1]).
        Tweets without any token yield an all-zero row.

    Raises
    ------
    ValueError
        If `mode` is neither 'avg' nor 'sum'.
    """
    # Validate once, up front, so a bad mode is reported even for an empty frame.
    if mode not in ('avg', 'sum'):
        raise ValueError('Unknown mode: {}'.format(mode))

    X = np.zeros((df.shape[0], embeddings.shape[1]))
    for i, tweet in enumerate(df['tweet']):
        words = tweet.split()
        for word in words:
            if word in vocab:
                X[i] += embeddings[vocab[word]]
        # An empty tweet has no tokens: skip the division instead of computing
        # 0 / 0, which would fill the row with NaN and break downstream models.
        if mode == 'avg' and words:
            X[i] /= len(words)
    return X
    
def load_test_data():
    """Read the provided test file into a frame with 'id' and 'tweet' columns.

    Every line of the file has the form ``<id>,<tweet>``. Only the first comma
    is used as a separator, so commas inside the tweet text are preserved.
    """
    test_file = 'data/twitter-datasets/' + 'test_data.txt'
    raw = pd.read_csv(test_file, header=None, names=['line'], sep='\t')
    # Split each raw line once, at the first comma only: [id, rest-of-line]
    pieces = raw['line'].apply(lambda text: text.split(',', 1))
    raw['id'] = pieces.apply(lambda pair: pair[0])
    raw['tweet'] = pieces.apply(lambda pair: pair[1])
    return raw.drop('line', axis=1)
def predict_test_data(X_test, classifier, filename='submission.csv', test_df=None):
    """Predict labels for the test set and write them to a submission CSV.

    Parameters
    ----------
    X_test : array-like
        Feature matrix passed to `classifier.predict`, one row per test tweet.
    classifier : object
        Fitted model exposing a `predict` method.
    filename : str
        Path of the CSV file to write (columns 'Id' and 'Prediction').
    test_df : pandas.DataFrame, optional
        Frame holding the test ids in an 'id' (or already renamed 'Id') column.
        When omitted, the notebook-level `df_test` is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place: 'id' renamed to 'Id' and a
        'Prediction' column added, with label 0 mapped to -1.
    """
    if test_df is None:
        # Backward-compatible fallback on the global frame created by the notebook.
        test_df = df_test
    y_pred = classifier.predict(X_test)
    # The submission format uses -1 for the class that is encoded as 0 in training.
    test_df['Prediction'] = [-1 if label == 0 else label for label in y_pred]
    test_df.rename(columns={'id': 'Id'}, inplace=True)
    test_df.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return test_df

In [19]:
# Load the training tweets (plain text, one tweet per line) as single-column frames
data_path = 'data/twitter-datasets/'
df_train_pos = pd.read_csv(data_path + 'train_pos.txt', sep = '\t', names = ['tweet'])
df_train_pos['label'] = 1
df_train_neg = pd.read_csv(data_path + 'train_neg.txt', sep = '\t', names = ['tweet'])
df_train_neg['label'] = 0
df_train = pd.concat([df_train_pos, df_train_neg])
print('Train set: ', df_train.shape)
print('Train set positives: ', df_train_pos.shape)
print('Train set negatives: ', df_train_neg.shape)

# Word -> row-index vocabulary and the matching embedding matrix.
# The pickle is opened in a `with` block so the file handle is closed right away.
# Only unpickle files you generated yourself: unpickling can execute arbitrary code.
with open('data/glove_custom/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
embeddings = np.load('data/glove_custom/embeddings.npy')

# One averaged-embedding row per tweet, plus the 0/1 labels
X_train_full = build_feature_matrix(df_train, vocab, embeddings, mode='avg')
y_train_full = df_train['label'].values

Train set:  (196970, 2)
Train set positives:  (97902, 2)
Train set negatives:  (99068, 2)


In [22]:
# Random forest evaluated with k-fold cross-validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Fairly deep forest; n_jobs=-1 uses all cores, which is much faster.
# random_state is fixed so the CV scores and the model fitted afterwards are
# reproducible from one run to the next (an unseeded forest differs every time).
clf = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=30,
    max_features='sqrt',
    max_depth=45,
    criterion='gini',
    n_jobs=-1,
    random_state=42,
)

# 3-fold cross-validation on the training features
scores = cross_val_score(clf, X_train_full, y_train_full, cv=3, n_jobs=-1)
print('Scores: ', scores)
print('Mean score: ', np.mean(scores))

Scores:  [0.63530164 0.62384818 0.61782625]
Mean score:  0.6256586899911453


In [23]:
# Fit the forest on the complete training matrix; the fitted `clf` is what the
# next cell uses to predict the test set.
clf.fit(X_train_full, y_train_full)

In [24]:
# Test set: one (id, tweet) pair per row
df_test = load_test_data()
# Same averaged-embedding features as the ones used for training
X_test = build_feature_matrix(df_test, vocab, embeddings, mode='avg')
# Write the submission file; the returned frame is shown as the cell output
predict_test_data(X_test, classifier=clf, filename='data/out/submission-v1.csv')

Unnamed: 0,Id,tweet,Prediction
0,1,sea doo pro sea scooter ( sports with the port...,-1
1,2,<user> shucks well i work all week so now i ca...,1
2,3,i cant stay away from bug thats my baby,1
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...,-1
4,5,"whenever i fall asleep watching the tv , i alw...",-1
...,...,...,...
9995,9996,had a nice time w / my friend lastnite,1
9996,9997,<user> no it's not ! please stop !,1
9997,9998,not without my daughter ( dvd two-time oscar (...,-1
9998,9999,<user> have fun in class sweetcheeks,-1


**Result v1.0**:

**Accuracy**: 0.639

**F1**: 0.635

In [28]:
# I just realized I'm using the 10% datasets...
# Load the full training data instead
df_train_pos = pd.read_csv(data_path + 'train_pos_full.txt', sep = '\t', names = ['tweet'])
df_train_pos['label'] = 1

# A plain read gives: Error tokenizing data. C error: Expected 1 fields in line 760409, saw 2
# (presumably raised by this file, since it is the one read with on_bad_lines='skip').
# NOTE: on_bad_lines='skip' silently drops the offending lines.
df_train_neg = pd.read_csv(data_path + 'train_neg_full.txt', sep = '\t', names = ['tweet'], on_bad_lines='skip')
df_train_neg['label'] = 0

df_train = pd.concat([df_train_pos, df_train_neg])
print('Train set: ', df_train.shape)
print('Train set positives: ', df_train_pos.shape)
print('Train set negatives: ', df_train_neg.shape)

# Reload vocabulary and embeddings so this cell can be run on its own.
# The pickle is opened in a `with` block so the file handle is closed right away.
# Only unpickle files you generated yourself: unpickling can execute arbitrary code.
with open('data/glove_custom/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
embeddings = np.load('data/glove_custom/embeddings.npy')

# Rebuild the averaged-embedding features and labels on the full data
X_train_full = build_feature_matrix(df_train, vocab, embeddings, mode='avg')
y_train_full = df_train['label'].values

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


In [34]:
# Random forest with k-fold cross-validation, now on the full dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random_state is fixed so that re-running the (expensive) evaluation gives the
# same scores; an unseeded forest differs from run to run.
clf = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=30,
    max_features='sqrt',
    max_depth=45,
    criterion='gini',
    n_jobs=-1,
    random_state=42,
)

# k-fold: left commented out because one run takes about 30 minutes
#scores = cross_val_score(clf, X_train_full, y_train_full, cv=3, n_jobs=-1)
# 30min pain

In [35]:
# NOTE(review): `scores` is only refreshed when the cross_val_score call in the
# previous cell is uncommented and run; otherwise this prints whatever an earlier
# cell left in `scores`.
print('Scores: ', scores)
print('Mean score: ', np.mean(scores))
# I expected more from the full dataset...
# Recorded output of the full-data run:
# Scores:  [0.65654014 0.65856593 0.66088542]
# Mean score:  0.6586638309159757

Scores:  [0.65654014 0.65856593 0.66088542]
Mean score:  0.6586638309159757


In [38]:
# Shallow variant: more trees, but depth capped at 5.
# random_state is fixed so a re-fit reproduces the same model.
clf = RandomForestClassifier(
    n_estimators=250,
    min_samples_split=25,
    max_features='sqrt',
    max_depth=5,
    criterion='gini',
    n_jobs=-1,
    random_state=42,
)
# Fit left commented out because it takes more than 8 minutes
#clf.fit(X_train_full, y_train_full)
#8+min pain

In [41]:
#predict_test_data(X_test, clf, filename= 'data/out/submission-v1_1.csv')
# Accuracy: 0.598
# F1: 0.600
# big sad: the shallow forest (max_depth=5) scores worse than the deep one

Unnamed: 0,Id,tweet,Prediction
0,1,sea doo pro sea scooter ( sports with the port...,-1
1,2,<user> shucks well i work all week so now i ca...,1
2,3,i cant stay away from bug thats my baby,1
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...,-1
4,5,"whenever i fall asleep watching the tv , i alw...",-1
...,...,...,...
9995,9996,had a nice time w / my friend lastnite,1
9996,9997,<user> no it's not ! please stop !,1
9997,9998,not without my daughter ( dvd two-time oscar (...,-1
9998,9999,<user> have fun in class sweetcheeks,-1


In [43]:
# let's try a single tree
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed: even a single decision tree permutes the features at
# each split, so an unseeded tree is not guaranteed to be reproducible.
clf = DecisionTreeClassifier(random_state=42)

# 5-fold CV left commented out (12min 35s per run); recorded results below
#scores = cross_val_score(clf, X_train_full, y_train_full, cv=5, n_jobs=-1)
#print('Scores: ', scores)
#print('Mean score: ', np.mean(scores))
# 12min 35s
#Scores:  [0.57297843 0.56829225 0.56839802 0.57466862 0.5719879 ]
#Mean score:  0.5712650434549149

Scores:  [0.57297843 0.56829225 0.56839802 0.57466862 0.5719879 ]
Mean score:  0.5712650434549149


In [46]:
# so they are getting worse...
# Regularized tree: depth capped at 20, at least 25 samples needed to split.
# random_state is fixed so a re-run reproduces the same tree.
clf = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=25,
    random_state=42,
)
# 5-fold CV left commented out (about 11 minutes per run); recorded results below
# scores = cross_val_score(clf, X_train_full, y_train_full, cv=5, n_jobs=-1)
# print('Scores: ', scores)
# print('Mean score: ', np.mean(scores))
# 11min of pain
# Scores:  [0.58725865 0.58265587 0.5845759  0.59181872 0.5875312 ]
# Mean score:  0.5867680648579604

In [47]:
# Very shallow tree: depth capped at 5, at least 25 samples needed to split.
# random_state is fixed so a re-run reproduces the same tree.
clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=25,
    random_state=42,
)
# 5-fold CV left commented out (about 5 minutes per run); recorded results below
# scores = cross_val_score(clf, X_train_full, y_train_full, cv=5, n_jobs=-1)
# print('Scores: ', scores)
# print('Mean score: ', np.mean(scores))
#5min
# Scores:  [0.56871327 0.56940278 0.56790174 0.57007194 0.56620747]
# Mean score:  0.5684594403844941

Scores:  [0.56871327 0.56940278 0.56790174 0.57007194 0.56620747]
Mean score:  0.5684594403844941
