# Exploration Notebook v2 - Jacopo

**Version**: v2

## I just Realized I've been using the 10% datasets the whole time...

Use glove.twitter.27B as embeddings and/or better preprocessing 

In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
def build_feature_matrix(df, vocab, embeddings, mode='avg'):
    X = np.zeros((df.shape[0], embeddings.shape[1]))
    for i, tweet in enumerate(df['tweet']):
        words = tweet.split()
        for word in words:
            if word in vocab:
                X[i] += embeddings[vocab[word]]
        if mode == 'avg':
            X[i] /= len(words)
        elif mode == 'sum':
            pass
        else:
            raise ValueError('Unknown mode: {}'.format(mode))
    return X
def load_train_data(path_pos='data/twitter-datasets/train_pos_full.txt', path_neg='data/twitter-datasets/train_neg_full.txt'):
    # Load data, txt as csv
    #data_path = 'data/twitter-datasets/'
    df_train_pos = pd.read_csv(path_pos, sep = '\t', names = ['tweet'])
    df_train_pos['label'] = 1
    df_train_neg = pd.read_csv(path_neg, sep = '\t', names = ['tweet'])
    df_train_neg['label'] = 0
    df_train = pd.concat([df_train_pos, df_train_neg])
    print('Train set: ', df_train.shape)
    print('Train set positives: ', df_train_pos.shape)
    print('Train set negatives: ', df_train_neg.shape)
    return df_train   
def load_test_data():
    # Load test data: id, tweet for each row
    data_path = 'data/twitter-datasets/'
    df_test = pd.read_csv(data_path + 'test_data.txt', header=None, names=['line'], sep='\t')
    # Extract id and tweet, limit split by 1 so we don't split the tweet (this is v0, at least we keep it intact)
    df_test['id'] = df_test['line'].apply(lambda x: x.split(',',1)[0]) 
    df_test['tweet'] = df_test['line'].apply(lambda x: x.split(',',1)[1])
    df_test = df_test.drop('line', axis=1)
    return df_test
def predict_test_data(X_test, classifier, filename='submission.csv'):
    # Predict test data and save to csv
    y_pred = classifier.predict(X_test)
    df_test['Prediction'] = y_pred
    df_test.rename(columns={'id': 'Id'}, inplace=True)
    df_test['Prediction'] = df_test['Prediction'].apply(lambda x: -1 if x == 0 else x)
    df_test.to_csv(filename, columns=['Id', 'Prediction'], index=False)
    return df_test

In [3]:
# Load data, txt as csv
data_path = 'data/twitter-datasets/'
df_train_pos = pd.read_csv(data_path + 'train_pos_full.txt', sep = '\t', names = ['tweet'])
df_train_pos['label'] = 1
df_train_neg = pd.read_csv(data_path + 'train_neg_full.txt', sep = '\t', names = ['tweet'], on_bad_lines='skip')
df_train_neg['label'] = 0
df_train = pd.concat([df_train_pos, df_train_neg])
print('Train set: ', df_train.shape)
print('Train set positives: ', df_train_pos.shape)
print('Train set negatives: ', df_train_neg.shape)

Train set:  (2458295, 2)
Train set positives:  (1218655, 2)
Train set negatives:  (1239640, 2)


In [4]:
# csv with word next to its embedding of d = 200
vocab_embeddings = pd.read_csv('data/glove/glove.twitter.27B.25d.txt', sep='\r', index_col=0, names=['line'], nrows=10000)
vocab_embeddings['word'] = vocab_embeddings.index.str.split(' ', 1).str[0]
vocab_embeddings.head(20)
print(vocab_embeddings.head(20))

                                                            word
line                                                            
<user> 0.62415 0.62476 -0.082335 0.20101 -0.137...        <user>
. 0.69586 -1.1469 -0.41797 -0.022311 -0.023801 ...             .
: 1.1242 0.054519 -0.037362 0.10046 0.11923 -0....             :
rt 0.74056 0.9155 -0.16352 0.35843 0.05266 0.14...            rt
, 0.84705 -1.0349 -0.050419 0.27164 -0.58659 0....             ,
<repeat> 0.67867 -0.74651 -0.31831 -0.093681 0....      <repeat>
<hashtag> 0.18227 -0.29194 -1.3632 -1.201 0.084...     <hashtag>
<number> 1.3956 0.2892 0.48572 -1.1412 0.21461 ...      <number>
<url> 0.80384 -1.0366 -0.53877 -1.0806 0.84718 ...         <url>
! 0.4049 -0.87651 -0.23362 -0.34844 -0.097002 0...             !
i -0.26079 0.59108 0.61622 -0.70368 -0.85159 -0...             i
a 0.21294 0.31035 0.17694 0.87498 0.067926 0.59...             a
 1.0822 -0.59378 -0.19992 0.66626 0.18051 0.014...              
stressfree -1.399 0.8163 

  vocab_embeddings['word'] = vocab_embeddings.index.str.split(' ', 1).str[0]


In [None]:
X_train_full = build_feature_matrix(df_train, vocab, embeddings, mode='avg')
y_train_full = df_train['label'].values

In [None]:
print('X_train_full shape: ', X_train_full.shape)
print('y_train_full shape: ', y_train_full.shape)
print('Embeddings shape: ', embeddings.shape)

In [None]:
# random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# let's try a shallower structure 
clf = RandomForestClassifier(
    n_estimators=250, 
    max_depth=5,
    n_jobs=-1,
    min_samples_split=15,
    verbose=2
)
scores = cross_val_score(clf, X_train_full, y_train_full, cv=5)
print('Cross validation scores: ', scores)
print('Mean cross validation score: ', np.mean(scores))

In [None]:
# Load test data: id, tweet for each row
df_test = load_test_data()
X_test = build_feature_matrix(df_test, vocab, embeddings, mode='avg')

# pred
predict_test_data(X_test, clf, filename='data/out/submission-v2.csv')