In [2]:
import numpy as np
import pandas as pd 
import re

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
import nltk

from nltk.corpus import words, brown
%load_ext autoreload
%autoreload 2
from classes import PreProcessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import cross_val_score
from nltk.stem import WordNetLemmatizer

In [6]:
train_neg = pd.read_fwf('../Data/train_neg.txt', header=None, names=['tweet'])
train_pos = pd.read_fwf('../Data/train_pos.txt', header=None, names=['tweet'])

In [7]:
test = pd.read_csv('../Data/test_data.txt', sep='\n', header=None, names=['tweet'])

In [8]:
print('train_neg', train_neg.shape)
print('train_pos', train_pos.shape)
print('test', test.shape)

train_neg (100000, 1)
train_pos (100000, 1)
test (10000, 1)


In [9]:
test['tweet-id'] = test.tweet.apply(lambda x: x.split(',')[0])
test['tweet'] = test.tweet.apply(lambda x: ' '.join(x.split(',')[1:]))

In [11]:
preprocessor = PreProcessing()

In [13]:
# Clean the tweets
train_pos_clean = train_pos.copy()
train_neg_clean = train_neg.copy()
train_pos_clean['tweet'] = train_pos_clean.tweet.apply(lambda x: preprocessor.clean(x))
train_neg_clean['tweet'] = train_neg_clean.tweet.apply(lambda x: preprocessor.clean(x))

In [14]:
# Save cleaned files
train_pos_clean.to_csv('../Data/train_pos_clean_full.txt', header=None, index=None, sep=',')
train_neg_clean.to_csv('../Data/train_neg_clean_full.txt', header=None, index=None, sep=',')

In [15]:
word_embeddings = np.load('../Data/embeddings.npy')
vocabulary = pd.read_fwf('../Data/vocab_cut.txt', header=None, names=['tweet'])

In [16]:
print(word_embeddings.shape)
print(vocabulary.shape)

(21161, 20)
(21161, 1)


In [17]:
# Store embedding as dictionary
embedding_dict = dict(zip(list(vocabulary.tweet.values), list(word_embeddings)))

In [18]:
# Convert sentences to list of words
train_pos_clean['tweet'] = train_pos_clean.tweet.apply(lambda x: x.split(' '))
train_neg_clean['tweet'] = train_neg_clean.tweet.apply(lambda x: x.split(' '))

In [19]:
# Save tweet's id
train_pos_clean['tweet-id'] = train_pos_clean.index
train_neg_clean['tweet-id'] = train_neg_clean.index

In [20]:
train_pos_clean = train_pos_clean.explode('tweet')
train_neg_clean = train_neg_clean.explode('tweet')

In [21]:
# Inner join with vocabulary to filter words. 
train_pos_final = pd.DataFrame(train_pos_clean.merge(vocabulary, how='inner').groupby('tweet-id')['tweet'].apply(list))
train_neg_final = pd.DataFrame(train_neg_clean.merge(vocabulary, how='inner').groupby('tweet-id')['tweet'].apply(list))

In [22]:
def mean_word_embedding(tweet):
    return np.mean([embedding_dict[word] for word in tweet], axis=0)

In [23]:
train_pos_final['mean_embedding'] = train_pos_final.tweet.apply(lambda x: mean_word_embedding(x))
train_neg_final['mean_embedding'] = train_neg_final.tweet.apply(lambda x: mean_word_embedding(x))

In [24]:
# Add sentiment 
train_neg_final['sentiment'] = 0
train_pos_final['sentiment'] = 1

In [25]:
train_data = pd.concat([train_pos_final[['mean_embedding', 'sentiment']], train_neg_final[['mean_embedding', 'sentiment']]], ignore_index=True)

In [26]:
np.stack(train_data.mean_embedding.to_numpy()).shape

(199985, 20)

In [27]:
X = train_data['mean_embedding']
y = train_data['sentiment']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
logistic = LogisticRegression(solver='lbfgs')

In [1]:
precision = cross_val_score(logistic, np.stack(X_train.to_numpy()), y_train, cv=5, scoring='precision')
recall = cross_val_score(logistic, np.stack(X_train.to_numpy()), y_train, cv=5, scoring='recall')
accuracy = cross_val_score(logistic, np.stack(X_train.to_numpy()), y_train, cv=5, scoring='accuracy')
# Precision: avoid false positives
print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
# Recall: avoid false negatives
print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy.mean(), accuracy.std() * 2))

In [None]:
logistic.fit(np.stack(X_train.to_numpy()), y_train)

# Using a pre-trained glove

In [3]:
import csv
words_embedding = pd.read_table('../Data/glove.twitter.27B.25d.txt', sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
words_embedding = words_embedding[~words_embedding.index.isna()]

In [4]:
words_embedding.loc['cat'].values.shape

(25,)

In [5]:
train_neg = pd.read_fwf('../Data/train_neg.txt', header=None, names=['tweet'])
train_pos = pd.read_fwf('../Data/train_pos.txt', header=None, names=['tweet'])
test = pd.read_csv('../Data/test_data.txt', sep='\n', header=None, names=['tweet'])
test['tweet-id'] = test.tweet.apply(lambda x: x.split(',')[0])
test['tweet'] = test.tweet.apply(lambda x: ' '.join(x.split(',')[1:]))
test = test.set_index('tweet-id')

In [6]:
# def clean(tweet):
#     tweet = tweet.replace('<user>', '')
#     tweet = tweet.replace('<url>', '')
#     tweet = ''.join(' <number> ' if c.isnumeric() else c for c in tweet)
#     tweet = ''.join(c if (c.isalpha() or c.isspace() or c in ['<', '>']) else ' ' + c + ' ' for c in tweet)
#     tweet = ' '.join(tweet.split())
#     tweet = list(filter(lambda x: x not in stop_words, tweet))
#     return tweet

In [7]:
# preprocessor.clean("#fuckmylife aaanddd   i'd")

In [8]:
# single_char = list(filter(lambda x: len(x) == 1, words_embedding.index.to_list()))

In [9]:
preprocessor = PreProcessing()

In [10]:
# Clean the tweets
train_pos_clean = train_pos.copy()
train_neg_clean = train_neg.copy()
test_clean = test.copy()
train_pos_clean['tweet'] = train_pos_clean.tweet.apply(lambda x: preprocessor.clean(x))
train_neg_clean['tweet'] = train_neg_clean.tweet.apply(lambda x: preprocessor.clean(x))
test_clean['tweet'] = test_clean.tweet.apply(lambda x: preprocessor.clean(x))

In [11]:
vocabulary = pd.concat([train_neg_clean['tweet'].apply(lambda x: x.split(' ')).explode(), 
                        train_pos_clean['tweet'].apply(lambda x: x.split(' ')).explode(),
                        test_clean['tweet'].apply(lambda x: x.split(' ')).explode()])

In [12]:
words_embedding.merge(vocabulary, how='inner', left_on=words_embedding.index, right_on='tweet')

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,17,18,19,20,21,22,23,24,25,tweet
0,0.62415,0.62476,-0.082335,0.201010,-0.13741,-0.11431,0.779090,2.6356,-0.46351,0.57465,...,-0.017336,-0.86349,-1.33480,0.046811,0.36999,-0.57663,-0.484690,0.400780,0.75345,<user>
1,0.62415,0.62476,-0.082335,0.201010,-0.13741,-0.11431,0.779090,2.6356,-0.46351,0.57465,...,-0.017336,-0.86349,-1.33480,0.046811,0.36999,-0.57663,-0.484690,0.400780,0.75345,<user>
2,0.62415,0.62476,-0.082335,0.201010,-0.13741,-0.11431,0.779090,2.6356,-0.46351,0.57465,...,-0.017336,-0.86349,-1.33480,0.046811,0.36999,-0.57663,-0.484690,0.400780,0.75345,<user>
3,0.62415,0.62476,-0.082335,0.201010,-0.13741,-0.11431,0.779090,2.6356,-0.46351,0.57465,...,-0.017336,-0.86349,-1.33480,0.046811,0.36999,-0.57663,-0.484690,0.400780,0.75345,<user>
4,0.62415,0.62476,-0.082335,0.201010,-0.13741,-0.11431,0.779090,2.6356,-0.46351,0.57465,...,-0.017336,-0.86349,-1.33480,0.046811,0.36999,-0.57663,-0.484690,0.400780,0.75345,<user>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3705464,-1.82700,0.65208,0.559010,-0.865550,-0.15098,-0.81923,-0.879400,-1.4516,1.07420,-0.58916,...,1.159600,-0.34825,1.20280,0.721290,-0.52420,-0.49049,0.677370,-0.466360,0.67608,tropix
3705465,-0.51433,0.76584,-0.030068,0.109210,0.39225,-1.03610,-1.090300,-1.3018,0.40247,-0.56274,...,0.711060,-0.45192,0.62978,0.700950,-0.69766,0.42155,-0.193360,-0.085761,-1.02550,tuperware
3705466,-0.51433,0.76584,-0.030068,0.109210,0.39225,-1.03610,-1.090300,-1.3018,0.40247,-0.56274,...,0.711060,-0.45192,0.62978,0.700950,-0.69766,0.42155,-0.193360,-0.085761,-1.02550,tuperware
3705467,-0.51433,0.76584,-0.030068,0.109210,0.39225,-1.03610,-1.090300,-1.3018,0.40247,-0.56274,...,0.711060,-0.45192,0.62978,0.700950,-0.69766,0.42155,-0.193360,-0.085761,-1.02550,tuperware


In [13]:
vocabulary = pd.DataFrame(vocabulary).drop_duplicates()

In [14]:
words_vocab = words_embedding.merge(vocabulary, how='inner', left_on=words_embedding.index, right_on='tweet')
words_vocab = words_vocab.set_index('tweet')

In [15]:
words_vocab.shape

(58662, 25)

In [16]:
unmatched_words = vocabulary[vocabulary.tweet.apply(lambda x: x not in words_vocab.index)]

In [17]:
unmatched_words.applymap(lambda x: splitter.split(x))

NameError: ("name 'splitter' is not defined", 'occurred at index tweet')

In [18]:
words_vocab_dict = dict(zip(words_vocab.index, words_vocab[words_vocab.columns].values))

In [19]:
def mean_word_embedding(tweet, dictionary):
    mean_embedding = np.mean([dictionary[word] for word in tweet.split(' ') if word in words_embedding.index], axis=0)
    return mean_embedding

In [20]:
def sentence_embedding(tweet, dictionary):
    tweet = list(filter(lambda word: word in words_embedding.index, tweet.split(' ')))
    t = np.asarray(list(map(lambda word: dictionary[word], tweet)))
    return t

In [21]:
# train_pos_clean['words_count'] = train_pos_clean.tweet.apply(lambda x: len(x.split(' ')))
# train_neg_clean['words_count'] = train_neg_clean.tweet.apply(lambda x: len(x.split(' ')))

In [22]:
# train_neg_clean[train_neg_clean.words_count == train_neg_clean['words_count'].max()]

In [23]:
# Use mean_word_embedding
train_pos_clean['embedding'] = train_pos_clean['tweet'].apply(lambda x: mean_word_embedding(x, words_vocab_dict))
train_neg_clean['embedding'] = train_neg_clean['tweet'].apply(lambda x: mean_word_embedding(x, words_vocab_dict))

In [24]:
train_pos_clean['embedding'] = train_pos_clean['tweet'].apply(lambda x: sentence_embedding(x, words_vocab_dict))
train_neg_clean['embedding'] = train_neg_clean['tweet'].apply(lambda x: sentence_embedding(x, words_vocab_dict))

In [25]:
train_pos_clean['sentiment'] = 1
train_neg_clean['sentiment'] = 0

In [50]:
train_data = pd.concat([train_pos_clean[['embedding', 'sentiment']], train_neg_clean[['embedding', 'sentiment']]], ignore_index=True)

In [51]:
# train_data[train_data['sentiment'] == 1].embedding.mean()

In [89]:
# train_data[train_data['embedding'].apply(lambda x: np.logical_or.reduce(np.isnan(x)))]

TypeError: unhashable type: 'numpy.ndarray'

In [53]:
# train_data[train_data.sentiment == 1] = train_data[train_data.sentiment == 1].fillna(train_data[train_data.sentiment == 1].embedding.mean())
# train_data[train_data.sentiment == 0] = train_data[train_data.sentiment == 0].fillna(train_data[train_data.sentiment == 0].embedding.mean())

In [54]:
train_data = train_data.dropna()

In [55]:
max_words = train_data.embedding.apply(lambda x: x.shape[0]).max()
embedding_dim = train_data.embedding.iloc[0].shape[1]
train_data['padded_embedding'] = train_data.embedding.apply(lambda x: np.pad(x, ((0, max_words - x.shape[0]), (0, 0)), mode='constant'))

In [56]:
# train_data[['embedding', 'sentiment']].to_csv('../Data/train_data.csv')

In [57]:
# train_data[['padded_embedding', 'sentiment']].to_csv

In [58]:
# train_data[['embedding', 'sentiment']].to_hdf('../Data/train_data.h5', key='train_data', mode='w')

In [59]:
# t = pd.read_hdf('../Data/train_data.h5')

In [63]:
X = train_data['padded_embedding']
X = np.stack(X.to_numpy())
y = train_data['sentiment']

In [64]:
np.save('../Data/features', X)
np.save('../Data/labels', y)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=0)

In [88]:
X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
nsamples, nx, ny = X_train.shape
d2_X_train = X_train.reshape((nsamples, nx*ny))

AttributeError: 'numpy.ndarray' object has no attribute 'isin'

In [86]:
logistic = LogisticRegression(solver='lbfgs', max_iter=2000)
precision = cross_val_score(logistic, d2_X_train, y_train, cv=5, scoring='precision')
recall = cross_val_score(logistic, d2_X_train, y_train, cv=5, scoring='recall')
accuracy = cross_val_score(logistic, d2_X_train, y_train, cv=5, scoring='accuracy')
# Precision: avoid false positives
print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
# Recall: avoid false negatives
print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
print("Accuracy: %0.2f (+/- %0.2f)" % (accuracy.mean(), accuracy.std() * 2))



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Grid Search
# Parameter Grid
param_grid = {'C': [0.1, 1], 'gamma': [0.01, 0.001]}
 
# Make grid search classifier
clf_grid = GridSearchCV(svm.SVC(), param_grid, verbose=1, cv=2)
 
# Train the classifier
clf_grid.fit(X_train, y_train)
 
# clf = grid.best_estimator_()
print("Best Parameters:\n", clf_grid.best_params_)
print("Best Estimators:\n", clf_grid.best_estimator_)

In [None]:
logistic.fit(np.stack(X_train.to_numpy()), y_train)

In [None]:
logistic.score