In [1]:
pip install fasttext

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import fasttext
import bz2
import csv
from sklearn.metrics import roc_auc_score
import os

In [4]:
DIR='/Users/anastasiamoiseva/Downloads'

In [135]:
data = bz2.BZ2File(DIR+"/amazonreviews/train.ft.txt.bz2")
data = data.readlines()
data = [x.decode('utf-8') for x in data]
print(len(data))



3600000


In [6]:
data = pd.DataFrame(data)
data.to_csv("train.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [7]:
model = fasttext.train_supervised('train.txt',label_prefix='__label__', thread=4, epoch = 10)
print(model.labels, 'are the labels or targets the model is predicting')

['__label__1', '__label__2'] are the labels or targets the model is predicting


In [9]:
test = bz2.BZ2File(DIR+"/amazonreviews/test.ft.txt.bz2")
test = test.readlines()
test = [x.decode('utf-8') for x in test]
print(len(test), 'number of records in the test set') 

400000 number of records in the test set


In [10]:
new = [w.replace('__label__2 ', '') for w in test]
new = [w.replace('__label__1 ', '') for w in new]
new = [w.replace('\n', '') for w in new]

In [226]:
data_test = pd.DataFrame(new)

In [228]:
data_test.columns = ['Message']

In [229]:
data_test.head()

Unnamed: 0,Message
0,Great CD: My lovely Pat has one of the GREAT v...
1,One of the best game music soundtracks - for a...
2,Batteries died within a year ...: I bought thi...
3,"works fine, but Maha Energy is better: Check o..."
4,Great for the non-audiophile: Reviewed quite a...


In [14]:
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test]
pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]

In [11]:
pred = model.predict(new)

In [12]:
print(pred[0][0], 'is the predicted label')
print(pred[0][1], 'is the probability score')

['__label__2'] is the predicted label
['__label__2'] is the probability score


In [15]:
print(roc_auc_score(labels, pred_labels))

0.91719


transforming labels into 0s ans 1s, 
0 - label_1 and 1 - label_2

In [136]:


train_labels  = [0 if x.split(' ')[0] == '__label__1' else 1 for x in data]

In [137]:
new_train = [w.replace('__label__2 ', '') for w in data]
new_train = [w.replace('__label__1 ', '') for w in new]
new_train = [w.replace('\n', '') for w in new]

In [151]:
labels_tr = pd.DataFrame(train_labels)

In [160]:
data_train = pd.DataFrame(new_train, dtype=str)
data_train[:4]

Unnamed: 0,0
0,Great CD: My lovely Pat has one of the GREAT v...
1,One of the best game music soundtracks - for a...
2,Batteries died within a year ...: I bought thi...
3,"works fine, but Maha Energy is better: Check o..."


In [186]:
data_train.columns = ['Message', 'Labels']

In [195]:
data_train.tail()


Unnamed: 0,Message,Labels
399995,Unbelievable- In a Bad Way: We bought this Tho...,1
399996,"Almost Great, Until it Broke...: My son reciev...",0
399997,Disappointed !!!: I bought this toy for my son...,1
399998,Classic Jessica Mitford: This is a compilation...,0
399999,"Comedy Scene, and Not Heard: This DVD will be ...",1


In [194]:
data_train.shape

(400000, 2)

In [177]:
data_train['Labels'] = labels_tr

In [178]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix

In [198]:
bow_transformer = CountVectorizer(lowercase=False).fit(data_train['Message'])
print(len(bow_transformer.vocabulary_))

340716


In [199]:
amazon_bow = bow_transformer.transform(data_train['Message'])

In [200]:
data_train.shape


(400000, 2)

In [201]:
print('Shape of Sparse Matrix: ',amazon_bow.shape)
print('Amount of non-zero occurences:',amazon_bow.nnz)

Shape of Sparse Matrix:  (400000, 340716)
Amount of non-zero occurences: 22760250


In [202]:
sparsity =(100.0 *amazon_bow.nnz/(amazon_bow.shape['Message']*amazon_bow.shape[1]))
print('sparsity:{}'.format(round(sparsity)))

TypeError: tuple indices must be integers or slices, not str

In [203]:
message4=data_train['Message'][3]
print(message4)

works fine, but Maha Energy is better: Check out Maha Energy's website. Their Powerex MH-C204F charger works in 100 minutes for rapid charge, with option for slower charge (better for batteries). And they have 2200 mAh batteries.


In [204]:
bow4=bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

  (0, 479)	1
  (0, 3661)	1
  (0, 16499)	1
  (0, 31038)	1
  (0, 37478)	1
  (0, 56961)	2
  (0, 98470)	1
  (0, 100511)	2
  (0, 124559)	1
  (0, 157555)	1
  (0, 188972)	2
  (0, 190640)	2
  (0, 195657)	1
  (0, 199525)	2
  (0, 199533)	1
  (0, 230567)	1
  (0, 232401)	3
  (0, 241405)	1
  (0, 247481)	1
  (0, 252126)	1
  (0, 261087)	1
  (0, 266172)	1
  (0, 275856)	1
  (0, 276807)	1
  (0, 291535)	1
  (0, 307782)	1
  (0, 321364)	1
  (0, 335358)	1
  (0, 337360)	1
  (0, 338249)	2
(1, 340716)


In [205]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer().fit(amazon_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 338249)	0.1705579646334395
  (0, 337360)	0.03976565582701023
  (0, 335358)	0.12334808179757155
  (0, 321364)	0.057069492687200525
  (0, 307782)	0.15681912609077459
  (0, 291535)	0.17915094760195382
  (0, 276807)	0.05489020546153735
  (0, 275856)	0.1293923729376425
  (0, 266172)	0.09854872705107565
  (0, 261087)	0.21223183120828037
  (0, 252126)	0.02933826102716621
  (0, 247481)	0.03408429495220594
  (0, 241405)	0.04163513783867195
  (0, 232401)	0.1024374830521529
  (0, 230567)	0.09410370707694916
  (0, 199533)	0.13824837119640393
  (0, 199525)	0.2416619093286253
  (0, 195657)	0.04037133767437156
  (0, 190640)	0.1370243478657787
  (0, 188972)	0.23812091241481786
  (0, 157555)	0.13090757275492307
  (0, 124559)	0.24114221884528292
  (0, 100511)	0.4692905183070882
  (0, 98470)	0.21417630959208234
  (0, 56961)	0.37882071606953455
  (0, 37478)	0.1411995194245566
  (0, 31038)	0.26942481598567664
  (0, 16499)	0.08429141354688773
  (0, 3661)	0.21045666152495238
  (0, 479)	0.11451825536394

In [206]:
amazon_tfidf=tfidf_transformer.transform(amazon_bow)
print(amazon_tfidf.shape)

(400000, 340716)


In [209]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(amazon_tfidf,data_train['Labels'])

In [210]:
all_predictions = spam_detect_model.predict(amazon_tfidf)
print(all_predictions)

[1 0 1 ... 0 1 1]


In [None]:
all_predictions = spam_detect_model.predict()
print(all_predictions)

In [212]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(data_train['Labels'],all_predictions))
print(confusion_matrix(data_train['Labels'],all_predictions))

              precision    recall  f1-score   support

           0       0.78      0.62      0.69    197932
           1       0.69      0.83      0.75    202068

    accuracy                           0.73    400000
   macro avg       0.74      0.73      0.72    400000
weighted avg       0.74      0.73      0.72    400000

[[123628  74304]
 [ 34523 167545]]


In [213]:
print(roc_auc_score(data_train['Labels'], all_predictions))

0.7268749598224544


Ok, so obviously the fasttext models performs better according to roc score

In [217]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords



In [222]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split


In [220]:
conda install tensorflow

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: //anaconda3

  added / updated specs:
    - tensorflow


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    absl-py-0.9.0              |   py37hc8dfbb8_1         162 KB  conda-forge
    astor-0.7.1                |             py_0          22 KB  conda-forge
    c-ares-1.15.0              |    h01d97ff_1001          81 KB  conda-forge
    gast-0.3.3                 |             py_0          12 KB  conda-forge
    grpcio-1.23.0              |   py37h6ef0057_0         1.0 MB  conda-forge
    keras-applications-1.0.8   |     

In [221]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [223]:
data_train.head()

Unnamed: 0,Message,Labels
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,1
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1


In [241]:
y_train = data_train['Labels']

In [224]:
data_train['Message'][9]

'Not an "ultimate guide": Firstly,I enjoyed the format and tone of the book (how the author addressed the reader). However, I did not feel that she imparted any insider secrets that the book promised to reveal. If you are just starting to research law school, and do not know all the requirements of admission, then this book may be a tremendous help. If you have done your homework and are looking for an edge when it comes to admissions, I recommend some more topic-specific books. For example, books on how to write your personal statment, books geared specifically towards LSAT preparation (Powerscore books were the most helpful for me), and there are some websites with great advice geared towards aiding the individuals whom you are asking to write letters of recommendation. Yet, for those new to the entire affair, this book can definitely clarify the requirements for you.'

In [338]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data_train['Message'])

X_train = tokenizer.texts_to_sequences(data_train['Message'])
X_test = tokenizer.texts_to_sequences(data_test['Message'])

In [289]:
data_test.head()

Unnamed: 0,Message
0,Great CD: My lovely Pat has one of the GREAT v...
1,One of the best game music soundtracks - for a...
2,Batteries died within a year ...: I bought thi...
3,"works fine, but Maha Energy is better: Check o..."
4,Great for the non-audiophile: Reviewed quite a...


In [341]:
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [345]:
X_train

array([[  29,   92,   20, ...,   12,  924,    0],
       [   5,  279,    1, ...,  368,  161,    6],
       [ 671, 1008,  549, ...,    0,    0,    0],
       ...,
       [  15,  189,   24, ..., 1527,   11,  434],
       [ 395,    8,    9, ...,    0,    0,    0],
       [  52,  204,   23, ...,    5,  199, 1341]], dtype=int32)

In [346]:
vocab_size

291459

In [347]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [348]:
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [349]:
from keras import layers

Starting with a simple NN

In [350]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True)
model.add(embedding_layer)


model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [351]:
from keras import optimizers

In [352]:
opt = optimizers.adam(lr=0.00001)

In [353]:
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, 100, 100)          29145900  
_________________________________________________________________
flatten_10 (Flatten)         (None, 10000)             0         
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 10001     
Total params: 29,155,901
Trainable params: 29,155,901
Non-trainable params: 0
_________________________________________________________________
None


In [354]:
history = model.fit(X_train, y_train, batch_size=512, epochs=5, verbose=1, validation_split=0.2)

Train on 320000 samples, validate on 80000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [356]:
score = model.evaluate(X_test, labels, verbose=1)



In [357]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.7100805459403992
Test Accuracy: 0.49722999334335327


In [310]:
all_predictions1 = model.predict(X_test)

In [311]:
all_predictions1

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [312]:
print(roc_auc_score(labels, all_predictions1))

0.500049999525


Now trying CNN, to see whether results will be better

In [361]:
from keras.layers.convolutional import Conv1D   

In [375]:
from keras.optimizers import SGD

In [382]:
model_c = Sequential()

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True)
model_c.add(embedding_layer)

model_c.add(Conv1D(128, 5, activation='relu'))
model_c.add(GlobalMaxPooling1D())
model_c.add(Dense(1, activation='sigmoid'))

model_c.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [385]:
history_c = model_c.fit(X_train, y_train, batch_size=128, epochs=5, verbose=1, validation_split=0.2)

Train on 320000 samples, validate on 80000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

thoughts:
    increase number of epochs (small chance that this will work) /
    switch to sgd instead of adam

In [378]:
score_c = model_c.evaluate(X_test, labels, verbose=1)



In [365]:
#epochs = 5, batchsize = 256, opt = adam, trainable=False

print("Test Score:", score_c[0])
print("Test Accuracy:", score_c[1])

Test Score: 0.7036181306695938
Test Accuracy: 0.5121700167655945


In [370]:
#epochs = 70, batchsize = 256, opt = adam, trainable=False

print("Test Score:", score_c[0])
print("Test Accuracy:", score_c[1])

Test Score: 0.9672149581480026
Test Accuracy: 0.5002400279045105


Have now idea why score higher, accuracy not

In [380]:
all_predictions2 = model_c.predict(X_test)

In [367]:
#epochs = 5, batchsize = 256, opt = adam, trainable=False
print(roc_auc_score(labels, all_predictions2))

0.5166875429500001


In [373]:
#epochs = 70, batchsize = 256, opt = adam, trainable=False
print(roc_auc_score(labels, all_predictions2))

0.5000792721625


In [379]:
#epochs = 7, batchsize = 256, opt = sgd, trainable=False
print("Test Score:", score_c[0])
print("Test Accuracy:", score_c[1])

Test Score: 0.7069648948860169
Test Accuracy: 0.494997501373291


In [381]:
#epochs = 7, batchsize = 256, opt = sgd, trainable=False
print(roc_auc_score(labels, all_predictions2))

0.49145148396250005


And now Recurent Neural network attempt

In [254]:
from keras.layers.recurrent import LSTM

In [333]:
model_r = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=True)
model_r.add(embedding_layer)

model_r.add(LSTM(128))

model_r.add(Dense(1, activation='softmax'))
model_r.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

In [334]:
history_r = model_r.fit(X_train, y_train, batch_size=256, epochs=6, verbose=1, validation_split=0.2)

score = model_r.evaluate(X_test, labels, verbose=1)

Train on 320000 samples, validate on 80000 samples
Epoch 1/6
Epoch 2/6

KeyboardInterrupt: 

In [262]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.6971812977552414
Test Accuracy: 0.4834724962711334
