In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Models

In [17]:
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint 
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras import optimizers
from keras import initializers
from keras.models import Model, Sequential
from keras.layers import Convolution1D, MaxPooling1D, GlobalAveragePooling1D, BatchNormalization, LSTM, GRU, CuDNNGRU, CuDNNLSTM, concatenate, Input, SimpleRNN
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.constraints import maxnorm

# Pre-processing

In [18]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
import random

In [19]:
#Checking files in Kaggle
# List data files that are connected to the kernel

#os.listdir('../input')

In [20]:
# Read Train & Test Files

#Kaggle
train_path = '../input/train.ft.txt.bz2'
test_path = '../input/test.ft.txt.bz2'

#Localhost
#train_path = 'C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/train.ft.txt.bz2'
#test_path = 'C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/test.ft.txt.bz2'

#Localhost - trimmed version of the files
#train_path = 'C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_train.ft.txt.bz2'
#test_path = 'C:/Users/Lenovo/Documents/GitHub/Datasets/amazonreviews/Version_Recortada/r_test.ft.txt.bz2'

# Create lists containing the Train & Test lines, converting each one from raw
# bytes to a str that can be parsed. The `with` blocks close the compressed
# files as soon as reading finishes (or fails) instead of leaving the handles
# open. `train_file` / `test_file` stay bound afterwards, now closed.
with bz2.BZ2File(train_path) as train_file:
    train_file_lines = [x.decode('utf-8') for x in train_file.readlines()]

with bz2.BZ2File(test_path) as test_file:
    test_file_lines = [x.decode('utf-8') for x in test_file.readlines()]

In [21]:
# Drop the names bound to the two file objects; the decoded lines are kept
# in train_file_lines / test_file_lines.
del train_file
del test_file
# Run the garbage collector; as the last expression, its return value is shown as the cell output
gc.collect()

0

In [22]:
# Number of raw lines (one review per line) read for each split
n_train_lines = len(train_file_lines)
n_test_lines = len(test_file_lines)
print("Cantidad de elementos del Training Set: {}".format(n_train_lines))
print("Cantidad de elementos del Testing Set: {}".format(n_test_lines))

Cantidad de elementos del Training Set: 3600000
Cantidad de elementos del Testing Set: 400000


## Clean data

In [23]:
# Patterns are compiled once and written as raw strings: a plain '\d' literal
# is an invalid escape sequence and triggers a warning on recent Python versions.
_DIGIT_RE = re.compile(r'\d')
_URL_TOKEN_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
# Substrings whose presence marks a sentence as containing a URL
_URL_HINTS = ('www.', 'http:', 'https:', '.com')


def _parse_label(line):
    """Return 0 if the first space-separated token of `line` is '__label__1', else 1."""
    # Change labels: __label__1 -> 0 (Negative) / __label__2 -> 1 (Positive)
    return 0 if line.split(' ')[0] == '__label__1' else 1


def _clean_sentence(line):
    """Return the cleaned review text of one raw '<label> <text>' line.

    The text after the first space is taken, its trailing newline (if any) is
    removed, it is lower-cased, every digit becomes '0', and, when the text
    contains one of the URL hints, every space-free token ending in a dot plus
    three lowercase letters is replaced by '<url>'.

    Raises IndexError when the line contains no space at all.
    """
    text = line.split(' ', 1)[1]
    # Only strip an actual newline: a last line without one must keep its final character.
    if text.endswith('\n'):
        text = text[:-1]
    text = _DIGIT_RE.sub('0', text.lower())
    # Modify URLs to <url>
    if any(hint in text for hint in _URL_HINTS):
        text = _URL_TOKEN_RE.sub("<url>", text)
    return text


train_labels = [_parse_label(x) for x in train_file_lines]
test_labels = [_parse_label(x) for x in test_file_lines]

# Train and test go through exactly the same cleaning steps
train_sentences = [_clean_sentence(x) for x in train_file_lines]
test_sentences = [_clean_sentence(x) for x in test_file_lines]

## Checking data before and after cleaning

In [24]:
# Draw a random 1-based position inside the training set
r = random.randint(1,len(train_file_lines))
# One-element window [r-1, r) used to show the same review in every list
picked = slice(r - 1, r)

# Raw line, as read from the file
print("Data before cleaning:\n{}".format(train_file_lines[picked]))

# Same review after cleaning
print("\nData after cleaning:\n{}".format((train_sentences[picked])))

# Numeric label of that review
print("\nLabel:{}".format(train_labels[picked]))

Data before cleaning:
["__label__1 pure dribble,all in dark,rotten audio: This is the biggest bunch of dribble I have ever seen. I think the whole film was shot in the dark. Even if meg ryan tryed to show something you would sure never see it. This has lousy audio, you can't hear snything and all of a sudden they try to break your eardrums with some car sound of something else dumb. I have no idea why meg ryan made this mess, If she needs money this bad mabe she should take up bartending\n"]

Data after cleaning:
["pure dribble,all in dark,rotten audio: this is the biggest bunch of dribble i have ever seen. i think the whole film was shot in the dark. even if meg ryan tryed to show something you would sure never see it. this has lousy audio, you can't hear snything and all of a sudden they try to break your eardrums with some car sound of something else dumb. i have no idea why meg ryan made this mess, if she needs money this bad mabe she should take up bartending"]

Label:[0]


### Output
From the above output it can be seen that each raw line begins with its sentiment label (`__label__1` -> Negative, `__label__2` -> Positive), which is followed by the review text and ends with a newline character `\n`.

So, I first convert all the labels to 0 (Negative) and 1 (Positive) and store them in lists that contain only the label values. After this, I store the remainder of each line, lowercased and without the newline character, in separate lists. Also, all digits are converted to 0.


In [25]:
# The raw lines are no longer needed once labels and sentences are extracted;
# drop the names that reference them.
del train_file_lines
del test_file_lines
# Run the garbage collector; as the last expression, its return value is shown as the cell output
gc.collect()

0

### Using only a Percentage of train set

In [26]:
# Training takes a long time, so only a smaller percentage of the dataset is kept.
# NOTE: this cell rebinds train_sentences / train_labels, so re-running it
# shrinks the training set again; re-run the cleaning cell first.
flag_size = 1
from sklearn.model_selection import train_test_split

# Default to empty hold-out lists so the prints below also work when
# subsampling is switched off (flag_size != 1).
valid_sentences, valid_labels = [], []
if flag_size == 1:
    # Fraction sent to the hold-out ("valid") split; the remaining (1 - size)
    # is what is actually used for training.
    size = 0.8
    # random_state fixes the shuffle so the same subset is selected on every run.
    train_sentences, valid_sentences, train_labels, valid_labels = train_test_split(
        train_sentences, train_labels, test_size = size, random_state = 42)

# Backward-compatible alias for the original, misspelled variable name.
valid_labes = valid_labels

print(len(train_sentences))
print(len(valid_sentences))

720000
2880000


## Text Pre-processing

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

#Delete special characters -> In Keras I use the Filter.
# Every character outside a-z / A-Z is replaced by a single space.
non_letter = re.compile("[^a-zA-Z]")

train_sentences = [non_letter.sub(" ", sentence) for sentence in train_sentences]

# Each test sentence is cleaned from ITSELF. Reading from train_sentences here
# would overwrite the test set with training text and invalidate every test
# accuracy computed afterwards.
test_sentences = [non_letter.sub(" ", sentence) for sentence in test_sentences]

#Base definitions for text preprocessing
max_features = 20000
maxlen = 100

In [28]:
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# Word-level count vectorizer, limited to max_features features
v = CountVectorizer(
    analyzer = "word",
    max_features = max_features,
)

# Fit on the training sentences only; the test sentences are transformed
# with the already-fitted vectorizer.
X_train = v.fit_transform(train_sentences)
X_test = v.transform(test_sentences)

## Model

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

# Turn both label lists into NumPy arrays for the estimators and the metric
train_labels_array, test_labels_array = (
    np.array(labels) for labels in (train_labels, test_labels)
)

  from numpy.core.umath_tests import inner1d


### Logistic Regression

In [None]:
# Logistic regression on the word-count features
classifier = LogisticRegression(max_iter=200)

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

print('Accuracy of Logistic Regression is '+str(accuracy))

### Linear SVC

In [None]:
# Linear support vector classifier on the word-count features
classifier = LinearSVC(max_iter=200)

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

print('Accuracy of Linear SVC is '+str(accuracy))

### NuSVC

In [None]:
# Nu-support vector classifier on the word-count features
# NOTE(review): NuSVC is a kernel SVM; on a training set this large it may be
# impractically slow, and max_iter=200 may stop it before convergence - confirm.
classifier = NuSVC(max_iter=200)

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

print('Accuracy of NuSVC is '+str(accuracy))

### Random Forest

In [None]:
# Random forest with 200 trees on the word-count features
classifier = RandomForestClassifier(n_estimators=200)

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

print('Accuracy of Random Forest Classifier is '+str(accuracy))

### AdaBoostClassifier

In [None]:
# AdaBoost with its default settings on the word-count features
classifier = AdaBoostClassifier()

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

# The message previously lacked the space between "of" and the model name
print('Accuracy of AdaBoostClassifier is '+str(accuracy))

### KNeighborsClassifier

In [None]:
# k-nearest-neighbours on the word-count features.
# The positional argument is n_neighbors (how many neighbours vote), not the
# number of classes, so "there are only 2 groups" does not justify the value 2.
# NOTE(review): an even k allows tied votes between the two labels; consider
# an odd value - confirm before changing, since it alters the reported score.
classifier = KNeighborsClassifier(2)

# `fit` holds whatever classifier.fit returns and is used for prediction below
fit = classifier.fit(X_train, train_labels_array)
pred = fit.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first
accuracy = accuracy_score(test_labels_array, pred)

print('Accuracy of KNeighborsClassifier is '+str(accuracy))