# Sentiment Analysis

### Reading the data

In [1]:
import os
from string import punctuation
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
# Names of the two folders that GetDocs reads below.
Neg, Pos = 'neg', 'pos'

# Start with empty document lists; they are replaced once the folders are read.
Neg_Docs, Pos_Docs = [], []

def GetDocs(Folder):
    """Read every file in ``Folder`` and return the contents as a list of strings.

    ``Folder`` is resolved against the current working directory (an absolute
    path is used unchanged). Files are read in sorted filename order so the
    returned list, and any index-based split of it, is the same on every run
    and machine; ``os.listdir`` on its own returns entries in arbitrary order.
    Entries that are not regular files (for example a ``.ipynb_checkpoints``
    directory) are skipped instead of crashing ``open``.

    Parameters
    ----------
    Folder : str
        Directory containing one document per file.

    Returns
    -------
    list of str
        The text of each file, ordered by filename.
    """
    Path=os.path.join(os.getcwd(),Folder)
    List=list()
    for doc in sorted(os.listdir(Path)):
        full_path=os.path.join(Path,doc)
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path,'r') as file:
            List.append(file.read())
    return List
# Read both folders, then report the document counts as "<negative>/<positive>".
Neg_Docs = GetDocs(Neg)
Pos_Docs = GetDocs(Pos)
print('%s/%s' % (len(Neg_Docs), len(Pos_Docs)))

1000/1000


### Splitting into training and testing sets

In [2]:
from numpy import array
# Number of documents per class used for training; the remainder of each class
# is held out for testing.
split=900

neg_train,neg_test=Neg_Docs[:split],Neg_Docs[split:]
pos_train,pos_test=Pos_Docs[:split],Pos_Docs[split:]

training_data=neg_train+pos_train
print(len(training_data))
# Labels follow the document order: 0 for negative, 1 for positive. Their
# counts come from the actual slices, so they stay aligned with the data even
# when a class does not contain exactly 1000 documents.
training_labels=array([0]*len(neg_train)+[1]*len(pos_train))
print(len(training_labels))
testing_data=neg_test+pos_test
print(len(testing_data))
testing_labels=array([0]*len(neg_test)+[1]*len(pos_test))
print(len(testing_labels))

# Every document must have exactly one label.
assert len(training_data)==len(training_labels)
assert len(testing_data)==len(testing_labels)

1800
1800
200
200


### Generating Vocabulary

In [3]:
from nltk import SnowballStemmer
# Word -> occurrence count; GetTokens below adds to this counter.
vocab=Counter()
# NLTK's English stop-word list, stored as a set for membership tests.
Stop_words=set(stopwords.words('english'))
def GetTokens(is_stem=False):
    """Tokenize every training document and count the surviving words in ``vocab``.

    Each document in the module-level ``training_data`` is split with
    ``word_tokenize``. Punctuation characters are stripped from every token,
    and a token is kept only if it is purely alphabetic, is not in
    ``Stop_words`` and is longer than one character. The kept tokens are added
    to the module-level ``vocab`` counter; nothing is returned.

    Parameters
    ----------
    is_stem : bool, optional
        When true, every kept token is replaced by its English Snowball stem
        before it is counted.
    """
    # Both objects are the same for every document, so build them once.
    # SnowballStemmer requires the language name; calling it with no argument
    # raises TypeError.
    table=str.maketrans('','',punctuation)
    stemmer=SnowballStemmer('english') if is_stem else None
    for doc in training_data:
        # Split into words and strip punctuation characters from each one.
        stripped=(w.translate(table) for w in word_tokenize(doc))
        # Keep alphabetic, non-stop-word tokens longer than one character.
        tokens=[w for w in stripped
                if w.isalpha() and w not in Stop_words and len(w)>1]
        # Stem if requested.
        if stemmer is not None:
            tokens=[stemmer.stem(w) for w in tokens]
        vocab.update(tokens)
# GetTokens adds to the shared counter, so empty it first; otherwise re-running
# this cell would double every count.
vocab.clear()
GetTokens(False)
print(len(vocab))
# Keep only words whose count is strictly greater than min_occurance
# (with the value 2 that means words seen at least three times).
min_occurance=2
# Stored as a set because GetCleanedDoc tests membership once per word of
# every document.
tokens={k for k,c in vocab.items() if c>min_occurance}
print(len(tokens))

41914
19092


In [4]:
# Show the first raw training document (negative documents come first in training_data).
print(training_data[0])

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is that it's simply too jumbled . 
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no id

### Converting documents into a suitable form

In [5]:
def GetCleanedDoc(folder):
    """Reduce each document to the words that appear in the module-level ``tokens``.

    Parameters
    ----------
    folder : iterable of str
        The document texts to clean (the texts themselves, not a path).

    Returns
    -------
    list of str
        One string per input document, in the same order. Every kept word is
        followed by a single space, so a non-empty result ends with a space.
    """
    # Membership is tested once per word; a set keeps that constant-time even
    # if ``tokens`` is a list.
    allowed=set(tokens)
    data=[]
    for doc in folder:
        data.append(''.join(w+' ' for w in word_tokenize(doc) if w in allowed))
    return data
# Replace both document lists with their vocabulary-filtered versions.
training_data=GetCleanedDoc(training_data)
testing_data=GetCleanedDoc(testing_data)

## Saving data into pickle format for easy loading

In [6]:
from six.moves import cPickle as pickle
# Build the payload before opening the file: if any of these names is missing,
# the NameError is raised before 'Data.pickle' is opened for writing, so an
# existing file is not truncated.
save={
       'training_data':training_data,
       'training_labels':training_labels,
       'testing_data':testing_data,
       'testing_labels':testing_labels,
       'vocab':vocab,
}
with open('Data.pickle','wb') as file:
    pickle.dump(save,file,pickle.HIGHEST_PROTOCOL)

## Start running the notebook from the next cell

In [7]:
from six.moves import cPickle as pickle
# NOTE: pickle.load can execute arbitrary code; only load a Data.pickle that
# was written by this notebook.
with open('Data.pickle','rb') as file:
    Data=pickle.load(file)
# The file is only needed for the load itself; unpack after it is closed.
training_data=Data['training_data']
testing_data=Data['testing_data']
training_labels=Data['training_labels']
testing_labels=Data['testing_labels']
vocab=Data['vocab']
# Rebuild the filtered word set with the same rule used when the vocabulary
# was generated (count strictly greater than min_occurance). set(vocab) would
# bring back the rare words that were dropped.
min_occurance=2
tokens={k for k,c in vocab.items() if c>min_occurance}
print(len(testing_data))

200


In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
# Fit the tokenizer on the training documents only.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(training_data)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [9]:
# Encode each training document with the fitted tokenizer.
encoded_docs = tokenizer.texts_to_sequences(training_data)
# Length of the longest training document, in whitespace-separated words.
max_length = max(len(doc.split()) for doc in training_data)
# Pad at the end ('post') so every sequence has max_length entries.
training_data = pad_sequences(encoded_docs, padding='post', maxlen=max_length)

In [10]:
# Size of the embedding's input dimension.
# NOTE(review): the +1 is presumably because Keras word indices start at 1 and
# 0 is left for padding — confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1

In [11]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
# Embedding -> 1D convolution -> max pooling -> flatten -> small dense layer,
# ending in a single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1255, 100)         1850200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1248, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 624, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 19968)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                199690    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,075,533
Trainable params: 2,075,533
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for 10 epochs, printing one line per epoch (verbose=2).
model.fit(x=training_data, y=training_labels, epochs=10, verbose=2)

Epoch 1/10
20s - loss: 0.6880 - acc: 0.5489
Epoch 2/10
20s - loss: 0.5421 - acc: 0.7711
Epoch 3/10
20s - loss: 0.1010 - acc: 0.9783
Epoch 4/10
20s - loss: 0.0080 - acc: 1.0000
Epoch 5/10
20s - loss: 0.0027 - acc: 1.0000
Epoch 6/10
20s - loss: 0.0016 - acc: 1.0000
Epoch 7/10
20s - loss: 0.0012 - acc: 1.0000
Epoch 8/10
20s - loss: 8.7141e-04 - acc: 1.0000
Epoch 9/10
20s - loss: 6.6688e-04 - acc: 1.0000
Epoch 10/10
20s - loss: 5.2529e-04 - acc: 1.0000


<keras.callbacks.History at 0x230cfa9cbe0>

In [13]:
# Score the fitted model on the data it was trained on; accuracy is printed as a percentage.
loss, acc = model.evaluate(training_data,training_labels, verbose=0)
print('Train Accuracy: %f' % (acc*100))

Train Accuracy: 100.000000


In [14]:
# Encode the held-out documents with the same tokenizer and pad them to the
# same max_length that was used for the training data.
testing_data=tokenizer.texts_to_sequences(testing_data)
testing_data=pad_sequences(testing_data, maxlen=max_length, padding='post')
print(testing_data.shape)
# Evaluate on the held-out set; loss and acc now hold the test-set values.
loss, acc = model.evaluate(testing_data,testing_labels, verbose=0)

(200, 1255)


In [15]:
# Report the held-out accuracy as a labeled percentage, in the same format
# used for the training accuracy.
print('Test Accuracy: %f' % (acc*100))

0.885
