In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Train.csv')

In [3]:
df.shape

(40000, 2)

In [4]:
df['label'].value_counts()

pos    20011
neg    19989
Name: label, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [6]:
y=df['label'].values
type(y)

numpy.ndarray

In [7]:
y=le.fit_transform(y)
y[:25]

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1])

In [8]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [9]:
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [10]:
def clean_text(sample):
    sample = sample.lower()
    sample = sample.replace("<br /><br />", "")
    sample = re.sub("[^a-zA-Z]+", " ", sample)
    
    sample = sample.split()
    
    sample = [ps.stem(s) for s in sample if s not in sw] 
    
    sample = " ".join(sample)
    
    return sample

In [11]:
clean_text(df['review'][0])

'matur intellig highli charg melodrama unbelivebl film china wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'

In [12]:
df['cleaned_review'] =  df['review'].apply(clean_text)

In [13]:
df.head()

Unnamed: 0,review,label,cleaned_review
0,mature intelligent and highly charged melodram...,pos,matur intellig highli charg melodrama unbelive...
1,http://video.google.com/videoplay?docid=211772...,pos,http video googl com videoplay docid hl en dis...
2,Title: Opera (1987) Director: Dario Argento Ca...,pos,titl opera director dario argento cast cristin...
3,I think a lot of people just wrote this off as...,pos,think lot peopl wrote anoth one tom cruis weir...
4,This is a story of two dogs and a cat looking ...,pos,stori two dog cat look way back home old wise ...


In [14]:
review =  df['cleaned_review'].values

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [16]:
cv = CountVectorizer(max_df=0.5,max_features=50000)

In [17]:
X = cv.fit_transform(review)

In [18]:
tfidf = TfidfTransformer()

In [19]:
X = tfidf.fit_transform(X)

In [20]:
print(X[0])

  (0, 47634)	0.5920011236382112
  (0, 44629)	0.35085644983076947
  (0, 43630)	0.21831285409006126
  (0, 41058)	0.08381958738922976
  (0, 39856)	0.3257356386942206
  (0, 36910)	0.12019616469026352
  (0, 35372)	0.06360992061360375
  (0, 30823)	0.09150742090549654
  (0, 26560)	0.19005467617975821
  (0, 26138)	0.18411548708696168
  (0, 25474)	0.17429834208396788
  (0, 25039)	0.07856379122772277
  (0, 21816)	0.14367484199159275
  (0, 20351)	0.13393901318695992
  (0, 9931)	0.1991671731452596
  (0, 9499)	0.17565914931902576
  (0, 9026)	0.35085644983076947


In [21]:
print(X.shape)
print(y.shape)

(40000, 50000)
(40000,)


In [22]:
from keras import models
from keras.layers import Dense

Using TensorFlow backend.


In [23]:
model = models.Sequential()
model.add( Dense(16, activation="relu", input_shape = (X.shape[1],) ) )
model.add( Dense(16, activation="relu") )
model.add( Dense(1, activation="sigmoid"))

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                800016    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 800,305
Trainable params: 800,305
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(optimizer='rmsprop', loss="binary_crossentropy", metrics=['accuracy'])

In [26]:
res = model.fit(X, y, batch_size=128, epochs=2, validation_data=(X, y))

Train on 40000 samples, validate on 40000 samples
Epoch 1/2
Epoch 2/2


In [27]:
test = pd.read_csv('Test.csv')

In [29]:
test.shape

(10000, 1)

In [30]:
test['cleaned_review'] = test['review'].apply(clean_text)

In [31]:
X_test = test['cleaned_review']

In [32]:
X_test = cv.transform(X_test)

In [33]:
X_test = tfidf.transform(X_test)

In [34]:
y_test = model.predict(X_test)

In [35]:
y_test[ y_test >= 0.5 ]  =  1

In [36]:
y_test = y_test.astype('int')

In [37]:
y_test[:10]

array([[0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]])

In [38]:
dic = {0 : 'neg' , 1 : 'pos'}
y_test = [ dic[p[0]] for p in y_test ]

In [39]:
ids = np.arange(10000)

In [40]:
final_matrix = np.stack((ids, y_test), axis =1)

In [41]:
dframe = pd.DataFrame(final_matrix, columns=['Id', 'label'])

In [43]:
dframe.to_csv("ans.csv", index = False)