In [58]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

import nltk
from nltk.corpus import stopwords 

In [59]:
train = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [60]:
test = pd.read_csv("testData.tsv", header=0, \
                    delimiter="\t", quoting=3)

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
X_train = train[['id', 'review']]

In [63]:
Y_train = train['sentiment']

In [64]:
X_test = test[['id', 'review']]

In [65]:
X_train.head()

Unnamed: 0,id,review
0,"""5814_8""","""With all this stuff going down at the moment ..."
1,"""2381_9""","""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""","""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""","""It must be assumed that those who praised thi..."
4,"""9495_8""","""Superbly trashy and wondrously unpretentious ..."


In [66]:
def review_to_words( raw_review ):
    review_text = BeautifulSoup(raw_review).get_text() 
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops]   
    return( " ".join( meaningful_words ))   

In [68]:
num_reviews = X_train["review"].size
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews = []
for i in range( 0, num_reviews ):
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d" % ( i+1, num_reviews ))                                                                 
    clean_train_reviews.append( review_to_words( X_train["review"][i] ))

Cleaning and parsing the training set movie reviews...





 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [69]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 


train_data_features = vectorizer.fit_transform(clean_train_reviews)

train_data_features = train_data_features.toarray()
print(train_data_features.shape)

(25000, 5000)


Unnamed: 0,id,review
0,"""5814_8""","""With all this stuff going down at the moment ..."
1,"""2381_9""","""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""","""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""","""It must be assumed that those who praised thi..."
4,"""9495_8""","""Superbly trashy and wondrously unpretentious ..."


In [None]:
%%time
from sklearn.model_selection import GridSearchCV
print("Training the random forest...")
params_rf = {'n_estimators': [x for x in range(50,200,10)]}
RF_grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_rf, n_jobs=-1, cv=5)
forest = RF_grid.fit( train_data_features, train["sentiment"] )
print('\n',RF_grid.best_estimator_)


Training the random forest...


In [75]:
num_reviews = X_test["review"].size
print("Cleaning and parsing the training set movie reviews...\n")
clean_train_reviews_test = []
for i in range( 0, num_reviews ):
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d" % ( i+1, num_reviews ))                                                                 
    clean_train_reviews_test.append( review_to_words( X_test["review"][i] ))

Cleaning and parsing the training set movie reviews...





 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [76]:
test_data_features = vectorizer.transform(clean_train_reviews_test)
test_data_features = test_data_features.toarray()

# передаем фичи натренированной модели
result = forest.predict(test_data_features)

# дальше создаем DataFrame с ответом и посылаем в систему
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv("Bag_of_Words_model.csv", index=False, quoting=3 )