## Import Training Data Set

In [None]:
import pandas as pd
train = pd.read_csv("train.csv")

## Sample Training Data Set 

In [None]:
train.head()

## Number of rows and Columns in Training Set 

In [None]:
train.shape

## Preprocessing 

In [None]:
import nltk
import re
from bs4 import BeautifulSoup  
from nltk.corpus import stopwords

## Function to preprocess the data 

In [None]:
# Cache for the English stop-word set so it is built once, not once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it on first use."""
    if "english" not in _STOPWORD_CACHE:
        # Membership tests on a set are much faster than on the list NLTK returns.
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_words( raw_review ):
    """Convert one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case and split into words, drop English stop words, and join the
    remaining words with single spaces.

    Parameters
    ----------
    raw_review : str
        Raw review text, possibly containing HTML. ``None`` and float NaN
        (what pandas uses for a missing text cell) are treated as an empty
        review instead of raising.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing is
        left or the review is missing.
    """
    # Missing value: None, or NaN (the only float that is not equal to itself).
    if raw_review is None or (isinstance(raw_review, float) and raw_review != raw_review):
        return ""

    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so
    #    BeautifulSoup does not warn about guessing one.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Fetch the (cached) stop-word set
    stops = _english_stopwords()

    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one string separated by space
    return " ".join(meaningful_words)

## Calling the function to preprocess the data on our training set 

In [None]:
# Clean every training review with review_to_words(), collecting the results
# in clean_train_reviews. Reviews are looked up by index label 0..num_reviews-1.
num_reviews = train["reviewText"].size
clean_train_reviews = []
print("Cleaning the training set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_train_reviews.append(review_to_words(train["reviewText"][position - 1]))

## Sample Pre Processed Reviews 

In [None]:
clean_train_reviews[0:5]

## Creating a bag of words to extract the features our models are trained on

In [None]:
print("Creating bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's bag-of-words tool: word-level analysis, no custom tokenizer,
# preprocessor or stop-word list, and a vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the cleaned training reviews and
# returns their feature vectors in a single call.
train_data_features = vectorizer.fit_transform(clean_train_reviews)


In [None]:
train_data_matrix = train_data_features.copy()

In [None]:
train_data_features = train_data_features.toarray()

In [None]:
train_data_features

In [None]:
train_data_features.shape

## Printing the feature set along with the number of occurrences of each feature

In [None]:
import numpy as np

# Vocabulary learned by the vectorizer, in feature-column order.
# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word across all training reviews
dist = np.sum(train_data_features, axis=0)

# Print each vocabulary word and the number of times it
# appears in the training set
for tag, count in zip(vocab, dist):
    print (tag,count)
    

## Training our data using the features with Random Forest Classifier 

In [None]:
print ("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and reports the same metrics; without it every run differs.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Fit the forest to the training set, using the bag of words as
# features and the "rating" column as the response variable
forest = forest.fit( train_data_features, train["rating"] )


## Applying our  Random Forest model on test set to predict the polarity of the review

In [None]:
# Load the test set and show its shape
test = pd.read_csv("test.csv")
print(test.shape)

# Clean the test reviews one at a time with review_to_words()
num_reviews = len(test["reviewText"])
clean_test_reviews = []

print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(test["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array; transform() only,
# so the vocabulary fitted on the training reviews is reused
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Label predictions from the random forest
result = forest.predict(test_data_features)

# Review text, predicted label and the "actual" column side by side
output = pd.DataFrame(
    data={
        "reviewText": test["reviewText"],
        "predicted": result,
        "originalRating": test["actual"],
    }
)


## Test data set along with the predicted polarity and original rating for the Random Forest classifier

In [None]:
output.head(50)

## Accuracy of our Random Forest model

In [None]:
from sklearn.metrics import accuracy_score
print ("Accuracy Rate, which is calculated by accuracy_score() is: %f" % accuracy_score(output.originalRating, output.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output.originalRating, output.predicted, average = 'weighted')

## Training our data using the features with Multinomial Naive Bayes Classifier 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on the bag-of-words counts, with the
# "rating" column as the target
nb = MultinomialNB()
nb.fit(X=train_data_features, y=train["rating"])

## Applying our  Naive Bayes model on test set to predict the polarity of the review

In [None]:
# Clean the test reviews one at a time with review_to_words()
num_reviews = len(test["reviewText"])
clean_test_reviews = []

print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(test["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Label predictions from the Naive Bayes model
result2 = nb.predict(test_data_features)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output2 = pd.DataFrame(
    data={
        "reviewText": test["reviewText"],
        "predicted": result2,
        "originalRating": test["actual"],
    }
)


## Test Data set along with the predicted Polarity and original rating for Naive Bayes Classifier

In [None]:
output2.head(50)

## Accuracy of our Naive Bayes Model

In [None]:
print ("Accuracy Rate for Naive Bayes Classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output2.originalRating, output2.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output2.originalRating, output2.predicted,average='weighted')

## Training our data using the features with Support Vector Machines Classifier 

In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings, fitted on
# the bag-of-words features with the "rating" column as the target
clf1 = svm.SVC()
clf1.fit(X=train_data_features, y=train["rating"])


## Applying our SVM model on the test set to predict the polarity of the review

In [None]:
# Clean the test reviews one at a time with review_to_words()
num_reviews = len(test["reviewText"])
clean_test_reviews = []
print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(test["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Label predictions from the SVM
result10 = clf1.predict(test_data_features)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output3 = pd.DataFrame(
    data={
        "reviewText": test["reviewText"],
        "predicted": result10,
        "originalRating": test["actual"],
    }
)


## Test Data set along with the predicted Polarity and original rating for SVM Classifier

In [None]:
output3.head(50)

## Accuracy of the Model 

In [None]:
print ("Accuracy Rate for SVM classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output3.originalRating, output3.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output3.originalRating, output3.predicted,average='weighted')

## Training our Data using features with logistic regression classifier 

In [None]:
print("Training the logistic regression...")
from sklearn import linear_model

# Logistic regression with C=1e5, fitted on the bag-of-words features with
# the "rating" column as the target. fit() hands back the estimator, so the
# construction and the fit are chained into one assignment.
logreg1 = linear_model.LogisticRegression(C=1e5).fit(
    train_data_features, train["rating"]
)

## Applying our logistic regression model to predict the polarity of the reviews in Test set 

In [None]:
# Clean the test reviews one at a time with review_to_words()
num_reviews = len(test["reviewText"])
clean_test_reviews = []
print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(test["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

# Label predictions from the logistic regression model
result9 = logreg1.predict(test_data_features)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output9 = pd.DataFrame(
    data={
        "reviewText": test["reviewText"],
        "predicted": result9,
        "originalRating": test["actual"],
    }
)


## Accuracy of the Logistic Regression Classifier

In [None]:
print ("Accuracy Rate for logistic classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output9.originalRating, output9.predicted))

## F - Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output9.originalRating, output9.predicted,average='weighted')

## Importing training data to classify reviews into three labels, i.e., Positive (1), Negative (-1), or Neutral (0)

In [None]:
import pandas as pd
train2 = pd.read_csv("train3.csv")

In [None]:
train2.reviewText.size

## Cleaning the new Data Set 

In [None]:
import nltk
import re
from bs4 import BeautifulSoup  
from nltk.corpus import stopwords

In [None]:
# Clean every review of the three-class training set with review_to_words().
# Reviews are looked up by index label 0..num_reviews2-1.
num_reviews2 = train2["reviewText"].size
clean_train_reviews2 = []
print("Cleaning the training set reviews...\n")
for position in range(1, num_reviews2 + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews2))
    clean_train_reviews2.append(review_to_words(train2["reviewText"][position - 1]))

## Building a bag of words for the new data set to get the features to train on

In [None]:
print("Creating bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# A second bag-of-words vectorizer for the three-class data set: word-level
# analysis, no custom tokenizer, preprocessor or stop-word list, and a
# vocabulary capped at 5000 features.
vectorizer2 = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() learns the vocabulary from the cleaned training reviews and
# returns their feature vectors in a single call.
train_data_features2 = vectorizer2.fit_transform(clean_train_reviews2)

In [None]:
train_data_features2 = train_data_features2.toarray()

##  Training our data using the features with Random Forest Classifier 

In [None]:
print ("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest and reports the same metrics; without it every run differs.
forest2 = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Fit the forest to the training set, using the bag of words as
# features and the "rating1" column as the response variable
#
# This may take a few minutes to run
forest2 = forest2.fit( train_data_features2, train2["rating1"] )

## Applying our  Random Forest model on test set to predict the polarity of the review 

In [None]:
# Load the three-class test set and show its shape
test2 = pd.read_csv("test3.csv")
print(test2.shape)

# Clean the test reviews one at a time with review_to_words()
num_reviews3 = len(test2["reviewText"])
clean_test_reviews3 = []

print("Cleaning and parsing the test set reviews...\n")
for position in range(1, num_reviews3 + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews3))
    clean_test_reviews3.append(review_to_words(test2["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array; transform() only,
# so the vocabulary fitted by vectorizer2 is reused
test_data_features3 = vectorizer2.transform(clean_test_reviews3).toarray()

# Label predictions from the random forest
result3 = forest2.predict(test_data_features3)

# Review text, predicted label and the "rating1" column side by side
output4 = pd.DataFrame(
    data={
        "reviewText": test2["reviewText"],
        "predicted": result3,
        "originalRating": test2["rating1"],
    }
)

## Test Data set with predicted polarity 

In [None]:
output4.head()

## Accuracy of the Random Forest classifier when predicting three classes 

In [None]:
from sklearn.metrics import accuracy_score

print ("Accuracy Rate for Random Forest classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output4.originalRating, output4.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output4.originalRating, output4.predicted,average="weighted")

## Training our data using the features with Naive Bayes Classifier 

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on the three-class bag-of-words counts,
# with the "rating1" column as the target
nb1 = MultinomialNB()
nb1.fit(X=train_data_features2, y=train2["rating1"])

## Applying our  Naive Bayes model on test set to predict the polarity of the review  

In [None]:
# Clean the test reviews one at a time with review_to_words()
num_reviews3 = len(test2["reviewText"])
clean_test_reviews3 = []

print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews3 + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews3))
    clean_test_reviews3.append(review_to_words(test2["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features4 = vectorizer2.transform(clean_test_reviews3).toarray()

# Label predictions from the Naive Bayes model
result5 = nb1.predict(test_data_features4)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output5 = pd.DataFrame(
    data={
        "reviewText": test2["reviewText"],
        "predicted": result5,
        "originalRating": test2["rating1"],
    }
)

## Accuracy of the Naive Bayes classifier when predicting three classes 

In [None]:
print ("Accuracy Rate for Naive Bayes classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output5.originalRating, output5.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output5.originalRating, output5.predicted,average="weighted")

## Training our SVM model using the features generated

In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings, fitted on
# the three-class bag-of-words features with "rating1" as the target
clf = svm.SVC()
clf.fit(X=train_data_features2, y=train2["rating1"])

## Applying our SVM model on test set to predict the polarity of the review  

In [None]:
# Clean the test reviews one at a time with review_to_words()
num_reviews4 = len(test2["reviewText"])
clean_test_reviews4 = []

print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews4 + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews4))
    clean_test_reviews4.append(review_to_words(test2["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features5 = vectorizer2.transform(clean_test_reviews4).toarray()

# Label predictions from the SVM
result6 = clf.predict(test_data_features5)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output6 = pd.DataFrame(
    data={
        "reviewText": test2["reviewText"],
        "predicted": result6,
        "originalRating": test2["rating1"],
    }
)

## Accuracy of the SVM classifier when predicting three classes  

In [None]:
print ("Accuracy Rate for SVM classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output6.originalRating, output6.predicted))

## F-Score

In [None]:
from sklearn.metrics import f1_score
f1_score(output6.originalRating, output6.predicted,average="weighted")

## Training our Logistic Regression model using the features generated

In [None]:
print("Training the logistic regression...")
from sklearn import linear_model

# Logistic regression with C=1e5, fitted on the three-class bag-of-words
# features with the "rating1" column as the target. fit() hands back the
# estimator, so the construction and the fit are chained into one assignment.
logreg = linear_model.LogisticRegression(C=1e5).fit(
    train_data_features2, train2["rating1"]
)

## Applying our logistic regression model to predict the polarity of review in the test set

In [None]:
# Clean the three-class test reviews one at a time with review_to_words()
num_reviews = len(test2["reviewText"])
clean_test_reviews = []

print("Cleaning the test set reviews...\n")
for position in range(1, num_reviews + 1):
    # Report progress on every 1000th review (1-based count)
    if position % 1000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_test_reviews.append(review_to_words(test2["reviewText"][position - 1]))

# Bag of words for the test set as a dense numpy array
test_data_features = vectorizer2.transform(clean_test_reviews).toarray()

# Label predictions from the logistic regression model
result7 = logreg.predict(test_data_features)

# Dataframe with "reviewText", "predicted" and "originalRating" columns
output7 = pd.DataFrame(
    data={
        "reviewText": test2["reviewText"],
        "predicted": result7,
        "originalRating": test2["rating1"],
    }
)

## Accuracy

In [None]:
print ("Accuracy Rate for logistic classifier, which is calculated by accuracy_score() is: %f" % accuracy_score(output7.originalRating, output7.predicted))

## F- Score

In [None]:
f1_score(output7.originalRating, output7.predicted,average="weighted")

In [None]:
# A hand-written sample review, cleaned the same way as the data sets and
# wrapped in a one-element list
new = 'hello! this is great. Awesome. Have fun'
clean_review9 = review_to_words(new)
clean_test_reviews9 = [clean_review9]

In [None]:
clean_test_reviews9

In [None]:
# Bag-of-words features for the sample review as a dense numpy array,
# using the first vectorizer
test_data_features9 = vectorizer.transform(clean_test_reviews9).toarray()

In [None]:
nb.predict(test_data_features9)