# Cleaning the Reviews

In [2]:
import re
from bs4 import BeautifulSoup 

## Remove HTML tags and convert words to lower case

In [3]:
def review_to_wordlist(review):
    '''
    Convert one raw IMDB review into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The review's words, lower-cased, in their original order. Every
        character other than an ASCII letter (digits, punctuation,
        accented letters, ...) is treated as a word separator.
    '''
    # Strip the HTML. The parser is named explicitly so the result does not
    # depend on which optional parsers (lxml, html5lib) happen to be
    # installed, and so newer BeautifulSoup versions do not emit a
    # "no parser was explicitly specified" warning.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Replace everything that is not an ASCII letter with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # Lower-case, then split on runs of whitespace to get the word list.
    words = review_text.lower().split()

    return words

In [4]:
import pandas as pd

## Load the data into DataFrames

In [5]:
# Both files are tab-separated with a header in the first row.
# quoting=3 is csv.QUOTE_NONE: double quotes are read as ordinary characters.
train = pd.read_csv('labeledTrainData.tsv', sep="\t", header=0, quoting=3)
test = pd.read_csv('testData.tsv', sep="\t", header=0, quoting=3)
               
# Import both the training and test data.

## Clean each review into a space-separated string of words

In [8]:
# Clean every review and join its words back into one space-separated string,
# which is the input format the vectorizer expects.
# Iterating over the column directly (instead of xrange + positional lookup)
# runs on both Python 2 and Python 3 and does not rely on the DataFrame
# having a default 0..n-1 index.
traindata = [" ".join(review_to_wordlist(review)) for review in train['review']]
testdata = [" ".join(review_to_wordlist(review)) for review in test['review']]

# TF-IDF Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

## Unigram and bigram features

In [11]:
# TF-IDF vectorizer configuration:
#   min_df=3            - drop terms that occur in fewer than 3 documents
#   ngram_range=(1, 2)  - use both single words and two-word sequences
#   token_pattern       - a token is any run of one or more word characters
#   sublinear_tf        - apply sublinear (logarithmic) term-frequency scaling
# The on/off switches are passed as real booleans rather than 1: newer
# scikit-learn releases validate parameter types and reject integers here,
# while True behaves identically on older releases.
tfv = TFIV(min_df=3, max_features=None,
           strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
           ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
           stop_words='english')

## Convert the training and test data to TF-IDF vectors

In [12]:
# Fit the vocabulary and IDF weights on the training and test reviews together.
# NOTE(review): this lets test-set text influence the features; acceptable for
# a competition submission, but it is not a clean hold-out evaluation.
all_reviews = traindata + testdata
lentrain = len(traindata)

# This is the slow part! fit_transform is equivalent to fit() followed by
# transform() on the same data, without a second pass over the corpus.
X_all = tfv.fit_transform(all_reviews)

# Separate back into training and test sets.
X = X_all[:lentrain]
X_test = X_all[lentrain:]

# Training labels used by every classifier below. They were never defined in
# the notebook, so a fresh "Restart & Run All" failed with a NameError.
# Assumes labeledTrainData.tsv has a 'sentiment' label column - TODO confirm.
y_train = train['sentiment']


# Making Our Classifiers 

In [14]:
# Sanity check: (number of training reviews, number of TF-IDF features).
X.shape

(25000, 309798)

## Logistic Regression with GridSearchCV

In [15]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.grid_search import GridSearchCV

In [17]:
# Settings to try in the grid search: C is the inverse regularization
# strength. Only one value is listed, so the "search" is effectively a
# 20-fold cross-validated evaluation of C=30.
grid_values = {'C': [30]}

# The contest is scored on area under the ROC curve, so cross-validation is
# scored the same way.
# penalty is spelled 'l2' (lower case), the documented value; the upper-case
# form only worked because old releases normalised the case internally.
model_LR = GridSearchCV(LR(penalty='l2', dual=True, random_state=0),
                        grid_values, scoring='roc_auc', cv=20)

model_LR.fit(X, y_train)  # Fit the model.

GridSearchCV(cv=20,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, penalty='L2', random_state=0, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': [30]}, pre_dispatch='2*n_jobs', refit=True,
       score_func=None, scoring='roc_auc', verbose=0)

In [18]:
# Cross-validated ROC AUC (mean and std over the folds) for each C tried.
model_LR.grid_scores_

[mean: 0.96459, std: 0.00489, params: {'C': 30}]

In [19]:
# The logistic regression configured with the best-scoring parameters.
model_LR.best_estimator_

LogisticRegression(C=30, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, penalty='L2', random_state=0, tol=0.0001)

## MultinomialNB

In [20]:
from sklearn.naive_bayes import MultinomialNB as MNB

In [21]:
# Multinomial Naive Bayes with default settings, fitted on the full
# training matrix.
model_NB = MNB()
model_NB.fit(X, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
from sklearn.cross_validation import cross_val_score
import numpy as np

In [24]:
# 20-fold cross-validation scored by ROC AUC, so the number is directly
# comparable with the Logistic Regression grid-search score.
nb_cv_scores = cross_val_score(model_NB, X, y_train, cv=20, scoring='roc_auc')

# A single-argument print(...) produces the same text on Python 2 and
# Python 3; the Python 2 print statement is a syntax error on Python 3.
print("20 Fold CV Score for Multinomial Naive Bayes:  %s" % np.mean(nb_cv_scores))

20 Fold CV Score for Multinomial Naive Bayes:  0.949631232


## SGD classifier, suited to a large number of training features

In [25]:
from sklearn.linear_model import SGDClassifier as SGD

### Grid search scored by the area under the ROC curve

In [26]:
# Candidate values for alpha, the regularization parameter.
sgd_params = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}

# SGD classifier with modified Huber loss; the grid search uses 20-fold
# cross-validation scored by ROC AUC to find which alpha works best.
sgd_estimator = SGD(loss='modified_huber', shuffle=True, random_state=0)
model_SGD = GridSearchCV(sgd_estimator, sgd_params, cv=20, scoring='roc_auc')

# Fit the model.
model_SGD.fit(X, y_train)

GridSearchCV(cv=20,
       estimator=SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='modified_huber', n_iter=5, n_jobs=1, penalty='l2',
       power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': [6e-05, 7e-05, 8e-05, 0.0001, 0.0005]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

Again, similar to the Logistic Regression model, we can see which parameter did the best.

In [27]:
# Cross-validated ROC AUC (mean and std over the folds) for each alpha tried.
model_SGD.grid_scores_

[mean: 0.96477, std: 0.00484, params: {'alpha': 6e-05},
 mean: 0.96484, std: 0.00481, params: {'alpha': 7e-05},
 mean: 0.96486, std: 0.00480, params: {'alpha': 8e-05},
 mean: 0.96479, std: 0.00480, params: {'alpha': 0.0001},
 mean: 0.95869, std: 0.00484, params: {'alpha': 0.0005}]

# Predict on the test data

In [28]:
# Column 1 of predict_proba is the probability of the positive class, i.e.
# that the movie review was a 7 or greater; that is all we need.
LR_result = model_LR.predict_proba(X_test)[:, 1]

# Build the dataframe that will be written out.
LR_output = pd.DataFrame(data={"id": test["id"], "sentiment": LR_result})

# Write the .csv file we will submit to Kaggle.
LR_output.to_csv('Logistic_Reg_Proj2.csv', quoting=3, index=False)

Repeat this with the other two.

In [29]:
def write_submission(model, csv_path):
    """
    Score the test set with a fitted classifier and write a submission file.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``
        Classifier used to score ``X_test``.
    csv_path : str
        Destination of the submission .csv file.

    Returns
    -------
    tuple
        ``(probabilities, submission)``: the positive-class probabilities
        (column 1 of ``predict_proba``) and the DataFrame that was written,
        with columns "id" and "sentiment".
    """
    positive_proba = model.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame(data={"id": test["id"], "sentiment": positive_proba})
    submission.to_csv(csv_path, index=False, quoting=3)
    return positive_proba, submission


# Repeat the prediction step for Multinomial Naive Bayes.
MNB_result, MNB_output = write_submission(model_NB, 'MNB_Proj2.csv')

# Last, do the Stochastic Gradient Descent model with modified Huber loss.
SGD_result, SGD_output = write_submission(model_SGD, 'SGD_Proj2.csv')

## Majority vote

Got 0.95 accuracy.