# Bag of Words Meets Bags of Popcorn Submission

Let's import some basic libraries to get things rolling.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

Alright, on to cleaning the text. Some of these steps are a bit redundant, but after a few trials I found that this combination best cleans out the noise.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import string
import re

# One-element cache holding (stop-word set, lemmatizer) so they are built once
# rather than on every call to clean_me.
_CLEAN_ME_RESOURCES = []


def _clean_me_resources():
    """Return the (stop-word frozenset, lemmatizer) pair, building it on first use."""
    if not _CLEAN_ME_RESOURCES:
        # A frozenset gives constant-time membership tests; the stop words are
        # the NLTK English list plus every ASCII punctuation character.
        stop_words = frozenset(
            stopwords.words('english') + list(string.punctuation)
        )
        _CLEAN_ME_RESOURCES.append((stop_words, WordNetLemmatizer()))
    return _CLEAN_ME_RESOURCES[0]


def clean_me(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. strip HTML markup and keep only the text content;
      2. replace every character that is not an ASCII letter with a space;
      3. tokenize, lower-case, and drop stop words;
      4. lemmatize each remaining token with WordNet.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Name the parser explicitly: without it bs4 guesses based on what is
    # installed, which warns and can differ between machines.
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub("[^a-zA-Z]", " ", text)

    stop_words, lemmatizer = _clean_me_resources()

    lowered = (tok.lower() for tok in word_tokenize(text))
    kept = [tok for tok in lowered if tok not in stop_words]

    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)

Now to load the data and clean the review column right away. Naturally, I took out my own file paths, but you can easily replace the placeholder strings with yours.

In [None]:
# The data was downloaded and saved locally beforehand; 'XX' is a placeholder
# for the path to each tab-separated file.
df = pd.read_table(r'XX')
# Clean the training reviews as soon as they are loaded.
df['review'] = df['review'].apply(clean_me)

test_df = pd.read_table(r'XX')
# Same cleaning for the test reviews.
test_df['review'] = test_df['review'].apply(clean_me)

Alright, now on to the actual work. I always build a parameter grid with many options, then print out the best parameters to speed up the process in the future. For the sake of transparency, I left the whole grid here.

Admittedly, I tried many classification methods (MNB, KClusters, Random Forest), but simple Logistic Regression returned the highest accuracy score.

For further transparency, let's run a train-test split so that we can see the working model.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the labelled reviews so the tuned model can be scored on
# data the grid search never saw (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(df.review, df.sentiment, test_size = 0.1, random_state=0)

# TF-IDF features -> max-abs scaling -> logistic regression.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('norma', MaxAbsScaler()),
    ('reg', LogisticRegression())
])

# Keys follow the Pipeline convention <step name>__<parameter name>.
param_grid = [{
    'tfidf__analyzer' : ['word', 'char'],
    'reg__class_weight': ['balanced', None],
    'reg__fit_intercept': [True, False]
    }]

grid = GridSearchCV(pipe, param_grid, cv=7)
grid.fit(X_train, y_train)

pred = grid.predict(X_test)

print(grid.best_params_)
# scikit-learn metrics take (y_true, y_pred); with this order the confusion
# matrix rows are the true labels and the columns are the predictions.
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))

#This takes a minute to run. When I did it, it came back with a 0.898 accuracy score.

In [None]:
# TF-IDF features -> max-abs scaling -> logistic regression, this time fitted
# on every labelled review.
pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('norma', MaxAbsScaler()),
    ('reg', LogisticRegression()),
])

# Search space, keyed as <step name>__<parameter name>.
param_grid = [
    dict(
        tfidf__analyzer=['word', 'char'],
        reg__class_weight=['balanced', None],
        reg__fit_intercept=[True, False],
    )
]

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)
grid.fit(df['review'], df['sentiment'])

# Predict the test reviews and store the labels next to them.
pred = grid.predict(test_df['review'])
test_df['sentiment'] = pred

To close off, let's make the submission file.

In [32]:
# Keep only the review id and the predicted label, then write them out
# without the DataFrame index.
sub_df = test_df.loc[:, ['id', 'sentiment']]
sub_df.to_csv(path_or_buf='submission', index=False)

In [33]:
# Read back the file written above as a sanity check. The relative path matches
# the one passed to to_csv, so this cell works on any machine.
pd.read_csv('submission')

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,1
4,12128_7,1
...,...,...
24995,2155_10,1
24996,59_10,1
24997,2531_1,0
24998,7772_8,1
