In [140]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from string import punctuation
from textblob import Word
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [203]:
# Load the tab-separated training file; it has no header row, so the two
# columns are named explicitly.
train_data = pd.read_csv(
    "train_file.dat",
    sep="\t",
    names=["label", "Review"],
)

In [204]:
# Inspect the loaded frame (columns: label, Review).
train_data

Unnamed: 0,label,Review
0,1,This book is such a life saver. It has been s...
1,1,I bought this a few times for my older son and...
2,1,"This is great for basics, but I wish the space..."
3,1,This book is perfect! I'm a first time new mo...
4,1,During your postpartum stay at the hospital th...
...,...,...
18501,-1,"I really liked this monitor at first, but the ..."
18502,-1,Apparently you get what you pay for. I've use...
18503,-1,The old saying holds true with this product --...
18504,-1,We did a great deal of research before purchas...


In [205]:
# Inspect the raw review text column on its own.
train_data.Review

0        This book is such a life saver.  It has been s...
1        I bought this a few times for my older son and...
2        This is great for basics, but I wish the space...
3        This book is perfect!  I'm a first time new mo...
4        During your postpartum stay at the hospital th...
                               ...                        
18501    I really liked this monitor at first, but the ...
18502    Apparently you get what you pay for.  I've use...
18503    The old saying holds true with this product --...
18504    We did a great deal of research before purchas...
18505    I ordered these after having great success wit...
Name: Review, Length: 18506, dtype: object

In [206]:
# English stop words from NLTK's stopwords corpus; review_to_words filters
# tokens against this list.
stop_words = stopwords.words('english')

In [207]:
def contractions(words):
    """Normalize tokens: lower-case plain words and expand contraction pieces.

    Each token is handled as follows:

    * purely alphanumeric tokens are kept, lower-cased;
    * tokens containing a known contraction fragment (``n't``, ``'re``,
      ``'ll``, ``'ve``, ``'m``, ``'s``) are replaced by the expanded word
      (``not``, ``are``, ``will``, ``have``, ``am``, ``is``);
    * every other token (punctuation, hyphenated words, ...) is dropped.

    Fragment matching is case-insensitive, so ``"N'T"`` expands to ``"not"``
    just like ``"n't"`` does.

    Args:
        words: Iterable of string tokens.

    Returns:
        list[str]: The normalized tokens, in their original order.
    """
    # Checked in this order; the first fragment found in the token wins.
    expansions = (
        ("n't", "not"),
        ("'re", "are"),
        ("'ll", "will"),
        ("'ve", "have"),
        ("'m", "am"),
        ("'s", "is"),
    )

    words_without_punc = []
    for word in words:
        lowered = word.lower()
        if word.isalnum():
            words_without_punc.append(lowered)
            continue
        # Match against the lower-cased token so upper-case contractions
        # are expanded instead of being silently discarded.
        for fragment, expansion in expansions:
            if fragment in lowered:
                words_without_punc.append(expansion)
                break

    return words_without_punc

In [219]:
def review_to_words(series):
    """Clean raw review texts into space-joined strings of lemmatized tokens.

    For every entry the text is tokenized with ``word_tokenize``, passed
    through ``contractions``, lower-cased, stripped of punctuation tokens,
    all-digit tokens and stop words, and finally lemmatized with TextBlob's
    ``Word.lemmatize``.

    Args:
        series: Iterable of reviews; each entry is coerced with ``str()``
            before tokenization.

    Returns:
        list[str]: One cleaned, space-joined string per input review, in
        input order.
    """
    # Build the lookup once per call: set membership is constant time,
    # whereas the module-level stop_words list is scanned for every token.
    # NOTE(review): NLTK's English stop words appear to include negations
    # such as "not", so the "n't" -> "not" expansion is filtered out again
    # here - confirm that this is intended for sentiment classification.
    stop_set = set(stop_words)

    reviews = []
    for review in series:
        # Word tokenization followed by contraction expansion
        tokens = contractions(word_tokenize(str(review)))

        kept = []
        for token in tokens:
            # Lower case conversion
            token = token.lower()
            # Punctuation removal (substring test against string.punctuation)
            if token in punctuation:
                continue
            # Number removal
            if token.isdigit():
                continue
            # Stopword removal
            if token in stop_set:
                continue
            # Lemmatization
            kept.append(Word(token).lemmatize())

        reviews.append(" ".join(kept))
    return reviews

In [212]:
# Features are the raw review texts; targets are the labels.
X, y = train_data["Review"], train_data["label"]

In [220]:
# Hold out 10% of the rows for validation; random_state is fixed so the split
# is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=0)
# Replace the raw training texts with their cleaned form (a list of strings).
X_train = review_to_words(X_train)

In [None]:
# Preview a few cleaned training reviews rather than echoing the whole list.
X_train[:5]

In [222]:
# Replace the raw validation texts with their cleaned form. This rebinds
# X_valid, so re-running only this cell would clean already-cleaned text.
X_valid = review_to_words(X_valid)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the pipeline: TF-IDF features feeding a logistic regression classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=0))
])

# Hyperparameters shared by every candidate
common_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [1, 10, 100],
    'clf__max_iter': [10000]
}

# Define the hyperparameters to search over.
# LogisticRegression's default solver (lbfgs) does not support penalty='l1',
# so mixing 'l1' and 'l2' in one grid makes every l1 candidate fail to fit.
# The grid is split so that l2 keeps the default solver and l1 is paired with
# liblinear, which supports it.
param_grid = [
    {**common_grid, 'clf__penalty': ['l2']},
    {**common_grid, 'clf__penalty': ['l1'], 'clf__solver': ['liblinear']},
]

# Create a GridSearchCV object (10-fold CV, all available cores)
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Get the best estimator (the pipeline refit with the best hyperparameters)
best_estimator = grid_search.best_estimator_

print("Best hyperparameters: ", best_params)

In [228]:
# Mean cross-validated score of the best hyperparameter combination.
grid_search.best_score_

0.8774543691350415

In [227]:
# Score of the best pipeline on the held-out validation split.
best_estimator.score(X_valid, y_valid)

0.8849270664505673

In [239]:
# Read the raw test reviews, one per line. The context manager closes the
# file even if reading raises.
with open("1675198994_6787539_1567602457_126649_test.dat") as test_file:
    test_data = test_file.readlines()

In [None]:
# Apply the same cleaning used for the training data to the test reviews.
clean_test_data = review_to_words(test_data)
# Preview a few cleaned reviews rather than echoing the whole list.
clean_test_data[:5]

In [241]:
# Predict labels for the cleaned test reviews with the best pipeline.
preds_test = best_estimator.predict(clean_test_data)

In [242]:
# Inspect the predicted labels.
preds_test

array([ 1,  1,  1, ..., -1, -1, -1])

In [441]:
# Write the test predictions to disk, one label per line, with neither a
# header row nor an index column.
output = pd.DataFrame(preds_test)
output.to_csv("predictions.dat", header=False, index=False)