In [140]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
from string import punctuation
from textblob import Word
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [203]:
train_data = pd.read_table("1675198994_6220334_1567602457_1187546_train_file.dat", names=['label', 'Review'])

In [204]:
train_data

Unnamed: 0,label,Review
0,1,This book is such a life saver. It has been s...
1,1,I bought this a few times for my older son and...
2,1,"This is great for basics, but I wish the space..."
3,1,This book is perfect! I'm a first time new mo...
4,1,During your postpartum stay at the hospital th...
...,...,...
18501,-1,"I really liked this monitor at first, but the ..."
18502,-1,Apparently you get what you pay for. I've use...
18503,-1,The old saying holds true with this product --...
18504,-1,We did a great deal of research before purchas...


In [205]:
train_data.Review

0        This book is such a life saver.  It has been s...
1        I bought this a few times for my older son and...
2        This is great for basics, but I wish the space...
3        This book is perfect!  I'm a first time new mo...
4        During your postpartum stay at the hospital th...
                               ...                        
18501    I really liked this monitor at first, but the ...
18502    Apparently you get what you pay for.  I've use...
18503    The old saying holds true with this product --...
18504    We did a great deal of research before purchas...
18505    I ordered these after having great success wit...
Name: Review, Length: 18506, dtype: object

In [206]:
stop_words = stopwords.words('english')

In [207]:
def contractions(words):
    words_without_punc = []
    for word in words:
        if word.isalnum():
            words_without_punc.append(word.lower())
        elif "n't" in word:
            words_without_punc.append("not")
        elif "'re" in word:
            words_without_punc.append("are")
        elif "'ll" in word:
            words_without_punc.append("will")
        elif "'ve" in word:
            words_without_punc.append("have")
        elif "'m" in word:
            words_without_punc.append("am")
        elif "'s" in word:
            words_without_punc.append("is")

    return words_without_punc

In [219]:
def review_to_words(series):
    reviews = []
    for review in series:
        #Word Tokenization
        review = word_tokenize(str(review))
        #Perform the Contractions on the reviews
        review = contractions(review)
        #Lower case conversion
        review = [word.lower() for word in review]
        #Punctuation removal
        review = [x for x in review if not x in punctuation]
        #Number removal
        review = [x for x in review if not x.isdigit()]
        #Remove special characters
#         review = review.replace('\W', '', regex=True) 
        #Stopword removal
        review = [x for x in review if x not in stop_words]
        #Frequent word removal
#         temp = review.apply(lambda words: " ".join(words))
#         freq = pd.Series(temp).value_counts()[:10]
#         review = review.apply(lambda words: [x for x in words if x not in freq.keys()])

        #Lemmatization
        review = " ".join([Word(x).lemmatize() for x in review])
        reviews.append(review)
    return reviews

In [212]:
X = train_data.Review
y = train_data.label

In [220]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=0)
X_train = review_to_words(X_train)

In [221]:
X_train

['first week generally satisfied product though found many others spray strong definitely getting dunking looked sprayer seemed better review competitor chose pricy item within month first two week bracket used hang sprayer broke finished look like metal plastic simply broke drape sprayer behind toilet seat across toilet ideal huge full blast spray far strong even using much lower force found spray strong enough clean diaper also meant much splatter cleaning wall floor every time using sprayer solved one problem created another washing bathroom fecal matter sprayed especially disgusting sprayed solved problem buying device available amazon clip diaper spray spray pal name highly recommend number one problem deal breaker within six week using sprayer button press spayer initiate spray became stuck meant continuous water flow yes turn water flow turning valve near bottom toilet one every spray session also mean spray nonstop valve open ready soon turn valve annoying big waste inconsisten

In [222]:
X_valid = review_to_words(X_valid)

In [225]:
from sklearn.model_selection import GridSearchCV

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(random_state=0))
])

# Define the hyperparameters to search over
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__max_iter': [10000]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Get the best estimator
best_estimator = grid_search.best_estimator_

print("Best hyperparameters: ", best_params)

180 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/chaithrabekal/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/chaithrabekal/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/chaithrabekal/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/chaithrabekal/opt

Best hyperparameters:  {'clf__C': 10, 'clf__max_iter': 10000, 'clf__penalty': 'l2', 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}


In [228]:
grid_search.best_score_

0.8774543691350415

In [227]:
best_estimator.score(X_valid, y_valid)

0.8849270664505673

In [239]:
test_file = open("1675198994_6787539_1567602457_126649_test.dat")
test_data = test_file.readlines()
test_file.close()

In [240]:
clean_test_data = review_to_words(test_data)
clean_test_data

['perfect new parent able keep track baby feeding sleep diaper change schedule first two half month life made life easier doctor would ask question habit right',
 'help know exactly baby day gone mother law watching go work also section write note let know anything may need could happier book',
 'wanted alternative printing daily log sheet nanny fill worked great longer searching daughter bag crumpled piece paper day also nice able look back previous day week eating sleeping pattern would preferred cover held well far',
 'month old son spend half day mother half neighbor worked track activity loosely allowed get idea schedule developing much milk eating best way cohesion life work',
 'baby tracker brand book absolute best tracker available track nap feeding diaper change activity note easy use nice clean started using newborn baby tracker track night day gave nice way track feeding note important milestone used entire book bought baby tracker track day since daughter sleeping night car

In [241]:
preds_test = best_estimator.predict(clean_test_data)

In [242]:
preds_test

array([ 1,  1,  1, ..., -1, -1, -1])

In [441]:
# Save test predictions to file
output = pd.DataFrame(preds_test)
output.to_csv('submission5.dat', index=False, header=False, escapechar=None)