In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
reviews = []

with open('reviews_Musical_Instruments_5.json', 'r') as f:
    raw_json = f.readlines()
    for record in raw_json:
        reviews.append(eval(record))

In [3]:
json_info = json.dumps(reviews)

In [4]:
df = pd.read_json(json_info)
df = df[['reviewText', 'overall']].copy()
df.sample(10)

Unnamed: 0,reviewText,overall
8838,Well made Capo. Works as designed on my acoust...,4
8703,This simple rubber device works great on flat ...,5
2851,"Good strings for the price, nice ring to them....",5
3336,I'll re rate this item if my problem is fixed ...,3
3389,Have not had the chance to stick it to my guit...,4
6761,Great polishing cloth. Not much to say here ot...,5
8104,The mat is just the right size for your guitar...,5
4486,This was the easiest tuner I had. I am sorry ...,5
5666,"Very nice guitar slide. The glass is nice, sli...",5
3067,I works ok but you have to observe a few detai...,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 2 columns):
reviewText    10261 non-null object
overall       10261 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.4+ KB


In [6]:
def remove_punctuation(text):
    import string
    trantab = str.maketrans({key: None for key in string.punctuation})
    return text.translate(trantab)

df = df[df['overall'] != 3]
df['clean_review'] = df['reviewText'].apply(remove_punctuation)

In [7]:
df.sample(5)

Unnamed: 0,reviewText,overall,clean_review
9148,Disclaimers:1. Obtained free as part of Amazon...,5,Disclaimers1 Obtained free as part of Amazons ...
1519,Elixir strings are very nice. so slick and la...,5,Elixir strings are very nice so slick and las...
4624,"I dunno, cause I'm stingy with stars. These h...",4,I dunno cause Im stingy with stars These have...
69,This amp plug is great for the price its 10 ft...,5,This amp plug is great for the price its 10 ft...
906,it's kinda small for my hand but it works grea...,5,its kinda small for my hand but it works great...


In [8]:
df['sentiment'] = df['overall'].apply(lambda rating: 1 if rating>3 else -1)
df['sentiment'].mean()

0.9015702392243651

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_review'],
                                                   df['sentiment'],
                                                   random_state=0)

In [10]:
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [11]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
roc_auc_score(y_test, predictions)



0.6718504117458328