In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

In [2]:
reviews = []

with open('reviews_Musical_Instruments_5.json', 'r') as f:
    raw_json = f.readlines()
    for record in raw_json:
        reviews.append(eval(record))

In [3]:
json_info = json.dumps(reviews)

In [4]:
df = pd.read_json(json_info)
df = df[['reviewText', 'overall']].copy()
df.sample(10)

Unnamed: 0,reviewText,overall
5118,Hearing ourselves the way we sound to others i...,5
1787,This shelf is so convenient and so often used ...,5
7947,Very close to the sound of an SM-58. Not quite...,4
6916,Yes!!!! finally...after reading endless tons o...,5
424,I've had this for about 11 years now. Even wit...,4
9956,"I don't travel often with my guitars, but need...",5
2573,it does what's it's suppose to do... holds a m...,5
4803,just used mine for the first time today love i...,5
8112,"Very very useful. Like seriously, unless you ...",5
5530,If you are have problems knowing what and wher...,5


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 2 columns):
reviewText    10261 non-null object
overall       10261 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.4+ KB


In [6]:
def remove_punctuation(text):
    import string
    trantab = str.maketrans({key: None for key in string.punctuation})
    return text.translate(trantab)

df = df[df['overall'] != 3]
df['clean_review'] = df['reviewText'].apply(remove_punctuation)

In [7]:
df.sample(5)

Unnamed: 0,reviewText,overall,clean_review
2777,Nice cables. So far I've not had any problems...,4,Nice cables So far Ive not had any problems w...
1732,I love being able to adjust the tension. Does ...,5,I love being able to adjust the tension Does n...
6660,"Promptly received, nicely packaged with protec...",5,Promptly received nicely packaged with protect...
7272,"Once I read about the Bugera B5, I knew I want...",5,Once I read about the Bugera B5 I knew I wante...
4252,I always seem to have to re-tune with the spri...,5,I always seem to have to retune with the sprin...


In [8]:
df['sentiment'] = df['overall'].apply(lambda rating: 1 if rating>3 else -1)
df['sentiment'].mean()

0.9015702392243651

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_review'],
                                                   df['sentiment'],
                                                   random_state=0)

In [10]:
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [11]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))
roc_auc_score(y_test, predictions)



0.6718504117458328

In [12]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['returned' 'unless' 'something' 'broke' 'doesnt' 'poor' 'return' 'iphone'
 'thought' 'month']

Largest Coefs: 
['perfect' 'perfectly' 'works' 'using' 'love' 'easy' 'best' 'nice'
 'awesome' 'great']


In [13]:
vect = TfidfVectorizer(min_df=5).fit(X_train)
X_train_vectorized = vect.transform(X_train)

model=LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
roc_auc_score(y_test, predictions)



0.5

In [14]:
feature_names = np.array(vect.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf: 
['ignore' 'retains' 'underlying' 'xp' 'ampthe' 'amounts' 'marshalls'
 'characteristics' 'saturation' 'hz']

Largest tfidf: 
['excellent' 'great' 'useful' 'pleased' 'stuff' 'deal' 'beginner'
 'polyweb' 'picks' 'awesome']


In [15]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

model=LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
roc_auc_score(y_test, predictions)



0.6522924846836662

In [16]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['doesnt' 'something' 'returned' 'even' 'thought' 'just use' 'poor' 'low'
 'broke' 'at all']

Largest Coefs: 
['nice' 'works' 'great' 'perfect' 'love' 'easy' 'perfectly' 'pick'
 'little' 'awesome']


In [17]:
significant_words = ['nice', 'works', 'great', 'perfect', 'love', 'easy', 'perfectly', 'pick',
 'little', 'awesome','excellent', 'useful', 'pleased', 'stuff', 'deal', 'beginner',
 'polyweb', 'picks', 'best']

vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
X_train_vectorized = vectorizer_word_subset.fit_transform(X_train)

model=LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vectorizer_word_subset.fit_transform(X_test))
roc_auc_score(y_test, predictions)



0.5

In [18]:
feature_names = np.array(vectorizer_word_subset.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['stuff' 'useful' 'deal' 'beginner' 'polyweb' 'nice' 'picks' 'little'
 'pick' 'best']

Largest Coefs: 
['awesome' 'pleased' 'easy' 'perfect' 'perfectly' 'love' 'works' 'great'
 'excellent' 'best']


In [19]:
vectorizer_word_subset = TfidfVectorizer(min_df=5, vocabulary=significant_words)
X_train_vectorized = vectorizer_word_subset.fit_transform(X_train)

model=LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vectorizer_word_subset.fit_transform(X_test))
roc_auc_score(y_test, predictions)



0.5

In [20]:
feature_names = np.array(vectorizer_word_subset.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf: 
['nice' 'polyweb' 'beginner' 'deal' 'stuff' 'pleased' 'useful' 'excellent'
 'picks' 'awesome']

Largest tfidf: 
['best' 'little' 'works' 'great' 'perfect' 'love' 'easy' 'perfectly'
 'pick' 'awesome']


In [21]:
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)

model=DummyClassifier()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vectorizer_word_subset.fit_transform(X_test))
roc_auc_score(y_test, predictions)

0.49861350893007195