In [5]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
# Sentiment vocabulary used as the only features throughout this notebook.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
df = pd.read_csv('amazon_baby.csv')
# Coerce every review to text so any non-string entry (e.g. a missing review
# parsed as NaN) does not break the vectorizer. `np.str` was only an alias of
# the builtin `str`; the alias was deprecated in NumPy 1.20 and removed in
# 1.24, so call the builtin directly.
df['review'] = df['review'].apply(str)
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [7]:
# Vectorizer restricted to the hand-picked sentiment vocabulary.
counter = CountVectorizer(
    vocabulary=selected_words
)

In [8]:
# Fit on the full review column; the vocabulary was already fixed when
# `counter` was constructed.
counter.fit(df['review'])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate'])

In [9]:
# Count the selected words in every review, then densify the sparse result so
# it can be wrapped in a DataFrame.
review_counts_sparse = counter.transform(df['review'])
review_array = review_counts_sparse.toarray()

In [10]:
# Label the count columns with the vocabulary itself: a list vocabulary fixes
# the column order, so this matches the vectorizer's feature names without
# calling get_feature_names(), which newer scikit-learn releases no longer
# provide (removed in 1.2 in favour of get_feature_names_out()).
freq_matrix = pd.DataFrame(review_array, columns=selected_words)
# Total occurrences of each selected word across all reviews.
freq_matrix.sum()

awesome       4075
great        59536
fantastic     1765
amazing       2726
love         43867
horrible      1245
bad           4950
terrible      1282
awful          753
wow            461
hate          1285
dtype: int64

In [11]:
# Drop the 3-star reviews, then label the rest: True for ratings of 4 or more,
# False otherwise.
# `.copy()` makes the filtered frame independent of the original, so adding the
# `target` column below is a plain assignment rather than a write to a slice
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['rating'] != 3].copy()
df['target'] = df['rating'] >= 4
df.head()

Unnamed: 0,name,review,rating,target
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,True
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,True
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,True
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,True
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,True


In [12]:
# Hold out 20% of the labelled reviews for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['target'],
    train_size=.8,
    test_size=.2,
    random_state=0
)

In [13]:
# Separate vectorizer for the modelling step, again limited to the selected
# sentiment words.
count_vector = CountVectorizer(
    vocabulary=selected_words
)

In [14]:
# Turn the training reviews into count features, then encode the held-out
# reviews with the same vectorizer.
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [15]:
# Logistic regression with the library's default settings (no arguments given).
logistic_regression = linear_model.LogisticRegression()

In [16]:
# Train the classifier on the training-set word counts and their labels.
logistic_regression.fit(training_data, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# Learned coefficients, one per vocabulary word, in the column order of the
# training features (`selected_words`).
logistic_regression.coef_

array([[ 1.17190339,  0.86596357,  0.90039213,  1.05418113,  1.38092967,
        -2.2690806 , -0.97524971, -2.209991  , -2.05579658, -0.0973475 ,
        -1.43351813]])

In [18]:
def get_max(row):
    """Return the index label of the first entry of ``row`` equal to its maximum."""
    largest = np.max(row)
    at_largest = row == largest
    return row[at_largest].index[0]

In [19]:
def get_min(row):
    """Return the index label of the first entry of ``row`` equal to its minimum."""
    smallest = np.min(row)
    at_smallest = row == smallest
    return row[at_smallest].index[0]

In [20]:
# One-row frame of the fitted coefficients, one column per selected word.
t = pd.DataFrame(logistic_regression.coef_, columns=selected_words)
# Look up both extremes on the numeric coefficient columns only. Applying
# get_min to `t` after `whichMax` has been added would put a string label into
# the row, and taking the minimum over floats and a str raises TypeError on
# Python 3.
coefficients = t[selected_words]
t['whichMax'] = coefficients.apply(get_max, axis=1)
t['whichMin'] = coefficients.apply(get_min, axis=1)

In [21]:
# Show the coefficient table together with the whichMax / whichMin labels.
t

Unnamed: 0,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,whichMax,whichMin
0,1.171903,0.865964,0.900392,1.054181,1.38093,-2.269081,-0.97525,-2.209991,-2.055797,-0.097348,-1.433518,love,horrible


In [22]:
# Predicted labels for the held-out reviews.
predicted = logistic_regression.predict(testing_data)
predicted

array([ True,  True,  True, ...,  True,  True,  True])

In [23]:
# True labels of the held-out reviews as an array, for comparison with
# `predicted`.
y_test.values

array([ True,  True,  True, ..., False, False,  True])

In [24]:
# Share of held-out reviews whose predicted label matches the true label,
# reported as a percentage.
model_accuracy = accuracy_score(y_test, predicted)
print("accuracy is {}%".format(model_accuracy * 100))

accuracy is 84.5821714491%


In [25]:
# Per-review class probabilities for the held-out set (one column per class).
class_probabilities = logistic_regression.predict_proba(testing_data)
print(class_probabilities)

[[0.20776157 0.79223843]
 [0.20776157 0.79223843]
 [0.20776157 0.79223843]
 ...
 [0.20776157 0.79223843]
 [0.20776157 0.79223843]
 [0.20776157 0.79223843]]


## Majority Class classifier (happy-word count vs. sad-word count)

In [64]:
# Split the sentiment vocabulary into a positive and a negative word list and
# build one fixed-vocabulary counter for each.
happy = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'wow']
happy_counter = CountVectorizer(vocabulary=happy)

sad = ['horrible', 'bad', 'terrible', 'awful', 'hate']
sad_counter = CountVectorizer(vocabulary=sad)

In [65]:
# Count happy and sad words in each held-out review, then densify both
# matrices.
happy_sparse = happy_counter.fit_transform(X_test)
sad_sparse = sad_counter.fit_transform(X_test)
happy_array = happy_sparse.toarray()
sad_array = sad_sparse.toarray()

In [66]:
# Wrap each count matrix in a DataFrame with one named column per word.
sad_df = pd.DataFrame(data=sad_array, columns=sad)
happy_df = pd.DataFrame(data=happy_array, columns=happy)

In [67]:
# Per-review total of happy / sad words. Summing only the vocabulary columns
# (instead of every column in the frame) keeps the cell idempotent: re-running
# it does not fold the previously stored `count` column back into the total.
happy_df['count'] = happy_df[happy].sum(axis=1)
sad_df['count'] = sad_df[sad].sum(axis=1)

In [68]:
# Predict positive when a review has at least as many happy words as sad
# words; ties (including reviews with none of the words) count as positive.
majority_predictions = happy_df['count'] >= sad_df['count']

In [69]:
# Fraction of held-out reviews the word-count rule labels correctly.
majority_accuracy = accuracy_score(y_test, majority_predictions)
print("accuracy is {}".format(majority_accuracy))

accuracy is 0.843243081167
