# Recommendation_Engine

In [29]:
import pandas as pd
import numpy as np
import datapreprocessing as dp


# sklearn Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Visualization Packages
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib to seaborn's default styling for every figure drawn below.
sns.set()

# Exploratory Data Analysis (EDA)

In [2]:
# Read the review dump through the project helper in `datapreprocessing`.
# NOTE(review): dp.getDF is not visible here; the cells below use the result
# as a pandas DataFrame (head(), columns) — confirm against the helper.
df_Apps = dp.getDF("Apps_for_Android_5.json")

In [3]:
# Work on the first 10,000 reviews only (presumably to keep the notebook fast).
# .iloc makes the positional slice explicit, and .copy() detaches the subset
# from the full frame, so the column assignments made in later cells write to
# an independent DataFrame instead of to a slice of the original data.
df_Apps = df_Apps.iloc[:10000].copy()

### Reformat 'reviewTime' into a datetime

In [5]:
# Parse the raw 'reviewTime' values into pandas datetimes, replacing the column.
df_Apps['reviewTime'] = pd.to_datetime(df_Apps['reviewTime'])

In [6]:
# Preview the first rows of the review frame.
df_Apps.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,2013-11-02
1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,2011-12-05
2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,2012-05-21
3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,2012-12-06
4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,2014-02-01


### Filtering

In [7]:
# Show the column labels available in the review frame.
df_Apps.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [8]:
# Count how many reviews each product (keyed by its 'asin') received.
reviews = df_Apps.groupby('asin')['asin'].count()
print('Number of reviews for each product: {}'.format(reviews))

Number of reviews for each product: asin
B004A9SDD8     17
B004AFQAUA     62
B004AHBBPW     96
B004ALVL6W     95
B004AMAIZQ      7
B004AMDC86      7
B004ANC00Q     63
B004ANE2WU      6
B004ANMWPY    244
B004AZH4C8     86
B004AZSY4K     53
B004BN3YQE     76
B004C4FL5Y     24
B004CN7Y4G     13
B004DJXQDC      5
B004DKSUXC     11
B004DLLNFS     17
B004DLNBDA     76
B004DLNC4I    380
B004DLPXAO    571
B004DLTS4G      6
B004DLUS4A     17
B004DLUSD6     19
B004DLYJCC     17
B004DLZBU6     33
B004DM0UII    122
B004DM0X8U     18
B004DM0X9O     36
B004DM1OAQ    355
B004DM1ZG4     14
             ... 
B004GL88GW     65
B004GLI2GS      9
B004GMJKTK    283
B004GMP53U     86
B004GMP8AU      5
B004GMT14Y    327
B004GR1Q38     44
B004GT9ET4     44
B004GVUPFE     69
B004GVV4T0    144
B004GVYJCE     30
B004GVYXTS    482
B004GW0EW2      8
B004GWRMWC    189
B004GY8NTQ     42
B004GYCF7W     96
B004GYNPL2     58
B004GYOQ6A     47
B004GYVYV0    122
B004GYY714    563
B004GYYQLA     99
B004GZHRR4      5
B004H

In [9]:
# Order products from most to fewest reviews and print the ten at the top.
reviews_sorted = reviews.sort_values(ascending=False)
# sep='\n' reproduces the line break that two separate print() calls would emit.
print('Top 10 reviewed products: \n', reviews_sorted.head(10), sep='\n')

Top 10 reviewed products: 

asin
B004FOA84A    582
B004DLPXAO    571
B004GYY714    563
B004DPCSKI    484
B004GVYXTS    482
B004DLNC4I    380
B004DM1OAQ    355
B004DPIEF6    355
B004EBZX6W    348
B004FRX0MY    328
Name: asin, dtype: int64


In [10]:
# Re-sort from fewest to most reviews and print the ten at the bottom.
# NOTE(review): this rebinds `reviews_sorted`, replacing the descending order
# produced by the previous cell.
reviews_sorted = reviews.sort_values(ascending=True)
# sep='\n' reproduces the line break that two separate print() calls would emit.
print('Bottom 10 reviewed products: \n', reviews_sorted.head(10), sep='\n')

Bottom 10 reviewed products: 

asin
B004FKH6MG    5
B004E32A1Q    5
B004H3H3VA    5
B004H2TZTO    5
B004FG2SKU    5
B004FFLJ8I    5
B004GZHRR4    5
B004EVRBJ4    5
B004H6SKWS    5
B004GMP8AU    5
Name: asin, dtype: int64


# Modelling

### Logistic Regression

In [11]:
# Binary target for the classifier: 1 when the 'overall' rating is above 3,
# 0 otherwise (a rating of exactly 3 counts as 0).
# The comparison is vectorized over the whole column instead of looping in
# Python; 'int64' is spelled out so the dtype does not depend on the platform's
# default integer size.
df_Apps['pos_neg_rating'] = (df_Apps['overall'] > 3).astype('int64')
df_Apps.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,pos_neg_rating
0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,2013-11-02,0
1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,2011-12-05,1
2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,2012-05-21,1
3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,2012-12-06,1
4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,2014-02-01,1


In [63]:
# Model input is the free-text review; the target is the binary rating label.
X = df_Apps['reviewText']
y = df_Apps['pos_neg_rating']

In [64]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [65]:
# Report how many rows ended up in each part of the split.
for template, subset in (
    ('Train_data set size: {}', X_train),
    ('Train_target set size: {}', y_train),
    ('Test set size: {}', X_test),
):
    print(template.format(subset.shape[0]))

Train_data set size: 8000
Train_target set size: 8000
Test set size: 2000


### CountVectorizer

In [66]:
# Learn the vocabulary from the training reviews only, then encode them as a
# token-count matrix.
# NOTE(review): X_train is rebound from raw text to the count matrix, so this
# cell cannot be re-run without first re-running the train/test split.
vectorizer = CountVectorizer().fit(X_train)
X_train = vectorizer.transform(X_train)

In [67]:
# Train a logistic-regression classifier with default settings on the count
# features; fit() returns the estimator itself, so it can be bound directly.
log_reg = LogisticRegression().fit(X_train, y_train)
# Last expression of the cell: display the fitted estimator's parameters.
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
# 5-fold cross-validation of a fresh, default LogisticRegression on the
# training features (scored with the estimator's default scorer).
# NOTE(review): X_train was vectorized with a vocabulary fitted on the whole
# training split, so each fold's held-out reviews contributed to that
# vocabulary — consider cross-validating a CountVectorizer + LogisticRegression
# pipeline on the raw text instead.
log_reg_cv = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
)

In [69]:
# Average of the per-fold cross-validation scores.
log_reg_cv.mean()

0.8322499999999999

In [70]:
# Encode the held-out reviews with the vocabulary learned from the training
# set (transform only — the vectorizer is not refitted on test data).
# NOTE(review): X_test is rebound from raw text to the count matrix, so this
# cell cannot be re-run without first re-running the train/test split.
X_test = vectorizer.transform(raw_documents=X_test)

In [78]:
# Predict the binary rating label for every held-out review.
y_pred = log_reg.predict(X=X_test)

### Evaluation

In [80]:
# Per-class precision / recall / F1 on the held-out test set.
# 'class 0' is pos_neg_rating == 0 and 'class 1' is pos_neg_rating == 1.
target_name = ['class 0', 'class 1']
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=target_name)
print(report)

             precision    recall  f1-score   support

    class 0       0.70      0.63      0.66       548
    class 1       0.86      0.90      0.88      1452

avg / total       0.82      0.82      0.82      2000

