In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import os

In [2]:
# Load both input tables: the review records and the features table
# (the two are joined on 'asin' further down).
reviews, features = (
    pd.read_csv(csv_path)
    for csv_path in ('input/df_with_title2.csv', 'input/final_features_data.csv')
)

In [3]:
# Inner join: keep only reviews whose 'asin' also appears in the features table.
total = reviews.merge(features, on="asin", how="inner")

In [4]:
# Row/column count followed by the column names, one per output line.
print(reviews.shape, reviews.columns, sep='\n')

(22927, 10)
Index(['asin', 'helpful', 'overall', 'reviewText', 'reviewTime', 'reviewerID',
       'reviewerName', 'summary', 'unixReviewTime', 'title'],
      dtype='object')


In [5]:
# Column dtypes and non-null counts for the reviews table.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22927 entries, 0 to 22926
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   asin            22927 non-null  object
 1   helpful         22927 non-null  object
 2   overall         22927 non-null  int64 
 3   reviewText      22912 non-null  object
 4   reviewTime      22927 non-null  object
 5   reviewerID      22927 non-null  object
 6   reviewerName    21941 non-null  object
 7   summary         22926 non-null  object
 8   unixReviewTime  22927 non-null  int64 
 9   title           22927 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.7+ MB


In [6]:
# Summary statistics for the numeric columns of the reviews table.
reviews.describe()

Unnamed: 0,overall,unixReviewTime
count,22927.0,22927.0
mean,4.087888,1356391000.0
std,1.248639,35992250.0
min,1.0,1077149000.0
25%,4.0,1345162000.0
50%,5.0,1362096000.0
75%,5.0,1379117000.0
max,5.0,1406074000.0


# TOP 10 PRODUCTS

In [7]:
# Mean star rating per product title, best first.
# Only the mean of 'overall' is kept, so aggregate that one column directly
# instead of running describe() over every numeric column of every group and
# then discarding all but one statistic.
# NOTE(review): no minimum review count is applied, so products with a single
# 5-star review tie at the top — confirm this is the intended ranking.
rating = (
    reviews.groupby('title')['overall']
    .mean()
    .to_frame('mean')
    .sort_values('mean', ascending = False)
)
rating.head(10)

Unnamed: 0_level_0,mean
title,Unnamed: 1_level_1
i.Trek Super Mount Metal Smartphone Tripod Adapter and Stand - Retail Packaging - Yellow,5.0
"Premium Soft Argyle Flexi TPU Gel Skin Case Cover for Samsung Fascinate, Clear Checkers",5.0
Samsung AT&T INFUSE 4G SGH-i997 Battery Charger with Stand [ET-CGPK002GSTA] - Retail Packaging - Black,5.0
"Black Leather Pouch Carrying Case w/Belt Clip and Loops for HTC Evo 4G, HD2, ...",5.0
Samsung ET-CGPK009GSTA Sprint SPH-D710 Battery Charger with Stand - Charger - Retail Packaging - Black,5.0
Belkin USB Charger + Sync / Charge Cable for Apple iPhone (Black),5.0
Wilson Electronics 4-inch 4G Mini Magnet-Mount Antenna w/ SMA Male Connector,5.0
"Ballistic SG Black Silicone, Black TPU, and Black PC for Samsung Nexus S and Samsung Nexus S 4G - 1 Pack - Case - Retail Packaging",5.0
"RAVPower 3100mAh Li-ion Battery For Samsung Galaxy Note 2/II, GT-N7100, SCH-I605(Verizon), SCH-R950(U.S. Cellular), SGH-I317(AT&T), SGH-T889(T-Mobile), SPH-L900(Sprint), fits Samsung EB595675LA",5.0
Motorola H385 Bluetooth Headset (Pink),5.0


# MOST POPULAR PRODUCTS (BY NUMBER OF REVIEWS)

In [8]:
from collections import Counter
# Tally how many review rows carry each product title, then show the ten most frequent.
productTypeCount = Counter(reviews['title'].tolist())
productTypeCount.most_common(10)

[('iPhone 4 / 4S Anti-Glare, Anti-Scratch, Anti-Fingerprint - Matte Finishing Screen Protector',
  694),
 ('iOttie  Easy One Touch Car Mount Holder for iPhoneX 8 7s 6s Plus 6s 5s 5c Samsung Galaxy S9 S8 Edge S7 S6',
  557),
 ('Galaxy S3 Case, Caseology [Daybreak Series] Slim Fit Shock Absorbent Cover [Pink] [Slip Resistant] for Samsung Galaxy S3 - Pink',
  510),
 ('Stylus Pen, New Trent Arcadia (1PC) Limir [Dual Purpose] Stylus/Styli for Capacitive Touch Screen Smartphones and Tablets. [2-in-1: Stylus Micro-Knit Tip + Fine Ball Pen]',
  348),
 ('External Battery, New Trent iCarrier 12000mAh Portable Dual USB Port External Battery Charger/Power Pack for Smartphones, iPhone 6, Iphone 6 Plus, S5, Nexus 6, Note 4, Tablets, iPad Air 2, iPad mini 3, and more (Now w/Micro-USB charge port)',
  339),
 ('iPhone 4 Screen Protector, Tech Armor High Definition HD-Clear Apple iPhone 4 / 4S Film Screen Protector [3-Pack]',
  325),
 ('New Trent: iTorch 5200mAh Ultra Portable USB Port External Battery 

# RECOMMENDATION SYSTEM

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack

In [10]:
# Order the reviews by title, Z to A; sort_values returns a new frame and
# leaves `reviews` itself unchanged.
reviewsSorted = reviews.sort_values(by='title', ascending=False)
# Independent snapshot of the raw reviews table.
reviewsCopy = reviews.copy()

In [11]:
# Peek at the first rows of the title-sorted frame.
reviewsSorted.head(10)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,title
3612,B003U4TLL2,"[0, 0]",1,"second try with these, same result for my setu...","07 8, 2013",A2HP63L85NI0CC,"Greg Fridder ""Fridder""","2nd try same result, no workie",1373241600,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3605,B003U4TLL2,"[5, 6]",2,I cant say I am completely unhappy with this d...,"03 23, 2012",A2IKMRNKQ12ZPT,"Amazon.Buyer ""Never argue with an 1d10t...the...",The Boost in the name and description is misle...,1332460800,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3938,B003VOW5WI,"[0, 1]",1,"Doesn't works, I have the same problem, no pho...","07 13, 2014",A2Y8WWR06KCIUZ,Ezio,no recommended,1405209600,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3937,B003VOW5WI,"[1, 1]",4,this booster...in a word...works!It does what ...,"03 31, 2013",ADZNLCXNYH3FN,"Dwight Howard ""dahoward77""",To extend in weak cell areas,1364688000,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3936,B003VOW5WI,"[1, 1]",5,I am a freak at insulating my home. Aluminum ...,"07 9, 2013",AQCA8WBO82VQH,Duncan Cunningham,It works - If only for up to 3G,1373328000,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3935,B003VOW5WI,"[848, 866]",5,"This review ran a bit longer than I expected, ...","10 2, 2010",ATFBVUXDIRXT6,D. Matheny,Perfect 5-bar solution when installed correctly!,1285977600,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3934,B003VOW5WI,"[1, 1]",1,tried this piece of junk at three different si...,"02 14, 2013",A3SMOIJ8TF3V4U,dirkjal,total unadulterated crap,1360800000,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3933,B003VOW5WI,"[2, 2]",5,Something you might want to do is have a high ...,"01 8, 2014",A2UFJUXRBBZ8UU,"David Sandbeck ""hallgrd""","This works, as a plug and play solution for po...",1389139200,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3932,B003VOW5WI,"[1, 1]",5,"Put the external antenna in the attic, ran the...","07 23, 2013",AZDKXWAE79HJ9,"C. Ray ""GaDisciple""",Perfect Solution,1374537600,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3931,B003VOW5WI,"[0, 0]",3,The product does seem to increase the availabl...,"09 18, 2011",A8JP7MZH2S3X3,cardton,works ok but if you have sprint...,1316304000,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...


In [12]:
# Keep one row per product: the last row for each 'asin' in the title-sorted frame.
# The explicit .copy() makes reviewsUnique an independent frame, so the later
# rewrite of its 'title' column is an unambiguous write to this frame rather
# than to something pandas may treat as derived from reviewsSorted
# (the situation that triggers SettingWithCopyWarning).
reviewsUnique = reviewsSorted.drop_duplicates(subset = 'asin', keep = "last").copy()

In [13]:
# After de-duplicating on 'asin', show how many rows remain and a sample of them.
print(reviewsUnique.shape)
reviewsUnique.head(10)

(1184, 10)


Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,title
3943,B003VOW5WI,"[0, 0]",5,WOW!!! Works great. Put it up on the roof. Pro...,"12 20, 2011",A3B4TRVL7JYHHD,"Lious Friend ""Credo ut intelligam""",Great cell phone booster!,1324339200,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
3606,B003U4TLL2,"[0, 0]",4,"Works as advertised, suction cups kind of chee...","09 19, 2013",A1MASW50Z8DVXH,ArvadaDude,Wireless extender,1379548800,zBoost ZB545 SOHO Dual Band Cell Phone Signal ...
7631,B0048IZL5E,"[0, 0]",4,Surprisingly snug on the sides for the touch. ...,"05 15, 2014",AXO4PQU0XG3TG,Dwight,Fits iPod touch with some headroom,1400112000,uNu Power DX PLUS External Protective Battery ...
15534,B006R94O7I,"[1, 1]",4,"These aren't too bad, love the matte back prot...","01 30, 2012",A17K8TKHWF8MWZ,CheetahTrans,Nice screen protectors,1327881600,splash MASQUE Screen Protector Film Clear (Inv...
15729,B006UYDCI2,"[0, 0]",5,This came in early and exactly like the pictur...,"05 12, 2013",A3A61D0Z02NW8,Baines,What you see is what you get!,1368316800,niceeshop(TM) White 3D Bling Crystals Rhinesto...
18623,B007SRWDUI,"[0, 0]",5,The color is a tad greener than the photo yet ...,"03 4, 2013",A1S67AHQEO3U13,Victoria,Green Polka Dot Gel Case Iphone 4,1362355200,niceeshop Green&White HM Polka Dots Gel Flexib...
8554,B004GJMTHS,"[0, 0]",5,Very pretty color. Easy to put on. Took a ve...,"12 4, 2012",A2YQF017D4AO0S,Dorothy Ebacher,purple case,1354579200,niceEshop(TM) TPU Rubber Skin Case Compatible ...
14585,B006FCO0NU,"[0, 0]",3,"Good case for it being cheap, but you can defi...","01 17, 2013",A1T14MCUQEN8KF,"Andrew J. Rectenwald ""arecten""",It's okay,1358380800,niceEshop Snap-on Rubber Coated Case compatibl...
4373,B003XNGY2Y,"[1, 1]",3,The reviewers that said that this is not a ver...,"01 17, 2011",A2XK8WOE7ZP52G,"D. Jenkins ""Jinkx""",Must agree with previous reviewers,1295222400,niceEshop Premium Soft Silicone Gel Skin Case ...
18051,B007OBVFRU,"[0, 0]",3,"Although this case was inexpensive, I would ex...","01 16, 2014",A3D27FSC1OZPN,Kajialee,Not what I expected....,1389830400,niceEshop Brown Wallet Style Carbon Fiber Prin...


In [14]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [18]:
# Ask the NLTK downloader for the 'punkt' resource.
# NOTE(review): presumably intended for word_tokenize, which is imported but
# not called in the visible cells — confirm this download is still needed.
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [19]:
# The next cell reads only the stopword corpus (stopwords.words('english')),
# so fetch that single resource instead of the entire NLTK collection.
nltk.download('stopwords')

[nltk_data] Error loading all: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [20]:
# English stopword list from NLTK, held as a set for fast membership tests.
stopWords = {*stopwords.words('english')}
stopWords

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\user2/nltk_data'
    - 'C:\\Users\\user2\\anaconda3\\nltk_data'
    - 'C:\\Users\\user2\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\user2\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\user2\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
def nlpPreprocessing(totalText, index, column):
    """Normalise one text value and store it back into the global reviewsUnique.

    The text is split on whitespace; every token has its non-alphanumeric
    characters removed and is lower-cased. Tokens present in the global
    stopWords set are dropped. Each surviving token is appended followed by a
    single space, so a non-empty result ends with a trailing space.

    Parameters
    ----------
    totalText : object
        The raw value to clean. Anything that is not a str (ints, NaN floats
        from missing cells, ...) is left untouched instead of failing on
        .split().
    index : label
        Row label in reviewsUnique that receives the cleaned text.
    column : str
        Column in reviewsUnique that receives the cleaned text.

    Returns
    -------
    None
        The result is written into reviewsUnique as a side effect.
    """
    if not isinstance(totalText, str):
        return
    cleaned = ""
    for token in totalText.split():
        word = "".join(ch for ch in token if ch.isalnum()).lower()
        if word not in stopWords:
            cleaned += word + " "
    # Single .loc write: the chained form frame[column][index] = value assigns
    # through an intermediate Series, which pandas warns about and which does
    # not update the frame at all when copy-on-write is active.
    reviewsUnique.loc[index, column] = cleaned

In [None]:
# Clean every product title. Row labels and raw titles are captured up front,
# so the loop never reads from the column while it is being rewritten.
for rowLabel, rawTitle in zip(reviewsUnique.index, reviewsUnique['title'].tolist()):
    nlpPreprocessing(rawTitle, rowLabel, 'title')

In [None]:
# Display reviewsUnique after the title clean-up loop.
reviewsUnique

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing the cleaned table.
os.makedirs('output', exist_ok=True)
reviewsUnique.to_csv('output/preprocessed.csv')

In [None]:
# Import the one name that is used rather than star-importing the module,
# which would silently add unrelated names to the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [None]:
def plotHeatmap(keys, values, labels, text):
    """Show a single-row heatmap of `values`, annotated with `labels`.

    keys   -- x tick labels, one per cell
    values -- numbers that drive the cell colours
    labels -- numbers written inside the cells
    text   -- title placed above the heatmap
    """
    plt.figure(figsize=(25, 3))
    layout = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[4, 1])
    # Draw on the first cell of the 2x2 grid (the one with the 4:1 larger share).
    heatAx = sns.heatmap(
        np.array([values]),
        annot=np.array([labels]),
        ax=plt.subplot(layout[0]),
    )
    heatAx.set_xticklabels(keys)
    heatAx.set_title(text)
    plt.show()

In [None]:
def plotHeatmapImage(docID, vec1, vec2, text, model):
    """Plot which words of a candidate title are shared with the query title.

    Parameters
    ----------
    docID : int
        Row of the candidate document in the fitted feature matrix; used to
        look up per-word weights for the 'tfidf' and 'idf' models.
    vec1 : mapping of word -> count
        Word counts of the query title.
    vec2 : mapping of word -> count
        Word counts of the candidate title. It is no longer modified: the
        zeroing of unshared words is done on a local list instead.
    text : str
        Title shown above the heatmap.
    model : str
        'bag_of_words' (annotate with the counts), 'tfidf' or 'idf'
        (annotate with the weight from the corresponding fitted vectorizer).

    Raises
    ------
    ValueError
        If `model` is not one of the three supported names. Previously this
        surfaced as an unbound-variable error on `labels`.
    """
    keys = list(vec2.keys())
    # A word keeps its count only if it also occurs in the query title.
    values = [vec2[word] if word in vec1 else 0 for word in keys]

    if model == 'bag_of_words':
        labels = values
    elif model == 'tfidf':
        vocab = tfidf_title_vectorizer.vocabulary_
        labels = [
            tfidf_title_features[docID, vocab[word]] if word in vocab else 0
            for word in keys
        ]
    elif model == 'idf':
        # NOTE(review): idf_title_vectorizer / idf_title_features are not
        # defined in the visible cells — confirm they exist before using 'idf'.
        vocab = idf_title_vectorizer.vocabulary_
        labels = [
            idf_title_features[docID, vocab[word]] if word in vocab else 0
            for word in keys
        ]
    else:
        raise ValueError(
            "model must be 'bag_of_words', 'tfidf' or 'idf', got %r" % (model,)
        )
    plotHeatmap(keys, values, labels, text)

In [None]:
import re
def textToVector(text):
    """Return a Counter of the word-character runs (regex \\w+) found in `text`."""
    return Counter(re.findall(r'\w+', text))

In [None]:
def getResults(docID, contentA, contentB, model):
    """Plot the word overlap between two texts.

    Both texts are turned into word-count vectors; the heatmap is drawn over
    the words of `contentB`, which is also used as the plot title.
    """
    plotHeatmapImage(
        docID,
        textToVector(contentA),
        textToVector(contentB),
        contentB,
        model,
    )

In [None]:
# Re-inspect reviewsUnique before vectorising its 'title' column.
print(reviewsUnique.shape)
reviewsUnique.head()

In [None]:
# TF-IDF representation of the cleaned product titles.
# min_df is given as the integer 1 (the library default). An integer min_df is
# an absolute document count and every vocabulary term occurs in at least one
# document, so this yields the same vocabulary as 0 while staying inside the
# range scikit-learn's parameter validation accepts for integers (>= 1).
tfidf_title_vectorizer = TfidfVectorizer(min_df = 1)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(reviewsUnique['title'])
# Show the first five rows of the sparse matrix and its overall shape.
print(tfidf_title_features[:5])
print(tfidf_title_features.shape)

In [None]:
def tfidfModel(docID, numResults):
    """Recommend products whose titles are closest to a query title in TF-IDF space.

    Parameters
    ----------
    docID : int
        Positional row of the query product in tfidf_title_features (and
        therefore in reviewsUnique, from whose 'title' column it was built).
    numResults : int
        Requested number of neighbours. The numResults + 5 nearest rows are
        examined and every one that is not a zero-distance match is reported,
        so up to numResults + 5 items can be returned.
        NOTE(review): confirm whether the output should be capped at numResults.

    Returns
    -------
    list
        'asin' values of the reported neighbours, nearest first. For each one
        a word-overlap heatmap, its ASIN and its distance are also displayed.
    """
    # pairwise_distances defaults to the Euclidean metric.
    distances = pairwise_distances(tfidf_title_features, tfidf_title_features[docID]).flatten()
    # Sort once and reuse the ordering for both positions and distances.
    nearest = np.argsort(distances)[:numResults + 5]

    # Take the query title from docID itself. The nearest row is not
    # guaranteed to be the query when several products share a title.
    queryTitle = reviewsUnique['title'].iloc[docID]

    lt = []
    for pos in nearest:
        dist = distances[pos]
        # Skip the query and exact-duplicate titles. A tolerance is used
        # because the computed distance between identical vectors can come
        # out as a tiny non-zero float.
        if pos == docID or np.isclose(dist, 0.0, atol=1e-6):
            continue
        row = reviewsUnique.iloc[pos]
        getResults(pos, queryTitle, row['title'], 'tfidf')
        lt.append(row['asin'])
        print('ASIN :', row['asin'])
        print('Euclidean distance from the query title :', dist)
        print('='*100)
    return lt
tfidfModel(1,5)

In [None]:
# Work on a copy whose old row labels move into an 'index' column, leaving a
# fresh 0..n-1 index, then list the ASINs against those positions.
reviewsUniqueCopy = reviewsUnique.copy()
reviewsUniqueCopy = reviewsUniqueCopy.reset_index()
print(reviewsUniqueCopy.loc[:, 'asin'])

In [None]:
# The number entered is passed to tfidfModel as docID, i.e. a positional row
# into tfidf_title_features / reviewsUnique (the reset-index listing printed
# above shows the matching positions).
ip = int(input("Enter your choice: "))
# Keep the returned ASIN list for the feature lookup in the following cells.
lt = tfidfModel(ip, 5)

In [None]:
# Wrap the recommended ASINs in a one-column frame for the merge that follows.
# Naming the column at construction keeps an 'asin' column even when lt is
# empty (pd.DataFrame([]) has no column 0 to rename, so the later merge on
# 'asin' would fail) and avoids mutating the frame in place.
lts = pd.DataFrame({'asin': lt})

In [None]:
# Attach the feature columns to each recommended ASIN (inner join on 'asin').
found = lts.merge(features, on="asin", how="inner")
# Every features column except the first is listed as a selectable ranking criterion.
temp = pd.DataFrame(features.columns[1:])

In [None]:
# Show the candidate feature names with their integer row labels; the priority
# prompts below index into this table.
temp

In [None]:
# First ranking criterion: read a row number and look up the feature name it selects.
print("Priority1: ")
c1 = int(input())
s1 = temp.loc[c1, 0]

In [None]:
# Second ranking criterion: read a row number and look up the feature name it selects.
print("Priority2: ")
c2 = int(input())
# Index with the value just entered (c2). The original used c3, which is not
# assigned until the next cell, so a fresh run failed with NameError and a
# re-run silently reused the third choice.
s2 = temp[0][c2]

In [None]:
# Third ranking criterion: read a row number and look up the feature name it selects.
print("Priority3: ")
c3 = int(input())
s3 = temp.loc[c3, 0]

In [None]:
# Rank the matched products by the three chosen feature columns, highest
# values first; s1 takes precedence, then s2, then s3.
found.sort_values(by=[s1, s2, s3], ascending=False)