In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords

In [2]:
# Load the pre-cleaned Amazon review export from the working directory.
df = pd.read_csv(filepath_or_buffer='amazon_clean4.csv')

In [3]:
# Discard every row that has a missing value in any column.
# Note: the surviving rows keep their original index labels.
df = df.dropna(axis=0, how='any')

In [4]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,product_title,star_rating,review_body
0,Madden NFL 16 - Xbox One Digital Code,2,I keep buying madden every year hoping they ge...
1,Command & Conquer The Ultimate Collection [Ins...,5,If you are prepping for the end of the world t...
2,Sims 4,4,"i like the new skills like herbalism in this, ..."
3,Sid Meier's Civilization V,1,"As has been written by so many others, I quick..."
4,Build-a-lot 4: Power Source [Download],5,Probably the best game for learning aspects of...


In [5]:
# Corpus for the vectorizer: the raw review texts, in df row order.
docs = df.loc[:, 'review_body'].values

In [6]:
# Short human-readable row labels: first 30 characters of each review
# followed by an ellipsis (labels are not guaranteed to be unique).
docs_label = [review[:30] + "..." for review in docs]

In [7]:
# Start from NLTK's English stop words, then add corpus-specific terms that
# should not be allowed to form topics.
STOP = list(stopwords.words('english'))
STOP.extend([
    'one', 'even', 'city', 'sim', 'sims', 'simcity', 'cities', 'really',
    'would', 'lot', 'playing', 'many', 'sonic', 'however', 'still', 'since',
    'ever', 'way', 'use', 'played', 'also', 'cdata', 'game', 'games',
    'windows', 'object', 'video', 'ps',
])

In [8]:
# Bag-of-words counts. The token pattern only matches runs of ASCII letters,
# so digits and punctuation never become tokens; text is lower-cased and the
# custom STOP list is removed.
vectorizer = CountVectorizer(
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    lowercase=True,
    stop_words=STOP,
)
# One row per review in `docs`, one column per vocabulary term.
doc_word = vectorizer.fit_transform(docs)

In [9]:
# Factorise the document-term matrix into 10 topics.
# random_state is pinned because the topic columns are given hand-written
# names by position further down; without a fixed seed a re-run may order or
# shape the components differently and silently mislabel them.
nmf_model = NMF(n_components=10, random_state=42)
# doc_topic has one row per review and one column per topic weight.
doc_topic = nmf_model.fit_transform(doc_word)

In [10]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of term weights per topic).
    feature_names : sequence mapping a term's column position to its text.
    no_top_words : int, how many of the strongest terms to list per topic.
    topic_names : optional sequence of labels; a missing or empty label makes
        the topic fall back to being announced by its number.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so walk it backwards from the end to get the
        # `no_top_words` largest weights, biggest first.
        strongest = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in strongest]
        print(", ".join(top_terms))

In [11]:
# Show the 40 strongest terms per topic.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and returns the same vocabulary.
display_topics(nmf_model, vectorizer.get_feature_names_out(), 40)


Topic  0
story, well, first, much, level, character, world, make, combat, find, characters, player, system, different, two, gameplay, around, though, enemies, end, go, could, things, see, experience, pretty, weapons, back, right, take, want, people, another, bit, made, times, every, actually, enough, may

Topic  1
amazon, download, steam, code, buy, origin, work, account, card, install, purchase, product, bought, version, xbox, got, pc, online, downloaded, computer, ea, live, go, problem, software, money, key, support, purchased, first, digital, never, run, price, service, tried, downloading, worked, could, update

Topic  2
play, online, ea, want, player, able, servers, server, hours, people, pc, mode, single, computer, players, internet, version, multiplayer, money, friends, free, issues, bought, could, graphics, love, connection, day, origin, first, start, without, know, enjoy, offline, work, years, make, always, drm

Topic  3
like, feel, much, people, think, things, better, see, ba

In [12]:
# Document-topic weights as a labelled frame: one row per review (labelled by
# its text snippet), one column per NMF component. The column names are
# hand-assigned to the components by position.
H = pd.DataFrame(
    data=doc_topic.round(5),
    index=docs_label,
    columns=[
        "game_content",
        "download",
        "online",
        "critique",
        "negative_feedback",
        "positive_feeback",
        "dlc",
        "worth_time",
        "difficulty",
        "performance",
    ],
)

In [13]:
rating = df['star_rating']
product_title = df['product_title']

# H is indexed by the review snippets, so reset_index() gives it a fresh
# 0..n-1 RangeIndex, whereas `rating` / `product_title` still carry df's
# index, which has gaps wherever dropna() removed rows. An index-based join
# would therefore pair reviews with the wrong rating/title (or NaN).
# The rows of H are in the same order as the rows of df (H was built from
# df['review_body'] in order), so attach the columns positionally instead.
H = H.reset_index()
H['star_rating'] = rating.values
H['product_title'] = product_title.values
H.head()

Unnamed: 0,index,game_content,download,online,critique,negative_feedback,positive_feeback,dlc,worth_time,difficulty,performance,star_rating,product_title
0,I keep buying madden every yea...,0.02181,0.03382,0.17542,0.05221,0.10315,0.06052,0.02655,0.00389,0.00186,0.00397,2,Madden NFL 16 - Xbox One Digital Code
1,If you are prepping for the en...,0.01967,0.0,0.0,0.0,0.0,0.05295,0.00238,0.0,0.0,0.0,5,Command & Conquer The Ultimate Collection [Ins...
2,i like the new skills like her...,0.0,0.0,0.0,0.12534,0.0,0.0,0.1001,0.0,0.04957,0.0,4,Sims 4
3,As has been written by so many...,0.0,0.01225,0.00335,0.00472,0.00086,0.00217,0.04575,0.00412,0.0,0.05595,1,Sid Meier's Civilization V
4,Probably the best game for lea...,0.03204,0.00254,0.00253,0.00252,0.0,0.00369,0.01175,0.00366,0.00198,0.00613,5,Build-a-lot 4: Power Source [Download]


In [24]:
# Alphabetical list of the distinct product titles present in H.
sorted(pd.unique(H['product_title']))

['007 Legends [Download]',
 '1 Moment Of Time: Silentville [Download]',
 '1 Month Subscription: EVE Online [Instant Access]',
 '1 PLEX: EVE Online [Instant Access]',
 '1 Penguin 100 Cases [Download]',
 '1 Year Membership: AdventureQuest Worlds [Instant Access]',
 '1 vs 100 [Download]',
 '1-Year PS Plus + $10 PS Gift Card - PS3 / PS4 [Digital Code]',
 '1-Year PS Plus + $20 PS Gift Card - PS3 / PS4 [Digital Code]',
 '1-Year PS Plus + $50 PS Gift Card - PS3 / PS4 [Digital Code]',
 '10 Talismans [Download]',
 '100 % Hidden Objects 2 [Download]',
 '100% Hidden Object (Mac) [Download]',
 '100% Hidden Objects',
 '1001 Japanese Crosswords',
 '1001 Kidz Games [Download]',
 '1001 Mini-Golf Challenge [Download]',
 '1001 Nights: The Adventures of Sindbad',
 '1001 Tangram Puzzles',
 '101 - in - 1 Megamix [Online Game Code]',
 '1080Â° Snowboarding [Online Game Code]',
 '12 Labours of Hercules II: The Cretan Bull [Download]',
 '12 Labours of Hercules [Download]',
 '12 PLEX: EVE Online [Instant Access

In [14]:
# The 14 reviews with the largest weight on the negative_feedback topic.
H.sort_values(by='negative_feedback', ascending=False).iloc[:14]

Unnamed: 0,index,game_content,download,online,critique,negative_feedback,positive_feeback,dlc,worth_time,difficulty,performance,star_rating,product_title
104887,I finally broke down on impuls...,1.05517,0.0,0.11118,0.69516,2.63968,0.46506,0.27275,0.64848,0.54487,0.11295,5,Guild Wars 2 Digital Deluxe [Online Game Code]
57284,"When your boss says to you \\""...",1.50992,0.25803,0.25046,0.73313,2.28923,0.0,0.0,0.74243,0.0,0.27305,2,Faster Than Light
76163,"Weird thing, I love this game....",0.72964,0.0111,0.0,0.75484,1.68358,0.0,0.24484,0.71144,0.19651,0.77106,5,Far Cry 2
24040,The idea of a silly shooter in...,0.2755,0.0,0.84836,0.15013,1.61244,0.00596,0.49815,0.4264,0.39513,0.37682,3,Plants vs. Zombies Garden Warfare
104470,I have been really enjoying gu...,1.67494,0.21905,0.0,0.89327,1.52294,0.02385,0.3251,0.65496,0.0,0.0,5,Guild Wars 2 Digital Deluxe [Online Game Code]
81292,CONTENTS:1. The Bottom Line (I...,1.6621,0.27239,0.07295,0.4227,1.38545,0.13953,0.0,1.33372,0.00671,0.44099,2,Tomb Raider
70446,"Overall, Might & Magic VI is a...",1.00154,0.24251,0.18638,0.80654,1.18425,0.0,0.18405,0.33705,0.19026,0.27893,1,Might & Magic Heroes VI: Complete Edition [Dow...
79170,So quickest review: buy it.Lon...,0.55858,0.61979,0.11748,0.83753,1.17549,0.13319,2.60761,0.50434,0.35987,0.51528,5,Trainz Simulator 2009: World Builder Edition [...
41388,I found out about this game ov...,1.69468,0.0,0.3497,0.49941,1.09131,0.0,0.29672,0.65451,0.0,0.29483,3,Final Fantasy XIV: A Realm Reborn
31653,"Being only 19 years old, I hav...",0.53965,0.01299,0.20018,0.45237,1.016,0.60054,0.0,0.04483,0.0,0.12977,5,Wolfenstein: The New Order


In [15]:
# Put the review snippet (the 'index' column created by reset_index) back as
# the row label; H itself is left untouched.
H_new = H.set_index(keys='index')
H_new.head(n=5)

Unnamed: 0_level_0,game_content,download,online,critique,negative_feedback,positive_feeback,dlc,worth_time,difficulty,performance,star_rating,product_title
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
I keep buying madden every yea...,0.02181,0.03382,0.17542,0.05221,0.10315,0.06052,0.02655,0.00389,0.00186,0.00397,2,Madden NFL 16 - Xbox One Digital Code
If you are prepping for the en...,0.01967,0.0,0.0,0.0,0.0,0.05295,0.00238,0.0,0.0,0.0,5,Command & Conquer The Ultimate Collection [Ins...
i like the new skills like her...,0.0,0.0,0.0,0.12534,0.0,0.0,0.1001,0.0,0.04957,0.0,4,Sims 4
As has been written by so many...,0.0,0.01225,0.00335,0.00472,0.00086,0.00217,0.04575,0.00412,0.0,0.05595,1,Sid Meier's Civilization V
Probably the best game for lea...,0.03204,0.00254,0.00253,0.00252,0.0,0.00369,0.01175,0.00366,0.00198,0.00613,5,Build-a-lot 4: Power Source [Download]


In [16]:
# Collapse star ratings into sentiment buckets: 1-2 -> 'negative',
# 4-5 -> 'positive'. A rating of 3 is deliberately left as the number 3.
H_new['star_rating'] = H_new['star_rating'].replace({
    1: 'negative',
    2: 'negative',
    4: 'positive',
    5: 'positive',
})

In [17]:
# Mean topic weight per sentiment bucket.
# numeric_only=True is required on pandas >= 2.0, where averaging the string
# column product_title raises TypeError instead of being silently dropped.
H_avg = H_new.groupby('star_rating').mean(numeric_only=True)
H_avg.head()

Unnamed: 0_level_0,game_content,download,online,critique,negative_feedback,positive_feeback,dlc,worth_time,difficulty,performance
star_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,0.024285,0.017526,0.026833,0.025433,0.022135,0.012732,0.01431,0.018584,0.019628,0.021368
negative,0.012201,0.032171,0.027417,0.014361,0.020894,0.005955,0.01014,0.016185,0.006609,0.009139
positive,0.014861,0.017442,0.018403,0.015569,0.015485,0.020517,0.010275,0.012709,0.016834,0.01507


Negative top Topics: download 0.032, online 0.027, negative_feedback 0.021, worth_time 0.016 <br>
Positive top Topics: positive_feedback 0.021, online 0.018, download 0.017, difficulty 0.017

In [18]:
def game_review(title, sentiment='positive', topics=None):
    """Average topic weights for one product's reviews of a given sentiment.

    Parameters
    ----------
    title : str
        Exact value to match in the ``product_title`` column.
    sentiment : str, default 'positive'
        Value to match in the ``star_rating`` column (e.g. 'positive' or
        'negative').
    topics : pandas.DataFrame, optional
        Frame holding the topic-weight columns plus ``product_title`` and
        ``star_rating``. Defaults to the notebook-level ``H_new``.

    Returns
    -------
    pandas.Series
        Mean of every numeric column over the matching rows, sorted from the
        largest to the smallest value. All values are NaN when nothing matches.
    """
    frame = H_new if topics is None else topics
    selected = (frame['product_title'] == title) & (frame['star_rating'] == sentiment)
    # numeric_only=True skips the string columns, which would otherwise raise
    # TypeError on pandas >= 2.0.
    return frame.loc[selected].mean(numeric_only=True).sort_values(ascending=False)

In [19]:
# Average topic weights over the positive Tropico 5 reviews, strongest first.
# numeric_only=True keeps the string columns (star_rating, product_title) out
# of the mean; on pandas >= 2.0 they would otherwise raise TypeError.
H_new.loc[(H_new['product_title'] == 'Tropico 5') & (H_new['star_rating'] == 'positive')].mean(numeric_only=True).sort_values(ascending=False)

dlc                  0.051509
difficulty           0.042493
critique             0.031615
game_content         0.030587
online               0.027578
worth_time           0.025037
negative_feedback    0.023834
positive_feeback     0.020685
performance          0.013833
download             0.009981
dtype: float64

In [20]:
# Average topic weights over the negative Tropico 5 reviews, strongest first.
# numeric_only=True keeps the string columns (star_rating, product_title) out
# of the mean; on pandas >= 2.0 they would otherwise raise TypeError.
H_new.loc[(H_new['product_title'] == 'Tropico 5') & (H_new['star_rating'] == 'negative')].mean(numeric_only=True).sort_values(ascending=False)

download             0.031993
online               0.021996
negative_feedback    0.018361
performance          0.006763
game_content         0.006148
critique             0.004934
dlc                  0.004736
positive_feeback     0.001441
worth_time           0.001394
difficulty           0.000650
dtype: float64