In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the review-level records from JSON.
# NOTE(review): judging by the file name this is the output of an earlier
# logistic-regression step — confirm the provenance.
df = pd.read_json('final_result_after_logistic.json')

In [3]:
# Keep only the columns this notebook works with.
df_knn = df.loc[:, ['asin', 'overall', 'predicted_rating', 'processedReview', 'summary']]

In [4]:
# Average the two rating columns per product (asin).
# The numeric columns are selected explicitly because df_knn also carries
# text columns (processedReview, summary) that cannot be averaged: older
# pandas silently dropped them, pandas >= 2.0 raises a TypeError instead.
df_product_review_data_from_sentiment = (
    df_knn
    .groupby("asin", as_index=False)[["overall", "predicted_rating"]]
    .mean()
)

In [5]:
# Preview the per-product mean ratings.
df_product_review_data_from_sentiment.head()

Unnamed: 0,asin,overall,predicted_rating
0,5019281,4.458716,4.940745
1,307514161,4.732283,4.927104
2,310263662,3.962461,4.796995
3,310274281,4.869159,4.947654
4,767002652,4.296875,4.8422


In [6]:
# Gather every review summary of a product into one list per asin.
df_merge_review_series = (
    df_knn
    .groupby("asin")["summary"]
    .apply(list)
    .reset_index()
)
# After reset_index() the object above is already a DataFrame (despite its
# name); this wraps it in a second DataFrame object.
df_merge_review_data = pd.DataFrame(data=df_merge_review_series)

In [7]:
# Preview the per-product lists of review summaries.
df_merge_review_data.head()

Unnamed: 0,asin,summary
0,5019281,"[good version of a classic, Good but not as mo..."
1,307514161,[Who needs it to be christmas to watch this fl...
2,310263662,[watching anyone being tortured would be sad -...
3,310274281,"[great life lessons, Awesome, Inspiring, GREAT..."
4,767002652,"[Entertaining!, Definitely a masterpiece! How..."


In [9]:
# Combine the mean ratings with the collected summaries, one row per asin.
final_knn_data = df_product_review_data_from_sentiment.merge(
    df_merge_review_series,
    how='inner',
    on="asin",
)

In [10]:
# Preview the merged ratings + summaries frame.
final_knn_data.head()

Unnamed: 0,asin,overall,predicted_rating,summary
0,5019281,4.458716,4.940745,"[good version of a classic, Good but not as mo..."
1,307514161,4.732283,4.927104,[Who needs it to be christmas to watch this fl...
2,310263662,3.962461,4.796995,[watching anyone being tortured would be sad -...
3,310274281,4.869159,4.947654,"[great life lessons, Awesome, Inspiring, GREAT..."
4,767002652,4.296875,4.8422,"[Entertaining!, Definitely a masterpiece! How..."


In [11]:
# Matches every run of characters that is not a lowercase ASCII letter.
regEx = re.compile('[^a-z]+')


def clean_data(text_array):
    """Merge a sequence of strings into one normalized string.

    The pieces are joined with single spaces and lowercased; every run of
    characters other than a-z is then replaced by one space, and leading
    and trailing whitespace is stripped.

    Parameters
    ----------
    text_array : iterable of str
        The text fragments to merge.

    Returns
    -------
    str
        Lowercase words separated by single spaces (empty if nothing is left).
    """
    lowered = " ".join(text_array).lower()
    return regEx.sub(' ', lowered).strip()

In [12]:
# Collapse each product's list of review summaries into one normalized,
# lowercase, letters-only string (see clean_data).
final_knn_data["clean_summary_data"] = final_knn_data["summary"].apply(clean_data)

In [13]:
# Preview the frame with the new clean_summary_data column.
final_knn_data.head()

Unnamed: 0,asin,overall,predicted_rating,summary,clean_summary_data
0,5019281,4.458716,4.940745,"[good version of a classic, Good but not as mo...",good version of a classic good but not as movi...
1,307514161,4.732283,4.927104,[Who needs it to be christmas to watch this fl...,who needs it to be christmas to watch this fli...
2,310263662,3.962461,4.796995,[watching anyone being tortured would be sad -...,watching anyone being tortured would be sad je...
3,310274281,4.869159,4.947654,"[great life lessons, Awesome, Inspiring, GREAT...",great life lessons awesome inspiring great pic...
4,767002652,4.296875,4.8422,"[Entertaining!, Definitely a masterpiece! How...",entertaining definitely a masterpiece how refr...


In [14]:
# Drop the raw summary lists; keep the ratings and the cleaned text.
final_knn_data_clean = final_knn_data.loc[
    :, ['asin', 'overall', 'predicted_rating', 'clean_summary_data']
]

In [15]:
# Preview the cleaned per-product frame.
final_knn_data_clean.head()

Unnamed: 0,asin,overall,predicted_rating,clean_summary_data
0,5019281,4.458716,4.940745,good version of a classic good but not as movi...
1,307514161,4.732283,4.927104,who needs it to be christmas to watch this fli...
2,310263662,3.962461,4.796995,watching anyone being tortured would be sad je...
3,310274281,4.869159,4.947654,great life lessons awesome inspiring great pic...
4,767002652,4.296875,4.8422,entertaining definitely a masterpiece how refr...


In [16]:
# Write the cleaned per-product frame to disk as JSON.
final_knn_data_clean.to_json('final_knn_data_clean.json')


In [17]:
# Bag-of-words model: English stop words removed, vocabulary capped at 300 terms.
countVector = CountVectorizer(
    stop_words='english',
    max_features=300,
)
# Learn the vocabulary and build the sparse product-by-term count matrix.
transformedReviews = countVector.fit_transform(
    final_knn_data_clean.loc[:, 'clean_summary_data']
)

In [18]:
# Densify the sparse count matrix into a DataFrame with one column per
# vocabulary term.
# - `.toarray()` replaces the `.A` shorthand, which recent SciPy releases
#   no longer provide.
# - `get_feature_names()` was removed from scikit-learn in favour of
#   `get_feature_names_out()`; fall back to the old name on old installs.
_feature_names = (
    countVector.get_feature_names_out()
    if hasattr(countVector, "get_feature_names_out")
    else countVector.get_feature_names()
)
df_knn_vectorized_data = pd.DataFrame(
    transformedReviews.toarray(),
    columns=list(_feature_names),
).astype(int)

In [19]:
# Show the learned vocabulary as a plain list.
# `get_feature_names()` no longer exists in current scikit-learn, so prefer
# `get_feature_names_out()` and fall back to the old name on old installs.
list(
    countVector.get_feature_names_out()
    if hasattr(countVector, "get_feature_names_out")
    else countVector.get_feature_names()
)

[u'absolutely',
 u'acting',
 u'action',
 u'actors',
 u'actually',
 u'adaptation',
 u'adventure',
 u'age',
 u'amazing',
 u'american',
 u'animated',
 u'art',
 u'average',
 u'away',
 u'awesome',
 u'awful',
 u'bad',
 u'batman',
 u'beautiful',
 u'believe',
 u'best',
 u'better',
 u'big',
 u'bit',
 u'black',
 u'blood',
 u'blu',
 u'bond',
 u'book',
 u'boring',
 u'brilliant',
 u'buy',
 u'cast',
 u'character',
 u'characters',
 u'christmas',
 u'classic',
 u'collection',
 u'come',
 u'comedy',
 u'comic',
 u'complete',
 u'cool',
 u'creepy',
 u'cut',
 u'cute',
 u'dark',
 u'day',
 u'dead',
 u'decent',
 u'definitely',
 u'did',
 u'didn',
 u'die',
 u'different',
 u'director',
 u'disappointed',
 u'disappointing',
 u'disc',
 u'disney',
 u'disturbing',
 u'does',
 u'doesn',
 u'don',
 u'drama',
 u'dvd',
 u'edition',
 u'effects',
 u'end',
 u'ending',
 u'enjoy',
 u'enjoyable',
 u'enjoyed',
 u'entertaining',
 u'entertainment',
 u'epic',
 u'evil',
 u'excellent',
 u'exciting',
 u'expected',
 u'extras',
 u'family',

In [20]:
# Preview the per-product term counts.
df_knn_vectorized_data.head()

Unnamed: 0,absolutely,acting,action,actors,actually,adaptation,adventure,age,amazing,american,...,world,worst,worth,worthy,wow,wrong,year,years,yes,zombie
0,0,0,0,0,0,4,0,0,0,21,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,2,1,0,...,0,0,1,0,0,0,2,0,0,0
2,2,1,1,0,1,0,0,0,10,0,...,4,4,8,1,10,1,5,2,2,0
3,0,0,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0
4,1,1,1,0,0,0,0,1,0,0,...,1,0,2,1,1,0,0,2,1,0


In [21]:
# Identifier and rating columns only; the text is represented by the
# term-count frame instead.
final_knn_data_clean_without_summary = final_knn_data_clean.loc[
    :, ['asin', 'overall', 'predicted_rating']
]


In [22]:
# Preview the identifier + rating columns.
final_knn_data_clean_without_summary.head()

Unnamed: 0,asin,overall,predicted_rating
0,5019281,4.458716,4.940745
1,307514161,4.732283,4.927104
2,310263662,3.962461,4.796995
3,310274281,4.869159,4.947654
4,767002652,4.296875,4.8422


In [74]:
# Attach the term counts to the rating columns. DataFrame.join aligns on the
# row index, so this relies on both frames carrying the same index in the
# same product order (the counts were built from final_knn_data_clean's
# clean_summary_data column).
# NOTE(review): join raises if a vocabulary term collides with an existing
# column name such as 'overall' — confirm this cannot happen.
data_with_asin = final_knn_data_clean_without_summary.join(df_knn_vectorized_data, how='outer')

In [88]:
# Pure feature matrix for the neighbour search: everything except the
# product identifier. `axis` is passed by keyword because the positional
# form drop('asin', 1) is rejected by pandas >= 2.0.
# (The former bare `data_with_asin.reset_index()` call was removed: its
# result was discarded, so it never changed anything.)
data = data_with_asin.drop('asin', axis=1)
data_with_asin.tail(180)

Unnamed: 0,asin,overall,predicted_rating,absolutely,acting,action,actors,actually,adaptation,adventure,...,world,worst,worth,worthy,wow,wrong,year,years,yes,zombie
3377,B009JBZH54,4.653571,4.847075,1,0,3,0,0,0,0,...,0,0,3,0,1,0,0,0,0,0
3378,B009LDCWWG,4.683544,4.901649,1,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3379,B009LDCXNY,4.678392,4.895247,1,0,0,0,0,0,0,...,1,0,2,0,0,0,0,3,1,0
3380,B009LDCZ7I,4.521739,4.939340,0,1,3,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
3381,B009LDD1H6,4.758621,4.999332,2,0,2,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
3382,B009MO57H6,3.502660,4.085877,2,0,23,0,0,0,0,...,1,8,2,0,2,1,0,1,1,1
3383,B009MO5E10,3.857143,4.678033,0,2,20,1,0,0,1,...,0,1,4,0,0,0,0,0,0,0
3384,B009NNM77E,4.373541,4.794674,1,0,2,0,0,0,1,...,1,0,2,0,2,0,0,1,4,0
3385,B009NNM828,2.470588,3.210331,0,1,2,0,0,1,1,...,0,3,3,0,0,0,1,0,0,0
3386,B009NNM9OA,4.539171,4.948506,0,5,1,0,0,0,0,...,0,0,4,1,5,0,3,1,0,0


In [77]:
# Preview the feature matrix (ratings + term counts, no asin).
data.head()

Unnamed: 0,overall,predicted_rating,absolutely,acting,action,actors,actually,adaptation,adventure,age,...,world,worst,worth,worthy,wow,wrong,year,years,yes,zombie
0,4.458716,4.940745,0,0,0,0,0,4,0,0,...,0,0,0,1,0,0,0,0,0,0
1,4.732283,4.927104,1,0,0,0,0,0,0,2,...,0,0,1,0,0,0,2,0,0,0
2,3.962461,4.796995,2,1,1,0,1,0,0,0,...,4,4,8,1,10,1,5,2,2,0
3,4.869159,4.947654,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,0,0,0
4,4.296875,4.8422,1,1,1,0,0,0,0,1,...,1,0,2,1,1,0,0,2,1,0


In [78]:
# Write the feature matrix to disk as JSON.
data.to_json("data.json")

In [79]:
# Convert the feature frame to a NumPy array; rows keep the order of `data`.
np_data = np.array(data)


In [80]:
# Ordered (non-shuffled) split: the first 95% of the rows are used to fit
# the model, the remaining rows are held out.
total_data = data.shape[0]
size_of_cut = int(np.floor(total_data * 0.95))
training_data, test_data = np_data[:size_of_cut], np_data[size_of_cut:]


In [101]:
# Held-out rows, cut at the same position as test_data so that row i of this
# frame corresponds to test_data[i].
data_with_asin_test = data_with_asin.iloc[size_of_cut:]
# .copy() makes this an independent frame; columns are added to it later and
# doing that on a slice of another frame triggers SettingWithCopyWarning.
# (The former leading `len(training_data)` statement was removed: it was not
# the last expression of the cell, so it produced no output.)
data_with_asin_test_less_cols = data_with_asin_test[['asin', 'overall', 'predicted_rating']].copy()

0

In [82]:
# Number of held-out rows.
len(test_data)

178

In [83]:
# Ball-tree nearest-neighbour index over the training rows; each query
# returns the 3 closest rows.
neighbor = NearestNeighbors(algorithm='ball_tree', n_neighbors=3)
neighbor = neighbor.fit(training_data)

In [84]:
# Query the fitted model with the training rows themselves.
# NOTE(review): `distances` and `indices` are not used by any later cell
# visible in this notebook — confirm whether this cell is still needed.
distances, indices = neighbor.kneighbors(training_data)

In [228]:
def run_knn(row):
    """Return the ASINs of the nearest training products for one test row.

    Relies on the notebook-level names ``neighbor`` (fitted model),
    ``test_data`` (held-out feature array) and ``data_with_asin``.

    Parameters
    ----------
    row : pandas.Series
        A row carrying a ``serial_no`` entry: the position of the product
        inside ``test_data``.

    Returns
    -------
    str
        Comma-separated ASINs of the neighbours, nearest first.
    """
    # int() guards against the position arriving as a float after row-wise
    # dtype coercion; array indexing needs an integer.
    query = test_data[int(row['serial_no'])]
    # kneighbors expects a 2-D batch: wrap the single vector, then unwrap the
    # single row of results.
    neighbour_positions = neighbor.kneighbors([query], return_distance=False)[0]
    # The model reports row *positions* in the array it was fitted on, which
    # is the leading slice of data_with_asin's rows. Look them up by position
    # (.iloc) rather than by index label so this does not depend on
    # data_with_asin having a default 0..n-1 index.
    asins = data_with_asin["asin"].iloc[neighbour_positions]
    return ",".join(map(str, asins))

In [None]:
# Add a 0-based position column so each row can be matched to its feature
# vector in test_data.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
# (The former bare `reset_index()` call was removed: its result was
# discarded, so the original index labels were — and still are — kept.)
if 'serial_no' not in data_with_asin_test_less_cols.columns:
    data_with_asin_test_less_cols.insert(
        0, 'serial_no', range(len(data_with_asin_test_less_cols))
    )

In [229]:
# Compute the recommendations row by row and attach them as a new column.
# assign() builds a new frame instead of writing into the existing one,
# which avoids pandas' SettingWithCopyWarning when the frame originated as a
# slice of another DataFrame.
data_with_asin_test_less_cols = data_with_asin_test_less_cols.assign(
    recommended=data_with_asin_test_less_cols.apply(run_knn, axis=1)
)
data_with_asin_test_less_cols.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,serial_no,asin,overall,predicted_rating,recommended
3379,0,B009LDCXNY,4.678392,4.895247,"B003L77FYS,B004YM6JI2,B004YM6JLO"
3380,1,B009LDCZ7I,4.521739,4.93934,"6303961614,B002JIOOCQ,B0060MYLCU"
3381,2,B009LDD1H6,4.758621,4.999332,"B000FTCLSU,1415707138,B0001HAGQK"
3382,3,B009MO57H6,3.50266,4.085877,"B002ZG98UA,0767834739,B003Y5H5EW"
3383,4,B009MO5E10,3.857143,4.678033,"B0062P332Y,6304681496,B009HIK3V2"


In [230]:
# Write the held-out products with their recommended ASINs to CSV.
data_with_asin_test_less_cols.to_csv("data_with_asin_test_less_cols.csv")

In [231]:
# Display the recommendation table.
# NOTE(review): this renders the whole frame; consider .head(n) to keep the
# notebook output small.
data_with_asin_test_less_cols

Unnamed: 0,serial_no,asin,overall,predicted_rating,recommended
3379,0,B009LDCXNY,4.678392,4.895247,"B003L77FYS,B004YM6JI2,B004YM6JLO"
3380,1,B009LDCZ7I,4.521739,4.939340,"6303961614,B002JIOOCQ,B0060MYLCU"
3381,2,B009LDD1H6,4.758621,4.999332,"B000FTCLSU,1415707138,B0001HAGQK"
3382,3,B009MO57H6,3.502660,4.085877,"B002ZG98UA,0767834739,B003Y5H5EW"
3383,4,B009MO5E10,3.857143,4.678033,"B0062P332Y,6304681496,B009HIK3V2"
3384,5,B009NNM77E,4.373541,4.794674,"B001UV4XXS,B003UESJF6,B00005JPS6"
3385,6,B009NNM828,2.470588,3.210331,"0767839129,0767802497,B001OQCVI8"
3386,7,B009NNM9OA,4.539171,4.948506,"B001HN68ZU,B009AMAK54,B00466HN86"
3387,8,B009NNMAZI,3.887097,4.488358,"B0068RHSCW,B009HIK3V2,B005S9EJS2"
3388,9,B009OCR1OI,3.171053,4.003111,"B001F0TM4Y,B0000AZVEN,0780619250"
