## Model strategy:
### Hybrid model is the combination of user-based collaborative filtering and content-based filtering
<li> User-based collaborative: compare and suggest between users share similar interests ('user_id')
<li> Content-based filtering: based on 'meta' data to suggest similar products to user.

## Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import data:
df= pd.read_csv('/Users/kienguyen/Documents/DATA SCIENCE/MSDS/12. Practicum II/data set/cleaned_encoded_data.csv', index_col=0)

In [3]:
df.head(5)

Unnamed: 0,rating,review_title,parent_asin,user_id,clean_review,main_category,product_name,average_rating,rating_number,price,...,Charlie Rose Store,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran
0,3.0,battlespace,B000PIU2A0,AEFKF6R2GUSK2AWPSWRR4ZO36JVQ,item canceled comment,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,0
1,4.0,Four Stars,B000PIU2A0,AGCVM6J2S7N5YQNHNVNXQNEWQGOA,love nonstandard nontraditional movie,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,0
2,3.0,Three Stars,B000PIU2A0,AGTS54M26X3WOTH4IWLYLCWF54UQ,goog,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,0
3,5.0,Five Stars,B001QTXM5Y,AEFKF6R2GUSK2AWPSWRR4ZO36JVQ,good movie liked seen,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,0
4,5.0,Watchmen review,B001QTXM5Y,AENSS3WUCE2RGLXX5TYBFBXBRQEQ,familiar watchman series comic interested movi...,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_copy=df.copy(deep=True)
df_copy.describe()

Unnamed: 0,rating,main_category,average_rating,rating_number,price,Movies & TV,Science Fiction & Fantasy,Science Fiction,Sci-Fi Action,Genre for Featured Categories,...,Charlie Rose Store,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran
count,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,...,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0,1062471.0
mean,4.529112,0.07619973,4.575194,6185.591,18.81626,0.9236487,0.0313825,0.03876435,0.002952551,0.1626689,...,9.412022e-07,9.412022e-07,9.412022e-07,9.412022e-07,9.412022e-07,3.764809e-06,9.412022e-07,9.412022e-07,9.412022e-07,9.412022e-07
std,0.924841,0.2653175,0.3050175,12690.75,23.99254,0.2655594,0.1743493,0.1930329,0.05425713,0.3690635,...,0.0009701557,0.0009701557,0.0009701557,0.0009701557,0.0009701557,0.001940309,0.0009701557,0.0009701557,0.0009701557,0.0009701557
min,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.0,4.5,312.0,8.99,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,0.0,4.7,1384.0,12.99,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,0.0,4.8,5862.0,19.99,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,5.0,1.0,5.0,194457.0,2456.78,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1062471 entries, 0 to 1063222
Columns: 706 entries, rating to Iran
dtypes: float64(4), int64(695), object(7)
memory usage: 5.6+ GB


In [6]:
df_copy.dropna(inplace=True)
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1003107 entries, 0 to 1063222
Columns: 706 entries, rating to Iran
dtypes: float64(4), int64(695), object(7)
memory usage: 5.3+ GB


In [7]:
df_copy=df_copy.drop_duplicates()

In [8]:
# Drop users have less than 30 reviews
user_review_cnt = df_copy.groupby('user_id').size()
user_to_keep = user_review_cnt[user_review_cnt >= 30].index
df_narrow = df_copy[df_copy['user_id'].isin(user_to_keep)]

In [9]:
df_narrow.head()

Unnamed: 0,rating,review_title,parent_asin,user_id,clean_review,main_category,product_name,average_rating,rating_number,price,...,Charlie Rose Store,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran
1,4.0,Four Stars,B000PIU2A0,AGCVM6J2S7N5YQNHNVNXQNEWQGOA,love nonstandard nontraditional movie,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,0
2,3.0,Three Stars,B000PIU2A0,AGTS54M26X3WOTH4IWLYLCWF54UQ,goog,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,0
4,5.0,Watchmen review,B001QTXM5Y,AENSS3WUCE2RGLXX5TYBFBXBRQEQ,familiar watchman series comic interested movi...,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,0
6,4.0,Four Stars,B001QTXM5Y,AELEP6CFVPHMWS4IDS5EPM22GZQQ,played good happy purchase,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,0
7,4.0,you tell me,B001QTXM5Y,AETXVRADBT6BRQFMVSKTSWF7WMDA,watch friend,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,0


In [16]:
start_column_index = 11  # index of the 12th column
end_column_index = len(df_narrow.columns)

# Convert one-hot encoded sub_categories back to a column of string values
def decode_sub_categories(row):
    categories = []
    for column, value in row.iloc[start_column_index:end_column_index].iteritems():
        if value == 1:
            categories.append(column)
    return ','.join(categories)

df_narrow['sub_categories'] = df_narrow.apply(decode_sub_categories, axis=1)


In [17]:
df_narrow.head()

Unnamed: 0,rating,review_title,parent_asin,user_id,clean_review,main_category,product_name,average_rating,rating_number,price,...,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran,sub_categories
1,4.0,Four Stars,B000PIU2A0,AGCVM6J2S7N5YQNHNVNXQNEWQGOA,love nonstandard nontraditional movie,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Science Fiction & Fantasy,Science ..."
2,3.0,Three Stars,B000PIU2A0,AGTS54M26X3WOTH4IWLYLCWF54UQ,goog,0,Battlespace,3.5,49.0,19.99,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Science Fiction & Fantasy,Science ..."
4,5.0,Watchmen review,B001QTXM5Y,AENSS3WUCE2RGLXX5TYBFBXBRQEQ,familiar watchman series comic interested movi...,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Genre for Featured Categories,Acti..."
6,4.0,Four Stars,B001QTXM5Y,AELEP6CFVPHMWS4IDS5EPM22GZQQ,played good happy purchase,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Genre for Featured Categories,Acti..."
7,4.0,you tell me,B001QTXM5Y,AETXVRADBT6BRQFMVSKTSWF7WMDA,watch friend,0,Watchmen (Director's Cut),4.6,13923.0,12.4,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Genre for Featured Categories,Acti..."


In [45]:
df_narrow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499717 entries, 1 to 1063127
Columns: 707 entries, rating to sub_categories
dtypes: float64(4), int64(695), object(8)
memory usage: 2.6+ GB


## Build model

#### User-based Collaborative Filtering
Model: NearestNeighbors
<br>Ref: 
- https://towardsdatascience.com/text-classification-using-k-nearest-neighbors-46fa8a77acc5

- https://medium.com/@deepapandithu/recommender-system-user-collaborative-filtering-37613f0c6a9
- https://www.kaggle.com/code/meuge672/tf-idf-and-knn-in-people-wikipedia-dataset#We-load-the-file-into-pandas

Algorithims:
- Find 5 similar users bases on rating
- Suggest similar movies that target user didn't watch
- Print out the suggested movies

In [10]:
# User-based Collaborative Filtering function:
def user_based_recommendations(user_id,df):
    # create matrix table for each user and rating they gave to each product:
    user_based_matrix = df.pivot_table(index='user_id', columns='parent_asin', values='rating')
    user_based_matrix = user_based_matrix.fillna(0)
    # NearestNeighbors model to find similar users
    user_cf_model = NearestNeighbors(metric='cosine', algorithm='brute') # these 'metric' and 'algorithm' are suggested for recommendation system
    user_cf_model.fit(user_based_matrix)
    # get the index of the user_id in the user_based_matrix:
    user_idx = user_based_matrix.index.get_loc(user_id)
    # Find 5 k-nearest neighbors users:
    _, indices = user_cf_model.kneighbors(user_based_matrix.iloc[user_idx].values.reshape(1, -1), n_neighbors=5)
    # similar users:
    similar_user = user_based_matrix.iloc[indices.flatten()].index.tolist()
    # Get target user's viewed movies
    user_purchased_products = df[df['user_id'] == user_id]['parent_asin'].unique()
    # Get movies with 5 star rating of similar users that target user not watched yet:
    recommended_movies = df[(df['user_id'].isin(similar_user)) & (~df['parent_asin'].isin(user_purchased_products))&(df['rating'] == 5)]
    # Drop duplicates movie id to ensure uniques:
    recommended_movies=recommended_movies.drop_duplicates(subset='parent_asin')
    return recommended_movies
# df_copy.iloc[indices.flatten()].index.tolist()

In [38]:
df_narrow.iloc[6,3]

'AHPRGNDWLTC4EIDDASPKFELSLZSQ'

In [43]:
test_1 = user_based_recommendations(df_narrow.iloc[6,3],df_narrow)
print('User_based model recommend: ',len(test_1),' movies')
test_1.head(3)

User_based model recommend:  98  movies


Unnamed: 0,rating,review_title,parent_asin,user_id,clean_review,main_category,product_name,average_rating,rating_number,price,...,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran,sub_categories
27322,5.0,Awesome movie!!,B0001I55SI,AENK4HJLBS5C4Y7FWLEU7JSS74BQ,saw movie tv child scene banshee death coach s...,0,Darby O'Gill and the Little People,4.7,4816.0,9.77,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Walt Disney Studios Home Entertain..."
37599,5.0,three hours of beauty,B00AEFYSEA,AGSIXL4DPJMDIWMDTJYPT2MBB6TA,master director sergio leone delivers yet anot...,0,Once Upon A Time In The West,4.7,6505.0,12.91,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Studio Specials,Warner Home Video,..."
39260,5.0,Classic Tarantino,B005LAIIJY,AHS5ZC5IVEBFQTFMDC44XW4QDIWQ,mr quentin one way worked christoph waltz osca...,0,Django Unchained,4.8,30019.0,7.84,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Studio Specials,Lionsgate Home Ent..."


In [46]:
test_3 = user_based_recommendations(df_narrow.iloc[100000,3],df_narrow)
print('User_based model recommend: ',len(test_3),' movies')
test_3.head(3)

User_based model recommend:  154  movies


Unnamed: 0,rating,review_title,parent_asin,user_id,clean_review,main_category,product_name,average_rating,rating_number,price,...,Entertainment,Kelly,R.,Alien Saga,Van Halen,Engineering an Empire,Warner Archive,Chinese,Iran,sub_categories
748,5.0,Awesome,B002ZG981E,AGSVTH7RCAPXZAOSK23A4ZLBIZPA,visually entertaining story great,0,Inception (Blu-ray),4.6,15816.0,9.73,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Blu-ray,Movies"
3056,5.0,hunger games,B0189HKELU,AHZAJYSL7ZS65MB7XTXUWYT2MJ3Q,loved book loved movie,0,The Hunger Games: Complete 4 Film Collection,4.8,6080.0,20.14,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Studio Specials,Lionsgate Home Ent..."
13227,5.0,dr strange,B01M5EKXCA,AHZAJYSL7ZS65MB7XTXUWYT2MJ3Q,thing marvel,0,Doctor Strange,4.7,22150.0,12.15,...,0,0,0,0,0,0,0,0,0,"Movies & TV,Genre for Featured Categories,Acti..."


#### Content-based filtering and Hybrid model
Content-based filtering method: use the function cosine_similarity to calculate the 
<br> Ref: https://medium.com/web-mining-is688-spring-2021/content-based-movie-recommendation-system-72f122641eab


In [13]:
def create_text_data(x):
    return ' '.join(x['clean_description'])+' '.join(x['sub_categories'])


In [19]:
# TF-IDF Vectorization for text data. Prepare for content-based model:
tfidf_vectorizer = TfidfVectorizer()

In [32]:
# VER2
def hybrid_model(user_id,df):
    # USER_BASED MODEL:
    result_userbased = user_based_recommendations(user_id,df)
    # Get target user's viewed movies:
    user_purchased_movies = df[df['user_id'] == user_id]['parent_asin'].unique()
    user_purchased_movies_df=df[df['parent_asin'].isin(user_purchased_movies)].drop_duplicates(subset='parent_asin')
    # merge the user's watched movies with movies recommended by similar users:
    df_merge = pd.concat([user_purchased_movies_df,result_userbased],ignore_index=True)
    # CONTENT_ BASED MODEL:
    # select relative columns only:
    df_merge=df_merge[['parent_asin', 'user_id','product_name', 'rating','average_rating', 'rating_number','price',  'clean_description','sub_categories']]
    # create content column:
    df_merge['text_content']=df_merge.apply(create_text_data,axis=1)
    # TF-IDF Vectorization for text data.
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_merge['text_content'])
    result = pd.DataFrame()
    num_recommendations = 1 #each targeted movie will have 3 recommendations
    for i in user_purchased_movies:
        # get the index of targeted product in df:
        product_idx = df_merge[df_merge['parent_asin'] == i].index[0]
        # calculate similarity score :
        tfidf_scores = cosine_similarity(tfidf_matrix[product_idx], tfidf_matrix).flatten()
        movie_recommend = df_merge.iloc[np.where(tfidf_scores > 0.9)]
        result = pd.concat([result,movie_recommend],ignore_index=True)
    # remove movies user watched and duplicates:
    result=result[~result['parent_asin'].isin(user_purchased_movies)].drop_duplicates(subset='parent_asin')
    return result

In [44]:
#testing:
test_2=hybrid_model(df_narrow.iloc[6,3],df_narrow)
print('Hybrid model recommend: ',len(test_2),' movies')
test_2.head(3)

Hybrid model recommend:  85  movies


Unnamed: 0,parent_asin,user_id,product_name,rating,average_rating,rating_number,price,clean_description,sub_categories,text_content
1,B001CO42J8,AHCN6VJ6PAZFH2S3CIK554GOBYUQ,A Charlie Brown Christmas (Remastered Deluxe E...,5.0,4.8,1720.0,13.0,product descriptiona charlie brown christmas d...,"Movies & TV,Holidays & Seasonal,Christmas,Time...",p r o d u c t d e s c r i p t i o n a c h ...
2,B00000G02H,AGSIXL4DPJMDIWMDTJYPT2MBB6TA,Punch-Drunk Love (Two-Disc Special Edition),5.0,4.5,302.0,10.95,product descriptionwinner best director prize ...,"Movies & TV,Studio Specials,Sony Pictures Home...",p r o d u c t d e s c r i p t i o n w i n n ...
6,B00AEFYSEA,AGSIXL4DPJMDIWMDTJYPT2MBB6TA,Once Upon A Time In The West,5.0,4.7,6505.0,12.91,upon time west dvdnow first time sergio leone ...,"Movies & TV,Studio Specials,Warner Home Video,...",u p o n t i m e w e s t d v d n o w f ...


In [48]:
#testing:
test_4=hybrid_model(df_narrow.iloc[100000,3],df_narrow)
print('Hybrid model recommend: ',len(test_4),' movies')
test_4.head(3)

Hybrid model recommend:  144  movies


Unnamed: 0,parent_asin,user_id,product_name,rating,average_rating,rating_number,price,clean_description,sub_categories,text_content
7,B0189HKELU,AHZAJYSL7ZS65MB7XTXUWYT2MJ3Q,The Hunger Games: Complete 4 Film Collection,5.0,4.8,6080.0,20.14,experience epic adventure hunger game start fi...,"Movies & TV,Studio Specials,Lionsgate Home Ent...",e x p e r i e n c e e p i c a d v e n t u ...
8,B01M5EKXCA,AHZAJYSL7ZS65MB7XTXUWYT2MJ3Q,Doctor Strange,5.0,4.7,22150.0,12.15,marvel studio come doctor strange story worldf...,"Movies & TV,Genre for Featured Categories,Acti...",m a r v e l s t u d i o c o m e d o c t ...
9,B00OMC0W9G,AELGZ73C76HZ3TALZMNTHZJYE47Q,Fury [Blu-ray],5.0,4.7,37145.0,14.98,april ally make final push european theatre ba...,"Movies & TV,Action & Adventure,Featured Catego...",a p r i l a l l y m a k e f i n a l p ...
