# Problem - 1

### Q1) Build a recommender system with the given data using UBCF.

### This dataset is related to the video gaming industry and a survey was conducted to build a recommendation engine so that the store can improve the sales of its gaming DVDs. Snapshot of the dataset is given below. Build a Recommendation Engine and suggest top selling DVDs to the store customers.


In [11]:
#import required libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
#reading data into python
game = pd.read_csv(r"D:\360 Data Science 360DigiTMG\Day18-Recomondation engine and network anlytics\Datasets_Recommendation Engine\game.csv")
game

Unnamed: 0,userId,game,rating
0,3,The Legend of Zelda: Ocarina of Time,4.0
1,6,Tony Hawk's Pro Skater 2,5.0
2,8,Grand Theft Auto IV,4.0
3,10,SoulCalibur,4.0
4,11,Grand Theft Auto IV,4.5
...,...,...,...
4995,4529,Donut County,2.5
4996,4533,MotorStorm: Apocalypse,3.0
4997,4544,The Last Guy,3.0
4998,4548,Valiant Hearts: The Great War,4.0


In [6]:
#shape of the data 
game.shape

(5000, 3)

In [9]:
#columns of the data
game.columns

Index(['userId', 'game', 'rating'], dtype='object')

In [10]:
#check missing values
game.isnull().sum()

userId    0
game      0
rating    0
dtype: int64

In [12]:
#create a tfidf vectorizer to remove all stop words
tfidf = TfidfVectorizer(stop_words='english')

In [16]:
# replacing the NaN values in overview column with empty string
game['game'].isna().sum()
game['game'] = game['game'].fillna(' ')

In [17]:
#preparing the tfidf vector for fit and transform
tfidf_matrix = tfidf.fit_transform(game.game)

In [19]:
tfidf_matrix.shape

(5000, 3068)

In [20]:
#computing the cosine similarity on Tfidf matrix
cosine_similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [21]:
#creating a mapping of game name to index number
game_index= pd.Series(game.index, index=game['game']).drop_duplicates()

In [22]:
game_id = game_index["SoulCalibur"]
game_id

game
SoulCalibur       3
SoulCalibur    3925
dtype: int64

In [25]:
def get_recommendations(Name, topN):
                                                               # topN=10 :    ## getting  the entertainment index using its title
    game_id = game_index[Name]
                                                                          # Getting the pair wise similarity score for all the game's with that 
   
    cosine_scores = list(enumerate(cosine_similarity_matrix[game_id]))
    
    cosine_scores = sorted(cosine_scores, key=lambda x:x[1], reverse = True)     # Sorting the cosine_similarity score based on scores 
    
    cosine_scores_N = cosine_scores[0: topN+1]                                 # Get the scores of top N most similar movies
    
    game_idx  =  [i[0] for i in cosine_scores_N]                                # Getting the game index
    game_scores =  [i[1] for i in cosine_scores_N]
    
    game_similar_show = pd.DataFrame(columns=["name", "Score"])                  # Similar movies and scores
    game_similar_show["name"] = game.iloc[game_idx,1]
    game_similar_show["Score"] = game_scores
    game_similar_show.reset_index(inplace = True)  
    print (game_similar_show)

In [26]:
# Enter your anime and number of anime's to be recommended 
get_recommendations("Metal Gear Solid 2: Sons of Liberty", topN = 10)
game_index["Metal Gear Solid 2: Sons of Liberty"]

    index                                  name     Score
0      39   Metal Gear Solid 2: Sons of Liberty  1.000000
1      92                      Metal Gear Solid  0.695207
2    2402                      Metal Gear Solid  0.695207
3     463        Metal Gear Solid HD Collection  0.558334
4     606        Metal Gear Solid HD Collection  0.558334
5    3050        Metal Gear Solid HD Collection  0.558334
6     953         Metal Gear Solid 2: Substance  0.544171
7    1003         Metal Gear Solid 2: Substance  0.544171
8      97       Metal Gear Solid 3: Subsistence  0.534065
9     969        Metal Gear Solid: Portable Ops  0.494686
10     59  Metal Gear Solid V: The Phantom Pain  0.488450


39

# problem - 2

### Q2) The Entertainment Company, which is an online movie watching platform, wants to improve its collection of movies and showcase those that are highly rated and recommend those movies to its customer by their movie watching footprint. For this, the company has collected the data and shared it with you to provide some analytical insights and also to come up with a recommendation algorithm so that it can automate its process for effective recommendations. The ratings are between -9 and +9.

In [67]:
#importing required libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
#reading the libraries into python

entertainment = pd.read_csv(r"D:\360 Data Science 360DigiTMG\Day18-Recomondation engine and network anlytics\Datasets_Recommendation Engine\Entertainment.csv")
entertainment.head()

Unnamed: 0,Id,Titles,Category,Reviews
0,6973,Toy Story (1995),"Drama, Romance, School, Supernatural",-8.98
1,6778,Jumanji (1995),"Action, Adventure, Drama, Fantasy, Magic, Mili...",8.88
2,9702,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",99.0
3,6769,Waiting to Exhale (1995),"Sci-Fi, Thriller",99.0
4,1123,Father of the Bride Part II (1995),"Action, Comedy, Historical, Parody, Samurai, S...",-0.44


In [69]:
#shape of the data

entertainment.shape

(51, 4)

In [70]:
#columns of the data

entertainment.columns

Index(['Id', 'Titles', 'Category', 'Reviews'], dtype='object')

In [71]:
# Creating a Tfidf Vectorizer to remove all stop words

tfidf = TfidfVectorizer(stop_words='english')
tfidf

TfidfVectorizer(stop_words='english')

In [72]:
# replacing the NaN values in overview column with empty string

entertainment['Category'].isnull().sum()
entertainment['Category'] = entertainment['Category'].fillna(" ")

In [73]:
# Preparing the Tfidf matrix by fitting and transforming
tfidf_matrix = tfidf.fit_transform(entertainment.Category)
tfidf_matrix

<51x34 sparse matrix of type '<class 'numpy.float64'>'
	with 285 stored elements in Compressed Sparse Row format>

In [74]:
#shape of the tfidf matrix
tfidf_matrix.shape

(51, 34)

In [75]:
# Computing the cosine similarity on Tfidf matrix
cosine_similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_similarity_matrix

array([[1.        , 0.09421367, 0.        , ..., 0.12767481, 0.16772551,
        0.31295101],
       [0.09421367, 1.        , 0.16662513, ..., 0.22332745, 0.        ,
        0.        ],
       [0.        , 0.16662513, 1.        , ..., 0.13383076, 0.        ,
        0.        ],
       ...,
       [0.12767481, 0.22332745, 0.13383076, ..., 1.        , 0.47083158,
        0.17020003],
       [0.16772551, 0.        , 0.        , ..., 0.47083158, 1.        ,
        0.64107498],
       [0.31295101, 0.        , 0.        , ..., 0.17020003, 0.64107498,
        1.        ]])

In [76]:
# creating a mapping of entertainment name to index number
entertainment_index = pd.Series(entertainment.index, index=entertainment['Titles']).drop_duplicates()
entertainment_index.head()

Titles
Toy Story (1995)                      0
Jumanji (1995)                        1
Grumpier Old Men (1995)               2
Waiting to Exhale (1995)              3
Father of the Bride Part II (1995)    4
dtype: int64

In [77]:
entertainment_id = entertainment_index['Jumanji (1995)']
entertainment_id

1

In [107]:
def get_recomondations(Name, topN):  # topN=10 :   
    
    ## getting  the entertainment index using its title
    entertainment_id = entertainment_index[Name]
    
    #Getting the pair wise similarity score for all the anime's with that anime
    cosine_scores = list(enumerate(cosine_similarity_matrix[entertainment_id]))
    
    #Sorting the cosine_similarity scores based on scores
    cosine_scores = sorted(cosine_scores, key=lambda x:x[1], reverse = True)
    
    # Get the scores of top N most similar movies
    cosine_scores_N = cosine_scores[0: topN+1]
    
    #getting the entertainment index
    entertainment_idx  =  [i[0] for i in cosine_scores_N]
    entertainment_scores =  [i[1] for i in cosine_scores_N]
    

    entertainment_similar_show = pd.DataFrame(columns=["name", "Score"])
    entertainment_similar_show["name"] = entertainment.loc[entertainment_idx, "Titles"]
    entertainment_similar_show["Score"] = entertainment_scores
    entertainment_similar_show.reset_index(inplace = True)
    print (entertainment_similar_show)

In [108]:
# Enter your entertainment and number of entertainment's to be recommended 

get_recomondations("How to Make an American Quilt (1995)", topN = 10)
entertainment_index["How to Make an American Quilt (1995)"]

    index                                  name     Score
0      42  How to Make an American Quilt (1995)  1.000000
1      11    Dracula: Dead and Loving It (1995)  0.671482
2       5                           Heat (1995)  0.659052
3      14               Cutthroat Island (1995)  0.659052
4      43           Seven (a.k.a. Se7en) (1995)  0.659052
5      30                Dangerous Minds (1995)  0.613482
6      39                    Restoration (1995)  0.433194
7      20                     Get Shorty (1995)  0.412933
8      32                           Babe (1995)  0.412933
9      44                     Pocahontas (1995)  0.412933
10      2               Grumpier Old Men (1995)  0.405039


42