<a href="https://colab.research.google.com/github/A7sultan/Python-DA-ML-portfolio/blob/main/Aminah_Individual_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing all necessary libraries and their modules

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
movies=pd.read_csv('/content/movies.csv')   #reading the movie dataset
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
ratings=pd.read_csv('/content/ratings.csv')  #reading the rating dataset
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
movies.info()   #information of movie dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [9]:
movies['genres'].value_counts() #finding out the generes and their values

Drama                                                  1053
Comedy                                                  946
Comedy|Drama                                            435
Comedy|Romance                                          363
Drama|Romance                                           349
                                                       ... 
Action|Crime|Horror|Mystery|Thriller                      1
Adventure|Animation|Children|Comedy|Musical|Romance       1
Action|Adventure|Animation|Comedy|Crime|Mystery           1
Children|Comedy|Fantasy|Sci-Fi                            1
Action|Animation|Comedy|Fantasy                           1
Name: genres, Length: 951, dtype: int64

In [10]:
#removing the production year from the title
movies["production_year"] = movies["title"].str[-5:-1]
movies["title"] = movies["title"].str[:-7]
movies.reset_index(inplace=True)

movies.head(2)

Unnamed: 0,index,movieId,title,genres,production_year
0,0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,2,Jumanji,Adventure|Children|Fantasy,1995


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer  #Convert a collection of raw documents to a matrix of TF-IDF features
tfv = TfidfVectorizer(min_df=3,  max_features=None,          #minimum default is set to 3 and max features  to none
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',  #character normalization, feature should be a word or character of n-grams
            ngram_range=(1, 3),
            stop_words = 'english')

In [12]:
tfv_matrix = tfv.fit_transform(movies['genres'])
print(tfv_matrix)
print(tfv_matrix.shape)

  (0, 162)	0.3843006937736884
  (0, 127)	0.3440430311139633
  (0, 69)	0.3440430311139633
  (0, 202)	0.31787124478546813
  (0, 159)	0.2888770821219739
  (0, 126)	0.3010431960646841
  (0, 68)	0.314531443262947
  (0, 305)	0.23737018768536186
  (0, 182)	0.13150796802099715
  (0, 158)	0.24811119923146882
  (0, 125)	0.25370401204064785
  (0, 67)	0.20486284716809808
  (1, 77)	0.5571981702858497
  (1, 174)	0.4812876257171505
  (1, 74)	0.4432411743911896
  (1, 305)	0.3035405079212969
  (1, 158)	0.31727572939997495
  (1, 67)	0.2619712832937288
  (2, 224)	0.7695974416123483
  (2, 380)	0.5242383036039113
  (2, 182)	0.36454626441402677
  (3, 198)	0.5862556442494471
  (3, 291)	0.457368792379169
  (3, 193)	0.4388867391920914
  (3, 268)	0.23529026192425037
  :	:
  (9734, 193)	0.7846149876753742
  (9734, 268)	0.42063760299449465
  (9734, 182)	0.4554594691761476
  (9735, 125)	1.0
  (9736, 263)	1.0
  (9737, 16)	0.49880738884140885
  (9737, 136)	0.46145619690055906
  (9737, 14)	0.39146023742804137
  (9737

In [13]:
from sklearn.metrics.pairwise import sigmoid_kernel  # implements utilities to evaluate pairwise distances or affinity of sets of samples

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
print(sig[0])

[0.76263689 0.76180765 0.76164424 ... 0.76159416 0.76172628 0.76173151]


In [14]:
# Reverse mapping of indices and movie titles so we can search using titles
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
print(indices)
print(indices['Andrew Dice Clay: Dice Rules'])
print(sig[9741])
print(list(enumerate(sig[indices['Andrew Dice Clay: Dice Rules']])))
print(sorted(list(enumerate(sig[indices['Andrew Dice Clay: Dice Rules']])), key=lambda x: x[1], reverse=True))

title
Toy Story                                0
Jumanji                                  1
Grumpier Old Men                         2
Waiting to Exhale                        3
Father of the Bride Part II              4
                                      ... 
Black Butler: Book of the Atlantic    9737
No Game No Life: Zero                 9738
Flint                                 9739
Bungo Stray Dogs: Dead Apple          9740
Andrew Dice Clay: Dice Rules          9741
Length: 9742, dtype: int64
9741
[0.76173151 0.76159416 0.76197474 ... 0.76159416 0.76159416 0.76263689]
[(0, 0.7617315097208257), (1, 0.7615941559557649), (2, 0.7619747389701566), (3, 0.7618601872178122), (4, 0.7626368906233343), (5, 0.7615941559557649), (6, 0.7619747389701566), (7, 0.7615941559557649), (8, 0.7615941559557649), (9, 0.7615941559557649), (10, 0.7618601872178122), (11, 0.7619108495586182), (12, 0.7615941559557649), (13, 0.7615941559557649), (14, 0.7615941559557649), (15, 0.7615941559557649), (16, 0.761

In [15]:
def recomendations(title, sig=sig): #function for creating a recommendation system
    # Get the index corresponding to titles
    idx = indices[title]
    # Get the pairwise similarity scores for all
    sig_scores = list(enumerate(sig[idx]))
    # Sort the movies
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)
    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:11]
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]
    # Top 10 most similar movies
    return movies['title'].iloc[movie_indices]

In [17]:
print(recomendations('Shrek'))

7530                            Gnomeo & Juliet
7805     Puss in Boots (Nagagutsu o haita neko)
0                                     Toy Story
1706                                       Antz
2355                                Toy Story 2
2809    Adventures of Rocky and Bullwinkle, The
3000                  Emperor's New Groove, The
3568                             Monsters, Inc.
6194                                  Wild, The
6486                            Shrek the Third
Name: title, dtype: object
