# Machine Learning Model 📽
For making our recommendation system we need to perform these techniques :-

1. Keyword extraction -> Extract keywords from description
2. Count Vectorizer -> Count frequency of words
3. Cosine Similarity -> Find cosine similarity between all movie titles

In [18]:
# This csv provides the indices for the movies 

links_small = pd.read_csv('links_small.csv')
links_small.head(10)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


### Finalizing dataset :-

In [19]:
# These unique movie ids will be used as primary index for referring each movie

links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [20]:
movies[movies['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,month


In [21]:
# we need to make sure that the ID column of our main dataframe is clean and of (type) integer to impliment error free cosine function 


def convert(n):   # integer conversion of IDs 
    try:
        return int(n)
    except:
        return np.nan


In [22]:
movies['id'] = movies['id'].apply(convert)

In [23]:
movies['id'].isnull().sum()  # fiding any null values in IDs

3

In [24]:
movies[movies['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,month
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,NaT,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,NaT,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,NaT,


In [25]:
movies = movies.drop([19730, 29503, 35587]) 

In [26]:
movies['id'] = movies['id'].astype('int')

In [27]:
# filtering all the movies which have their ID's in links_small dataset

Fmovies = movies[movies['id'].isin(links_small)]  
Fmovies.shape

(9099, 26)

In [28]:
# building model based on text of every movie in the dataset & then use it over cosine similarity to predict nearest recommendations

Fmovies['tagline'] = Fmovies['tagline'].fillna('')
Fmovies['description'] = Fmovies['overview'] + Fmovies['tagline']
Fmovies['description'] = Fmovies['description'].fillna('')  # replacing NaN values with an empty string

In [29]:
Fmovies.to_csv('Fmovies.txt', index=False, sep='\t')

In [30]:
ls -lah Fmovies.csv

-rw-rw-r-- 1 dhruv11 dhruv11 20K Sep 12 17:42 Fmovies.csv


In [31]:
# constructing the required TF-IDF matrix by fitting and transforming the data

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english') 
tfidf_matrix = tf.fit_transform(Fmovies['description'])

In [34]:
# initializing cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix) 

In [35]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [36]:
# trying to construct a reverse map of indices & movie titles.

Fmovies = Fmovies.reset_index()
titles = Fmovies['title']
indices = pd.Series(Fmovies.index, index=Fmovies['title'])

In [37]:
def get_recommendations(title):
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim[idx])) # getting the pairwsie similarity scores of all movies with that movie
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sorting the movies based on the similarity scores
    
    sim_scores = sim_scores[1:11] # getting the scores of the 10 most similar movies
    
    movie_indices = [i[0] for i in sim_scores]  # gettng the movie indices
    
    return titles.iloc[movie_indices]  # Returning the top 10 most similar movies


In [38]:
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

In [39]:
get_recommendations('Iron Man').head(10)

8285                 Iron Man 3
7506                 Iron Man 2
8758    Avengers: Age of Ultron
8090                      Brake
6055                    Hostage
5661                   Scarface
279           The Swan Princess
7082                       Igor
3009          Missing in Action
5177            The Magic Flute
Name: title, dtype: object