# import Toolkits

In [46]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib

In [2]:
movies_df = pd.read_csv('/kaggle/input/top-rated-tmdb-movies-10k/top10K-TMDB-movies.csv')
movies_df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [3]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [4]:
movies_df.isna().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

In [7]:
movies_df[movies_df['genre'].isna()]

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
3361,50472,Anplagghed al cinema,,it,"A queue at the ATM machine, a displaced family...",4.42,2006-11-26,7.0,313
7821,43211,7 Kilos in 7 Days,,it,Two not very clever young doctors open a fitne...,5.885,1986-02-02,6.0,212
8518,57114,"Amore, bugie e calcetto",,en,,4.709,2008-04-04,5.8,200


let`s fill the missing values in the genre column

it seems that Anplagghed al cinema movie is a Comedy and Fantasy

7 Kilos in 7 Days is a Comedy movie


Amore, bugie e calcett is a Comedy movie

In [8]:
movies_df['genre'].unique()

array(['Drama,Crime', 'Comedy,Drama,Romance', 'Drama,History,War', ...,
       'Action,TV Movie,Science Fiction,Comedy,Adventure',
       'Action,Science Fiction,War', 'Adventure,Fantasy,Action,Drama'],
      dtype=object)

In [9]:
movies_df.loc[3361 , 'genre'] = 'Comedy,Fantasy'
movies_df.loc[7821 , 'genre'] = 'Comedy'
movies_df.loc[8518 , 'genre'] = 'Comedy'

In [10]:
movies_df.isna().sum()

id                    0
title                 0
genre                 0
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

# Feature Selection

In [12]:
movies_df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [14]:
#we will deal with some feature like [id , title ,genre , overview ]
movies = movies_df[['id' , 'title' ,'genre' , 'overview']]
movies.sample(5)

Unnamed: 0,id,title,genre,overview
4189,87502,Flight,Drama,Commercial airline pilot Whip Whitaker has a p...
274,110416,Song of the Sea,"Family,Animation,Fantasy",The story of the last Seal Child’s journey hom...
2803,249688,The End of the Tour,Drama,The story of the five-day interview between Ro...
437,631,Sunrise: A Song of Two Humans,"Drama,Romance",A married farmer falls under the spell of a sl...
2607,9540,Dead Ringers,"Thriller,Horror","Elliot, a successful gynecologist, works at th..."


In [15]:
movies['tags'] = movies['overview'] + movies['genre']
movies.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['tags'] = movies['overview'] + movies['genre']


Unnamed: 0,id,title,genre,overview,tags
5664,13205,Bambi II,"Animation,Drama,Family",Return to the forest and join Bambi as he reun...,Return to the forest and join Bambi as he reun...
4391,11185,"See No Evil, Hear No Evil","Comedy,Crime",A murder takes place in the shop of David Lyon...,A murder takes place in the shop of David Lyon...
7116,152748,Ain't Them Bodies Saints,"Crime,Drama,Romance","Bob Muldoon and Ruth Guthrie, an impassioned y...","Bob Muldoon and Ruth Guthrie, an impassioned y..."
1659,11236,The Secret Garden,"Drama,Family,Fantasy",A young British girl born and reared in India ...,A young British girl born and reared in India ...
261,582,The Lives of Others,"Drama,Thriller",A tragic love story set in East Berlin with th...,A tragic love story set in East Berlin with th...


In [16]:
movies.drop(columns = ['overview' , 'genre']  , inplace = True)
movies.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.drop(columns = ['overview' , 'genre'] , inplace = True)


Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...


In [19]:
cv = CountVectorizer(max_features= 10000 , stop_words= 'english')
cv

In [23]:
vector = cv.fit_transform(movies['tags'].values.astype('U')).toarray()

In [24]:
vector.shape

(10000, 10000)

In [26]:
similarity = cosine_similarity(vector)

In [27]:
similarity

array([[1.        , 0.05634362, 0.13041013, ..., 0.07559289, 0.11065667,
        0.06900656],
       [0.05634362, 1.        , 0.07715167, ..., 0.        , 0.03636965,
        0.        ],
       [0.13041013, 0.07715167, 1.        , ..., 0.02300219, 0.0673435 ,
        0.09449112],
       ...,
       [0.07559289, 0.        , 0.02300219, ..., 1.        , 0.03253   ,
        0.03042903],
       [0.11065667, 0.03636965, 0.0673435 , ..., 0.03253   , 1.        ,
        0.04454354],
       [0.06900656, 0.        , 0.09449112, ..., 0.03042903, 0.04454354,
        1.        ]])

In [38]:
distance = sorted(list(enumerate(similarity[2])), key = lambda x : x[1] , reverse = True)

for i in distance[0:5]:
    
    print(movies.iloc[i[0]]['title'])

The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City


In [47]:
movies_title = movies['title'].tolist()
len(movies_title)

10000

In [48]:
def recommend(movie_name):
    find_close_matches = difflib.get_close_matches(movie_name , movies_title)
    close_match = find_close_matches[0]
    index = movies[movies['title'] == close_match].index[0]
    distance = sorted(list(enumerate(similarity[index])), key = lambda x : x[1] , reverse = True)
    for i in distance[0:5]:
        print(movies.iloc[i[0]]['title'])

In [50]:
recommend('The God Father')

The Godfather
The Godfather: Part II
Blood Ties
Joker
Bomb City


In [51]:
import pickle

In [52]:
pickle.dump(movies , open('movies_list.pkl' , 'wb'))

In [54]:
pickle.load(open('movies_list.pkl' , 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [55]:
pickle.dump(similarity , open('similarity.pkl' , 'wb'))