# Movie Recommendation System using K-nearest neighbors Model 
## 1. Data Download and Inspection 

### 1.1 Import modules 

In [1]:
import json 
import pandas as pd 

### 1.2 Data download 

In [2]:
movies = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv")
credits = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv")

### 1.3 Data inspection 

In [3]:
movies.head().T

Unnamed: 0,0,1,2,3,4
budget,237000000,300000000,245000000,250000000,260000000
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."
homepage,http://www.avatarmovie.com/,http://disney.go.com/disneypictures/pirates/,http://www.sonypictures.com/movies/spectre/,http://www.thedarkknightrises.com/,http://movies.disney.com/john-carter
id,19995,285,206647,49026,49529
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."
original_language,en,en,en,en,en
original_title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha...",A cryptic message from Bond’s past sends him o...,Following the death of District Attorney Harve...,"John Carter is a war-weary, former military ca..."
popularity,150.437577,139.082615,107.376788,112.31295,43.926995
production_companies,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}]"


In [4]:
credits.head().T

Unnamed: 0,0,1,2,3,4
movie_id,19995,285,206647,49026,49529
title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter
cast,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."
crew,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


- 'Id' from movies df and 'movie_id' from credits df are same, confirming matching data movies in each row! Both dataframes can be joined together for further processing! 

In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

- Some missing data in columns: homepage, overview, release_date, runtime, tagline
- Mix of both numerical and categorical features in the dataframe! 

In [6]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


- No missing data! 

### 1.4 Join two dataframes 

In [7]:
credits.rename({"movie_id": "id"}, axis=1, inplace=True) #renaming the column so we can merge the two datasets 
credits.head().T

Unnamed: 0,0,1,2,3,4
id,19995,285,206647,49026,49529
title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter
cast,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c..."
crew,"[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
data_df = pd.merge(movies, credits, on='id', how='outer')
data_df.head().T

Unnamed: 0,0,1,2,3,4
budget,4000000,11000000,94000000,55000000,15000000
genres,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...","[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 18, ""name"": ""Drama""}]"
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo,,http://www.dreamworks.com/ab/
id,5,11,12,13,14
keywords,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ...","[{""id"": 494, ""name"": ""father son relationship""...","[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...","[{""id"": 255, ""name"": ""male nudity""}, {""id"": 29..."
original_language,en,en,en,en,en
original_title,Four Rooms,Star Wars,Finding Nemo,Forrest Gump,American Beauty
overview,It's Ted the Bellhop's first night on the job....,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in..."
popularity,22.87623,126.393695,85.688789,138.133331,80.878605
production_companies,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...","[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...","[{""name"": ""Pixar Animation Studios"", ""id"": 3}]","[{""name"": ""Paramount Pictures"", ""id"": 4}]","[{""name"": ""DreamWorks SKG"", ""id"": 27}, {""name""..."


In [9]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

- Merge worked but with 3 features referring to 'movie title' 
- cleanup the movie title issue from the data frame!

In [10]:
data_df.drop(['title_x', 'title_y'], axis=1, inplace=True)

In [11]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [12]:
data_df.rename({"original_title": "title"}, axis=1, inplace=True)

In [13]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   title                 4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [14]:
data_df['cast']

0       [{"cast_id": 42, "character": "Ted the Bellhop...
1       [{"cast_id": 3, "character": "Luke Skywalker",...
2       [{"cast_id": 8, "character": "Marlin (voice)",...
3       [{"cast_id": 7, "character": "Forrest Gump", "...
4       [{"cast_id": 6, "character": "Lester Burnham",...
                              ...                        
4798    [{"cast_id": 1, "character": "Dawn", "credit_i...
4799    [{"cast_id": 4, "character": "Smith Bhatnagar"...
4800    [{"cast_id": 3, "character": "Amber", "credit_...
4801                                                   []
4802    [{"cast_id": 0, "character": "Narrator", "cred...
Name: cast, Length: 4803, dtype: object

In [15]:
data_df['keywords']

0       [{"id": 612, "name": "hotel"}, {"id": 613, "na...
1       [{"id": 803, "name": "android"}, {"id": 4270, ...
2       [{"id": 494, "name": "father son relationship"...
3       [{"id": 422, "name": "vietnam veteran"}, {"id"...
4       [{"id": 255, "name": "male nudity"}, {"id": 29...
                              ...                        
4798                                                   []
4799                                                   []
4800    [{"id": 10060, "name": "christian film"}, {"id...
4801                                                   []
4802    [{"id": 6027, "name": "music"}, {"id": 225822,...
Name: keywords, Length: 4803, dtype: object

In [16]:
data_df['genres']

0       [{"id": 80, "name": "Crime"}, {"id": 35, "name...
1       [{"id": 12, "name": "Adventure"}, {"id": 28, "...
2       [{"id": 16, "name": "Animation"}, {"id": 10751...
3       [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...
4                           [{"id": 18, "name": "Drama"}]
                              ...                        
4798                       [{"id": 27, "name": "Horror"}]
4799    [{"id": 35, "name": "Comedy"}, {"id": 10751, "...
4800    [{"id": 53, "name": "Thriller"}, {"id": 18, "n...
4801                    [{"id": 10751, "name": "Family"}]
4802                  [{"id": 99, "name": "Documentary"}]
Name: genres, Length: 4803, dtype: object

## 2. EDA

In [28]:
encoded_data_df= data_df.copy()
encoded_data_df.head().T

Unnamed: 0,0,1,2,3,4
budget,4000000,11000000,94000000,55000000,15000000
genres,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...","[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 18, ""name"": ""Drama""}]"
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo,,http://www.dreamworks.com/ab/
id,5,11,12,13,14
keywords,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ...","[{""id"": 494, ""name"": ""father son relationship""...","[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...","[{""id"": 255, ""name"": ""male nudity""}, {""id"": 29..."
original_language,en,en,en,en,en
title,Four Rooms,Star Wars,Finding Nemo,Forrest Gump,American Beauty
overview,It's Ted the Bellhop's first night on the job....,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in..."
popularity,22.87623,126.393695,85.688789,138.133331,80.878605
production_companies,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...","[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...","[{""name"": ""Pixar Animation Studios"", ""id"": 3}]","[{""name"": ""Paramount Pictures"", ""id"": 4}]","[{""name"": ""DreamWorks SKG"", ""id"": 27}, {""name""..."


### 2.1 Feature Encoding 
#### 2.1.1 Cleanup JSON formatted data 

In [29]:
encoded_data_df['cast'][0]

'[{"cast_id": 42, "character": "Ted the Bellhop", "credit_id": "52fe420dc3a36847f80001b7", "gender": 2, "id": 3129, "name": "Tim Roth", "order": 0}, {"cast_id": 31, "character": "Man", "credit_id": "52fe420dc3a36847f800018b", "gender": 2, "id": 3131, "name": "Antonio Banderas", "order": 1}, {"cast_id": 29, "character": "Angela", "credit_id": "52fe420dc3a36847f8000183", "gender": 1, "id": 3130, "name": "Jennifer Beals", "order": 2}, {"cast_id": 25, "character": "Elspeth", "credit_id": "52fe420dc3a36847f8000173", "gender": 1, "id": 3125, "name": "Madonna", "order": 3}, {"cast_id": 41, "character": "Margaret", "credit_id": "52fe420dc3a36847f80001b3", "gender": 1, "id": 3141, "name": "Marisa Tomei", "order": 4}, {"cast_id": 43, "character": "Leo", "credit_id": "52fe420dc3a36847f80001bb", "gender": 2, "id": 62, "name": "Bruce Willis", "order": 5}, {"cast_id": 38, "character": "Chester Rush", "credit_id": "52fe420dc3a36847f80001a7", "gender": 2, "id": 138, "name": "Quentin Tarantino", "order

In [30]:
encoded_data_df['cast']=data_df['cast'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)
encoded_data_df['cast'][0]

['Tim Roth', 'Antonio Banderas', 'Jennifer Beals']

In [31]:
encoded_data_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
budget,4000000,11000000,94000000,55000000,15000000,12800000,90000000,92620000,0,140000000,...,0,0,0,200,130000000,0,0,0,0,2
genres,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...","[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 878, ""nam...","[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",...,"[{""id"": 27, ""name"": ""Horror""}]","[{""id"": 27, ""name"": ""Horror""}]","[{""id"": 36, ""name"": ""History""}, {""id"": 18, ""na...","[{""id"": 18, ""name"": ""Drama""}]","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 27, ""name"": ""Horror""}]","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10751, ""...","[{""id"": 53, ""name"": ""Thriller""}, {""id"": 18, ""n...","[{""id"": 10751, ""name"": ""Family""}]","[{""id"": 99, ""name"": ""Documentary""}]"
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo,,http://www.dreamworks.com/ab/,,,,http://www.clubcultura.com/clubcine/clubcineas...,http://disney.go.com/disneyvideos/liveaction/p...,...,,,,,,,http://www.growingupsmithmovie.com,,,
id,5,11,12,13,14,16,18,19,20,22,...,396152,402515,407887,408429,417859,426067,426469,433715,447027,459488
keywords,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ...","[{""id"": 494, ""name"": ""father son relationship""...","[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...","[{""id"": 255, ""name"": ""male nudity""}, {""id"": 29...","[{""id"": 30, ""name"": ""individual""}, {""id"": 246,...","[{""id"": 402, ""name"": ""clone""}, {""id"": 444, ""na...","[{""id"": 312, ""name"": ""man vs machine""}, {""id"":...","[{""id"": 455, ""name"": ""farewell""}, {""id"": 457, ...","[{""id"": 911, ""name"": ""exotic island""}, {""id"": ...",...,[],"[{""id"": 321, ""name"": ""terror""}, {""id"": 8087, ""...","[{""id"": 10586, ""name"": ""korea""}, {""id"": 229851...",[],"[{""id"": 4414, ""name"": ""adventure""}, {""id"": 618...",[],[],"[{""id"": 10060, ""name"": ""christian film""}, {""id...",[],"[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,..."
original_language,en,en,en,en,en,en,en,de,en,en,...,en,pt,ko,en,en,en,en,en,en,en
title,Four Rooms,Star Wars,Finding Nemo,Forrest Gump,American Beauty,Dancer in the Dark,The Fifth Element,Metropolis,My Life Without Me,Pirates of the Caribbean: The Curse of the Bla...,...,Restoration,Solitude,인천상륙작전,Perfect Cowboy,Puss in Boots,Midnight Cabaret,Growing Up Smith,8 Days,Running Forever,"To Be Frank, Sinatra at 100"
overview,It's Ted the Bellhop's first night on the job....,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in...","Selma, a Czech immigrant on the verge of blind...","In 2257, a taxi driver is unintentionally give...",In a futuristic city sharply divided between t...,A Pedro Almodovar production in which a fatall...,"Jack Sparrow, a freewheeling 17th-century pira...",...,"During home renovations, a young couple releas...",After finding an old storage locker filled wit...,A squad of soldiers fight in the Korean War's ...,Two gay fathers of a straight country western ...,"Long before he even met Shrek, the notorious f...",A Broadway producer puts on a play with a Devi...,"In 1979, an Indian family moves to America wit...","After sneaking to a party with her friends, 16...",After being estranged since her mother's death...,
popularity,22.87623,126.393695,85.688789,138.133331,80.878605,22.022228,109.528572,32.351527,7.958831,271.972889,...,2.525569,0.018716,6.116436,0.050978,20.678787,0.001389,0.71087,0.015295,0.028756,0.050625
production_companies,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...","[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...","[{""name"": ""Pixar Animation Studios"", ""id"": 3}]","[{""name"": ""Paramount Pictures"", ""id"": 4}]","[{""name"": ""DreamWorks SKG"", ""id"": 27}, {""name""...","[{""name"": ""Fine Line Features"", ""id"": 8}, {""na...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""name"": ""El Deseo"", ""id"": 49}, {""name"": ""Mil...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,[],"[{""name"": ""Gravitas Ventures"", ""id"": 44632}]","[{""name"": ""Taewon Entertainment"", ""id"": 3965}]",[],"[{""name"": ""DreamWorks"", ""id"": 7}]",[],[],"[{""name"": ""After Eden Pictures"", ""id"": 85248}]","[{""name"": ""New Kingdom Pictures"", ""id"": 41671}]","[{""name"": ""Eyeline Entertainment"", ""id"": 60343}]"


In [32]:
encoded_data_df['genres'][0]

'[{"id": 80, "name": "Crime"}, {"id": 35, "name": "Comedy"}]'

In [33]:
encoded_data_df['genres']=data_df['genres'].apply( lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)
encoded_data_df['genres'][0]


['Crime', 'Comedy']

In [35]:
encoded_data_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
budget,4000000,11000000,94000000,55000000,15000000,12800000,90000000,92620000,0,140000000,...,0,0,0,200,130000000,0,0,0,0,2
genres,"[Crime, Comedy]","[Adventure, Action, Science Fiction]","[Animation, Family]","[Comedy, Drama, Romance]",[Drama],"[Drama, Crime, Music]","[Adventure, Fantasy, Action]","[Drama, Science Fiction]","[Drama, Romance]","[Adventure, Fantasy, Action]",...,[Horror],[Horror],"[History, Drama, War]",[Drama],"[Action, Adventure, Animation]",[Horror],"[Comedy, Family, Drama]","[Thriller, Drama]",[Family],[Documentary]
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo,,http://www.dreamworks.com/ab/,,,,http://www.clubcultura.com/clubcine/clubcineas...,http://disney.go.com/disneyvideos/liveaction/p...,...,,,,,,,http://www.growingupsmithmovie.com,,,
id,5,11,12,13,14,16,18,19,20,22,...,396152,402515,407887,408429,417859,426067,426469,433715,447027,459488
keywords,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...","[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ...","[{""id"": 494, ""name"": ""father son relationship""...","[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...","[{""id"": 255, ""name"": ""male nudity""}, {""id"": 29...","[{""id"": 30, ""name"": ""individual""}, {""id"": 246,...","[{""id"": 402, ""name"": ""clone""}, {""id"": 444, ""na...","[{""id"": 312, ""name"": ""man vs machine""}, {""id"":...","[{""id"": 455, ""name"": ""farewell""}, {""id"": 457, ...","[{""id"": 911, ""name"": ""exotic island""}, {""id"": ...",...,[],"[{""id"": 321, ""name"": ""terror""}, {""id"": 8087, ""...","[{""id"": 10586, ""name"": ""korea""}, {""id"": 229851...",[],"[{""id"": 4414, ""name"": ""adventure""}, {""id"": 618...",[],[],"[{""id"": 10060, ""name"": ""christian film""}, {""id...",[],"[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,..."
original_language,en,en,en,en,en,en,en,de,en,en,...,en,pt,ko,en,en,en,en,en,en,en
title,Four Rooms,Star Wars,Finding Nemo,Forrest Gump,American Beauty,Dancer in the Dark,The Fifth Element,Metropolis,My Life Without Me,Pirates of the Caribbean: The Curse of the Bla...,...,Restoration,Solitude,인천상륙작전,Perfect Cowboy,Puss in Boots,Midnight Cabaret,Growing Up Smith,8 Days,Running Forever,"To Be Frank, Sinatra at 100"
overview,It's Ted the Bellhop's first night on the job....,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in...","Selma, a Czech immigrant on the verge of blind...","In 2257, a taxi driver is unintentionally give...",In a futuristic city sharply divided between t...,A Pedro Almodovar production in which a fatall...,"Jack Sparrow, a freewheeling 17th-century pira...",...,"During home renovations, a young couple releas...",After finding an old storage locker filled wit...,A squad of soldiers fight in the Korean War's ...,Two gay fathers of a straight country western ...,"Long before he even met Shrek, the notorious f...",A Broadway producer puts on a play with a Devi...,"In 1979, an Indian family moves to America wit...","After sneaking to a party with her friends, 16...",After being estranged since her mother's death...,
popularity,22.87623,126.393695,85.688789,138.133331,80.878605,22.022228,109.528572,32.351527,7.958831,271.972889,...,2.525569,0.018716,6.116436,0.050978,20.678787,0.001389,0.71087,0.015295,0.028756,0.050625
production_companies,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...","[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...","[{""name"": ""Pixar Animation Studios"", ""id"": 3}]","[{""name"": ""Paramount Pictures"", ""id"": 4}]","[{""name"": ""DreamWorks SKG"", ""id"": 27}, {""name""...","[{""name"": ""Fine Line Features"", ""id"": 8}, {""na...","[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...","[{""name"": ""El Deseo"", ""id"": 49}, {""name"": ""Mil...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,[],"[{""name"": ""Gravitas Ventures"", ""id"": 44632}]","[{""name"": ""Taewon Entertainment"", ""id"": 3965}]",[],"[{""name"": ""DreamWorks"", ""id"": 7}]",[],[],"[{""name"": ""After Eden Pictures"", ""id"": 85248}]","[{""name"": ""New Kingdom Pictures"", ""id"": 41671}]","[{""name"": ""Eyeline Entertainment"", ""id"": 60343}]"


In [42]:
encoded_data_df['keywords'][0]

'[{"id": 612, "name": "hotel"}, {"id": 613, "name": "new year\'s eve"}, {"id": 616, "name": "witch"}, {"id": 622, "name": "bet"}, {"id": 922, "name": "hotel room"}, {"id": 2700, "name": "sperm"}, {"id": 12670, "name": "los angeles"}, {"id": 160488, "name": "hoodlum"}, {"id": 187056, "name": "woman director"}, {"id": 198129, "name": "episode film"}]'

In [46]:
encoded_data_df['keywords'] =data_df['keywords'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else 'none')
encoded_data_df['keywords'][0]

['hotel', "new year's eve", 'witch']

In [None]:
encoded_data_df['production_companies'][0]

'[{"name": "Miramax Films", "id": 14}, {"name": "A Band Apart", "id": 59}]'

In [48]:
encoded_data_df['production_companies'] =data_df['production_companies'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else 'none')
encoded_data_df['production_companies'][0]

['Miramax Films', 'A Band Apart']

In [50]:
encoded_data_df['spoken_languages'][0]

'[{"iso_639_1": "en", "name": "English"}]'

In [51]:
encoded_data_df['spoken_languages'] = data_df['spoken_languages'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else 'none')
encoded_data_df['spoken_languages'][0]

['English']

In [52]:
encoded_data_df['crew'][0]

'[{"credit_id": "52fe420dc3a36847f800012d", "department": "Directing", "gender": 1, "id": 3110, "job": "Director", "name": "Allison Anders"}, {"credit_id": "52fe420dc3a36847f80001c9", "department": "Writing", "gender": 1, "id": 3110, "job": "Writer", "name": "Allison Anders"}, {"credit_id": "52fe420dc3a36847f8000133", "department": "Directing", "gender": 2, "id": 3111, "job": "Director", "name": "Alexandre Rockwell"}, {"credit_id": "52fe420dc3a36847f8000151", "department": "Production", "gender": 2, "id": 3111, "job": "Executive Producer", "name": "Alexandre Rockwell"}, {"credit_id": "52fe420dc3a36847f80001cf", "department": "Writing", "gender": 2, "id": 3111, "job": "Writer", "name": "Alexandre Rockwell"}, {"credit_id": "52fe420dc3a36847f8000139", "department": "Directing", "gender": 0, "id": 2294, "job": "Director", "name": "Robert Rodriguez"}, {"credit_id": "52fe420dc3a36847f80001d5", "department": "Writing", "gender": 0, "id": 2294, "job": "Writer", "name": "Robert Rodriguez"}, {"c

In [53]:
encoded_data_df['crew'] = data_df['crew'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else 'none')
encoded_data_df['crew'][0]

['Allison Anders', 'Allison Anders', 'Alexandre Rockwell']

In [54]:
encoded_data_df.head().T

Unnamed: 0,0,1,2,3,4
budget,4000000,11000000,94000000,55000000,15000000
genres,"[Crime, Comedy]","[Adventure, Action, Science Fiction]","[Animation, Family]","[Comedy, Drama, Romance]",[Drama]
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo,,http://www.dreamworks.com/ab/
id,5,11,12,13,14
keywords,"[hotel, new year's eve, witch]","[android, galaxy, hermit]","[father son relationship, harbor, underwater]","[vietnam veteran, hippie, mentally disabled]","[male nudity, female nudity, adultery]"
original_language,en,en,en,en,en
title,Four Rooms,Star Wars,Finding Nemo,Forrest Gump,American Beauty
overview,It's Ted the Bellhop's first night on the job....,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in..."
popularity,22.87623,126.393695,85.688789,138.133331,80.878605
production_companies,"[Miramax Films, A Band Apart]","[Lucasfilm, Twentieth Century Fox Film Corpora...",[Pixar Animation Studios],[Paramount Pictures],"[DreamWorks SKG, Jinks/Cohen Company]"


#### 2.1.2 convert 'Overview' feature to a list 

In [55]:
encoded_data_df['overview']=data_df['overview'].apply(lambda x: [x if pd.notna(x) else 'none'])
encoded_data_df['overview'][0]

["It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another."]

In [57]:
encoded_data_df.head(3).T

Unnamed: 0,0,1,2
budget,4000000,11000000,94000000
genres,"[Crime, Comedy]","[Adventure, Action, Science Fiction]","[Animation, Family]"
homepage,,http://www.starwars.com/films/star-wars-episod...,http://movies.disney.com/finding-nemo
id,5,11,12
keywords,"[hotel, new year's eve, witch]","[android, galaxy, hermit]","[father son relationship, harbor, underwater]"
original_language,en,en,en
title,Four Rooms,Star Wars,Finding Nemo
overview,[It's Ted the Bellhop's first night on the job...,[Princess Leia is captured and held hostage by...,"[Nemo, an adventurous young clownfish, is unex..."
popularity,22.87623,126.393695,85.688789
production_companies,"[Miramax Films, A Band Apart]","[Lucasfilm, Twentieth Century Fox Film Corpora...",[Pixar Animation Studios]


#### 2.1.3 Combine and encode features 

In [60]:
encoded_data_df["tags"]=encoded_data_df["overview"] + encoded_data_df["cast"] + encoded_data_df["genres"] + encoded_data_df["keywords"] 
encoded_data_df["tags"]=encoded_data_df["tags"].apply(lambda x: ', '.join(x))
encoded_data_df["tags"][0]

"It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another., Tim Roth, Antonio Banderas, Jennifer Beals, Crime, Comedy, hotel, new year's eve, witch"

### 2.2 Feature selection 

In [62]:
tags = encoded_data_df["tags"]
print(tags[0])

It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another., Tim Roth, Antonio Banderas, Jennifer Beals, Crime, Comedy, hotel, new year's eve, witch


## 3. Model Training 

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(tags)
print(tfidf_matrix[0])


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 48 stored elements and shape (1, 25748)>
  Coords	Values
  (0, 12096)	0.1565104009948833
  (0, 22880)	0.18588996900413085
  (0, 23051)	0.09346761493199393
  (0, 2386)	0.23878809842980947
  (0, 8756)	0.1092159946038139
  (0, 16094)	0.11916095725422207
  (0, 16517)	0.05598935047287782
  (0, 12368)	0.12469026510440573
  (0, 1118)	0.03517877443636785
  (0, 11105)	0.2850594717632179
  (0, 24499)	0.12840795574863212
  (0, 24212)	0.1723946991034821
  (0, 10123)	0.18790453946328864
  (0, 1398)	0.0768830607479857
  (0, 403)	0.09643929326308162
  (0, 23296)	0.03459285168233992
  (0, 17586)	0.13271585486253826
  (0, 10836)	0.07977852678403118
  (0, 11541)	0.04114055815170533
  (0, 21485)	0.12000788746746673
  (0, 16744)	0.17905817198606314
  (0, 17950)	0.22776582909393336
  (0, 20495)	0.13530679166880422
  (0, 23048)	0.06195349104052079
  (0, 23122)	0.0907812179692401
  (0, 8024)	0.18401444877667442
  (0, 19756)	0.15947065723388304
  (

In [66]:
model=NearestNeighbors(n_neighbors=5, algorithm="brute", metric="cosine")
fit_result=model.fit(tfidf_matrix)

## 4. Movie recommender function 

In [69]:
def get_movie_recommendations(movie_title):
  
    movie_index = encoded_data_df[encoded_data_df["title"] == movie_title].index[0]

    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
   
    similar_movies = [(encoded_data_df["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    
    return similar_movies[1:]

In [70]:
input_movie = "How to Train Your Dragon"
recommendations = get_movie_recommendations(input_movie)
print("Film recommendations '{}'".format(input_movie))
for movie, distance in recommendations:
    print("- Film: {}".format(movie))

Film recommendations 'How to Train Your Dragon'
- Film: How to Train Your Dragon 2
- Film: Dragon Nest: Warriors' Dawn
- Film: Pete's Dragon
- Film: Eragon


## 5. Model Deployment 

In [71]:
import pickle

# Save the assets
pickle.dump(model, open("../models/model.pkl", "wb"))
pickle.dump(tfidf_matrix, open("../data/tfidf_matrix.pkl", "wb"))
pickle.dump(encoded_data_df, open("../data/encoded_features_df.pkl", "wb"))