# STEP 1: Import required libraries

In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# STEP 2: Data imports (3 datasets already mentioned)

Reading all three csv files

In [5]:
df_netflix = pd.read_csv('Netflix tv shows/titles.csv')
df_amazon = pd.read_csv('amazon prime/titles.csv')
df_hbo = pd.read_csv('HBO Max TV/titles.csv')

In [6]:
#pass and concat all dataset into the list and keep the axis = 0 since we need to concat the data row wise.
df = pd.concat([df_netflix, df_amazon, df_hbo], axis=0)
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


These are the features

In [7]:
df.columns

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
       'tmdb_score'],
      dtype='object')

# STEP 3: Data cleaning and preprocessing

In [8]:
#remove duplicate values
df_movies = df.drop_duplicates()
df_movies.duplicated().sum()

0

In [9]:
#drop unnecessary columns
df_movies.drop(['description', 'age_certification'], axis = 1, inplace=True)

dataset after removing unnecessary columns and dropping duplicate values

In [10]:
df_movies

Unnamed: 0,id,title,type,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,51,['documentation'],['US'],1.0,,,,0.600,
1,tm84618,Taxi Driver,MOVIE,1976,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,1972,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.010,7.300
3,tm127384,Monty Python and the Holy Grail,MOVIE,1975,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,1967,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,tm1082718,Romeo Santos: Utopia Live from MetLife Stadium,MOVIE,2021,103,"['romance', 'music']",['PR'],,,,,8.425,8.100
3290,tm1067128,Algo Azul,MOVIE,2021,90,['comedy'],['PA'],,tt9257620,5.9,50.0,1.400,2.000
3291,tm1121489,Entre Nos: What She Said,MOVIE,2021,28,['comedy'],[],,tt15532762,,,,
3292,tm1121486,Entre Nos: The Winners 2,MOVIE,2021,28,['comedy'],[],,tt15532736,,,,


**working with production_countries column**

In [11]:
#as we can see that the countries are in list format, we need to remove the square bracket and '' marks i.e. make values as only string. Also their are some rows that have multiple countries in the list and few of them are empty. we will work on this now.
df['production_countries']

0             ['US']
1             ['US']
2             ['US']
3             ['GB']
4       ['GB', 'US']
            ...     
3289          ['PR']
3290          ['PA']
3291              []
3292              []
3293          ['US']
Name: production_countries, Length: 19015, dtype: object

1. Remove unwanted characters from the 'production_countries' column

In [12]:
#call production_counties and convert it to string.
#now we call replace. The .str.replace() method is used to remove '[' and ']' characters, and any single quote.
# The 'regex=True' flag allows the .str.replace() method to interpret the patterns as regular expressions.

# to remove "[" we use the pattern code as str.replace(r "\[", '' ,regex=True)
#to remove "'" we use pattern code as str.replace(r "'", '' ,regex=True)
# similarly to remove "]" we use the code str.replace(r "\]", '',regex=True)

#replace function here has 3 arguments 1st is the string we want to replace enclosed in "", then the string with which we want to replace (here we used empty string ''), and 3rd is regex
#we can store this value into the same column after modification as
df_movies['production_countries'] = df_movies['production_countries'].str.replace(r"\[",'', regex=True).str.replace(r"'",'', regex=True).str.replace(r"\]", '',regex=True)

df_movies['production_countries']

0           US
1           US
2           US
3           GB
4       GB, US
         ...  
3289        PR
3290        PA
3291          
3292          
3293        US
Name: production_countries, Length: 18980, dtype: object

2. Extract the first country from the cleaned 'production_countries' column

In [13]:
#now we need to get only the 1st country from the column which has more than one country.
#The .str.split(',') splits the string into a list using commas as the delimiter, then .str[0] selects the first element.
#also we can store this into new column 'lead_prod_country' that represents the primary production country of each movie 
df_movies['lead_prod_country'] = df_movies['production_countries'].str.split(',').str[0]
df_movies['lead_prod_country']

0       US
1       US
2       US
3       GB
4       GB
        ..
3289    PR
3290    PA
3291      
3292      
3293    US
Name: lead_prod_country, Length: 18980, dtype: object

3. Calculate the number of countries involved in the production of each movie

In [14]:
#since the production_countries column still have all countries seperated by 
# ',' as string. so we can directly split it into list using ',' as delimiter as in above and apply str.len function on the same so get no of countries.

df_movies['prod_countries_count'] = df_movies['production_countries'].str.split(',').str.len()
df_movies['prod_countries_count']

0       1
1       1
2       1
3       1
4       2
       ..
3289    1
3290    1
3291    1
3292    1
3293    1
Name: prod_countries_count, Length: 18980, dtype: int64

 4. Replace any empty values in the 'lead_prod_country' column with NaN (Not a Number)

In [15]:
#now we have worked with '[', ']', ',' and more than one countries. the rows that have empty value i.e. which has no country mentioned we need to replace those empty rows with NaN. 
#use np.nan fucntion to do the same fron numpy library
#Handling missing data with NaN is important for accurate data analysis and prevents errors in downstream processing
df_movies['lead_prod_country'] = df_movies['lead_prod_country'].replace('', np.nan)
df_movies['lead_prod_country']

0        US
1        US
2        US
3        GB
4        GB
       ... 
3289     PR
3290     PA
3291    NaN
3292    NaN
3293     US
Name: lead_prod_country, Length: 18980, dtype: object

**Working with genres column**

In [16]:
df_movies['genres']

0                                 ['documentation']
1                                ['drama', 'crime']
2       ['drama', 'action', 'thriller', 'european']
3                   ['fantasy', 'action', 'comedy']
4                                 ['war', 'action']
                           ...                     
3289                           ['romance', 'music']
3290                                     ['comedy']
3291                                     ['comedy']
3292                                     ['comedy']
3293                              ['documentation']
Name: genres, Length: 18980, dtype: object

In [17]:
#1. Remove unwanted characters from the 'genres' column
df_movies['genres'] = df_movies['genres'].str.replace(r"\[",'', regex=True).str.replace(r"'",'', regex=True).str.replace(r"\]", '',regex=True)
df_movies['genres']

0                           documentation
1                            drama, crime
2       drama, action, thriller, european
3                 fantasy, action, comedy
4                             war, action
                      ...                
3289                       romance, music
3290                               comedy
3291                               comedy
3292                               comedy
3293                        documentation
Name: genres, Length: 18980, dtype: object

In [18]:
#2. Extract the first genre from the cleaned 'genres' column
df_movies['main_genres'] = df_movies['genres'].str.split(',').str[0]

#3. Replace any empty values in the 'main_genre' column with NaN (Not a Number)
df_movies['lead_prod_country'] = df_movies['lead_prod_country'].replace('', np.nan)
df_movies['main_genres']

0       documentation
1               drama
2               drama
3             fantasy
4                 war
            ...      
3289          romance
3290           comedy
3291           comedy
3292           comedy
3293    documentation
Name: main_genres, Length: 18980, dtype: object

remove unwanted column from df_movies. 

In [19]:
#here we have already worked with genres and production_countries column and we do not need them, so we can drop them permanently
df_movies.drop(['genres', 'production_countries'], axis=1, inplace=True)

In [20]:
df_movies.columns

Index(['id', 'title', 'type', 'release_year', 'runtime', 'seasons', 'imdb_id',
       'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score',
       'lead_prod_country', 'prod_countries_count', 'main_genres'],
      dtype='object')

**Dealing with missing values in our data - DROP MISSING VALUES**

In [21]:
#we can see the total movies and features
df_movies.shape

(18980, 14)

In [22]:
#let's check number of missing values
df_movies.isnull().sum()

id                          0
title                       1
type                        0
release_year                0
runtime                     0
seasons                 14772
imdb_id                  1394
imdb_score               1873
imdb_votes               1910
tmdb_popularity           670
tmdb_score               2656
lead_prod_country        1160
prod_countries_count        0
main_genres                 0
dtype: int64

In [23]:
# Drop rows with any missing values to clean the dataset
df_movies.dropna(inplace=True)

# Set the 'title' column as the DataFrame index
df_movies.set_index('title', inplace=True)

# Drop the 'id' and 'imdb_id' columns as they are not needed for further analysis
df_movies.drop(['id', 'imdb_id'], axis=1, inplace=True)

In [24]:
df_movies.head()

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_count,main_genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.5,US,1,scifi
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.5,GB,1,animation
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.0,US,1,family


### Encoding Categorical Features

In [25]:
#here few columns like type, lead_prod_country and main_genres have values in string format. we do encoding to convert it into numerical data

#1. Create dummy variables for categorical columns ('type', 'lead_prod_country', 'main_genre')

dummies = pd.get_dummies(df_movies[['type', 'lead_prod_country', 'main_genres']], drop_first=True)

#here this will create seperate column for all different values for a perticular column and put it as 0 or 1 depending whether value is present or not. e.g. type have 2 values show and movie. so 2 columns are made for these 2 values. and now at each column it is written as 0 or 1 indicating if the value is present or not.
dummies.astype(int)


Unnamed: 0_level_0,lead_prod_country_AR,lead_prod_country_AT,lead_prod_country_AU,lead_prod_country_BE,lead_prod_country_BR,lead_prod_country_CA,lead_prod_country_CH,lead_prod_country_CL,lead_prod_country_CN,lead_prod_country_CO,...,main_genres_history,main_genres_horror,main_genres_music,main_genres_reality,main_genres_romance,main_genres_scifi,main_genres_sport,main_genres_thriller,main_genres_war,main_genres_western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Monty Python's Flying Circus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Seinfeld,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Knight Rider,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Thomas & Friends,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Saved by the Bell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Level Playing Field,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Os Ausentes,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Through Our Eyes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sweet Life: Los Angeles,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [26]:
#2. Concatenate the dummy variables with the original DataFrame i.e. concat the above columns with df_movies columns. 
#we create a new data after this step as df_movies_dum
df_movies_dum = pd.concat([df_movies, dummies], axis=1)

In [27]:
#3. Drop the original categorical columns after creating dummy variables since now these column are of no use as they are in string format
df_movies_dum.drop(['type', 'lead_prod_country', 'main_genres'], axis=1, inplace=True)

In [28]:
df_movies_dum

Unnamed: 0_level_0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_countries_count,lead_prod_country_AR,lead_prod_country_AT,...,main_genres_history,main_genres_horror,main_genres_music,main_genres_reality,main_genres_romance,main_genres_scifi,main_genres_sport,main_genres_thriller,main_genres_war,main_genres_western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Monty Python's Flying Circus,1969,30,4.0,8.8,73424.0,17.617,8.306,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Seinfeld,1989,24,9.0,8.9,308824.0,130.213,8.301,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Knight Rider,1982,51,4.0,6.9,34115.0,50.267,7.500,1,False,False,...,False,False,False,False,False,True,False,False,False,False
Thomas & Friends,1984,10,24.0,6.5,5104.0,42.196,6.500,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Saved by the Bell,1989,23,5.0,7.1,35034.0,19.855,8.000,1,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Level Playing Field,2021,26,1.0,5.5,60.0,4.595,5.000,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Os Ausentes,2021,46,1.0,5.9,59.0,4.624,10.000,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Through Our Eyes,2021,33,1.0,6.1,38.0,0.840,1.000,1,False,False,...,False,False,False,False,False,False,False,False,False,False
Sweet Life: Los Angeles,2021,34,2.0,4.0,137.0,2.579,5.500,1,False,False,...,False,False,False,True,False,False,False,False,False,False


### Scaling (Minmaxscaler)

In [29]:
#their are some of the the columns in our dataset which have continuous values (like imdb_votes column) and some of these values are large where some small and model gives preferece to large ones. We have to convert these values into 0 and 1 range using minmaxscalar algo so that model gives equal importance to all values. 

# Apply MinMaxScaler to scale the data for model training
scaler = MinMaxScaler() #create object for minmaxscalar library

df_scaled = scaler.fit_transform(df_movies_dum) #new data created

#data created is in numpy array form so convert it back to dataframe form
df_scaled = pd.DataFrame(df_scaled, columns=df_movies_dum.columns)

# Display the scaled DataFrame

df_scaled

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_countries_count,lead_prod_country_AR,lead_prod_country_AT,...,main_genres_history,main_genres_horror,main_genres_music,main_genres_reality,main_genres_romance,main_genres_scifi,main_genres_sport,main_genres_thriller,main_genres_war,main_genres_western
0,0.397727,0.168539,0.058824,0.9125,0.037009,0.007913,0.815870,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.625000,0.134831,0.156863,0.9250,0.155671,0.058490,0.815326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.545455,0.286517,0.058824,0.6750,0.017194,0.022579,0.728261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.568182,0.056180,0.450980,0.6250,0.002570,0.018954,0.619565,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625000,0.129213,0.078431,0.7000,0.017658,0.008919,0.782609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3296,0.988636,0.146067,0.000000,0.5000,0.000028,0.002064,0.456522,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3297,0.988636,0.258427,0.000000,0.5500,0.000027,0.002077,1.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3298,0.988636,0.185393,0.000000,0.5750,0.000017,0.000377,0.021739,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3299,0.988636,0.191011,0.019608,0.3125,0.000067,0.001158,0.510870,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 4: DBSCAM

run a loop to get best epsilon and minpoints

In [30]:
# Define the range of epsilon (eps) and minimum samples (min_samples) parameters for DBSCAN

eps_array = [0.2, 0.5, 1]  # List of different epsilon values 
min_samples_array = [5, 10, 30]  # List of different min_samples values 

In [31]:

# Iterate over each combination of eps and min_samples
for eps in eps_array:
    for min_samples in min_samples_array:
        # Initialize and fit the DBSCAN model with the current parameters
        #we pass the eps and min_samples value from loop as parameter into this function
        clusterer = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)
        
        # Retrieve all the cluster using 'labels_' parameter from the fitted model
        cluster_labels = clusterer.labels_
        
        # Check if the algorithm found only one cluster or marked all points as noise (-1 label for noise)
        if len(set(cluster_labels)) == 1:
            continue  # Skip this combination as it does not provide meaningful clusters
        
        #if more than on cluster is generated i.e. no noise proper cluster formed then calculate silhoutte score
        # Calculate the silhouette score to evaluate the quality of the clustering
        silhouette_avg = silhouette_score(df_scaled, cluster_labels)
        
        # Print the current parameters, number of clusters, and the silhouette score
        print("For eps =", eps,
              "For min_samples =", min_samples,
              "Count clusters =", len(set(cluster_labels)), #to get unique clusters only use set and to calculate the no of clusters use len function
              "The average silhouette_score is :", silhouette_avg)


For eps = 0.2 For min_samples = 5 Count clusters = 75 The average silhouette_score is : 0.47550532170236703
For eps = 0.2 For min_samples = 10 Count clusters = 37 The average silhouette_score is : 0.3993967985760923
For eps = 0.2 For min_samples = 30 Count clusters = 17 The average silhouette_score is : 0.26622393639424324
For eps = 0.5 For min_samples = 5 Count clusters = 92 The average silhouette_score is : 0.6270549336467924
For eps = 0.5 For min_samples = 10 Count clusters = 56 The average silhouette_score is : 0.5839214397660182
For eps = 0.5 For min_samples = 30 Count clusters = 21 The average silhouette_score is : 0.40484722413607616
For eps = 1 For min_samples = 5 Count clusters = 94 The average silhouette_score is : 0.6344598661713358
For eps = 1 For min_samples = 10 Count clusters = 57 The average silhouette_score is : 0.5908442378003532
For eps = 1 For min_samples = 30 Count clusters = 22 The average silhouette_score is : 0.41532041249606516


### DBSCAN with best hypterparameters (eps = 1, minpoints = 5)

In [32]:
#best values are the one which give us maximum value for silhauette score
dbscan_model = DBSCAN(eps=1, min_samples=5).fit(df_scaled)
print("For eps =", 1,
      "For min_samples =", 5,
      "Count clusters =", len(set(dbscan_model.labels_)),
      "The average silhouette_score is :", silhouette_score(df_scaled, dbscan_model.labels_))

For eps = 1 For min_samples = 5 Count clusters = 94 The average silhouette_score is : 0.6344598661713358


save clusters for recommendations

In [33]:
#create a new column in df_movies as dbscan_clusters which will store the cluster to which the particular movie belong to. -1 means the movie belong to noise point (considered as outlier)
df_movies['dbscan_clusters'] = dbscan_model.labels_
df_movies

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_count,main_genres,dbscan_clusters
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy,0
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy,1
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.500,US,1,scifi,2
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.500,GB,1,animation,3
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.000,US,1,family,4
...,...,...,...,...,...,...,...,...,...,...,...,...
Level Playing Field,SHOW,2021,26,1.0,5.5,60.0,4.595,5.000,US,1,documentation,35
Os Ausentes,SHOW,2021,46,1.0,5.9,59.0,4.624,10.000,BR,1,action,-1
Through Our Eyes,SHOW,2021,33,1.0,6.1,38.0,0.840,1.000,US,1,documentation,35
Sweet Life: Los Angeles,SHOW,2021,34,2.0,4.0,137.0,2.579,5.500,US,1,reality,5


# Step 5: Movie Recommendation Function

since now our data is ready and we can use the clustring result to recommend movies. 

In [34]:
import random

#fucntion will take movie name as argument
def recommend_movie(movie_name: str):
    # Convert the input movie name to lowercase for case-insensitive matching
    movie_name = movie_name.lower()

    # Create a new column 'name' with lowercase movie names for comparison
    df_movies['name'] = df_movies.index.str.lower()

    # Find the movie that matches the input name
    movie = df_movies[df_movies['name'].str.contains(movie_name, na=False)]

    if not movie.empty:
        # Get the cluster label of the input movie. i.e. match the cluster value of input movie and the movies that match the input movies.
        cluster = movie['dbscan_clusters'].values[0]

        # Get all movies in the same cluster
        cluster_movies = df_movies[df_movies['dbscan_clusters'] == cluster]

        # If there are more than 5 movies in the cluster, randomly select 5
        #if len(cluster_movies) >= 5:
            #recommended_movies = random.sample(list(cluster_movies.index), 5)
        #else:
            # If fewer than 5, return all the movies in the cluster
        recommended_movies = list(cluster_movies.index)

        # Print the recommended movies
        print('--- We can recommend you these movies ---')
        for m in recommended_movies:
            print(m)
    else:
        print('Movie not found in the database.')

now we can pass a movie and see the recommendation result by our model

In [35]:
s = input("Enter movie name")
print("\n\n")
recommend_movie(s)




--- We can recommend you these movies ---
Stargate SG-1
My Babysitter's a Vampire
Travelers
Dark Matter
Between
Creeped Out
Tales from the Cryptkeeper
Todd and the Book of Pure Evil
Strange Days at Blake Holsey High
Class of the Titans
Chaotic
Jane and the Dragon
Kid vs. Kat
The Future Is Wild
Di-Gata Defenders
Xiaolin Chronicles
Annedroids
Mortal Kombat: Legacy


# Streamlit APP 

In [36]:
#save df_movies dataset
df_movies.to_csv("clustered_movies.csv", index = False)