In [None]:
# Importing necessary libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the movie dataset straight from GitHub into `df`, then preview its first five rows.
DATA_URL = 'https://github.com/ArinB/MSBA-CA-Data/raw/main/CA05/movies_recommendation_data.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
0,58,The Imitation Game,8.0,1,1,1,0,0,0,0,0
1,8,Ex Machina,7.7,0,1,0,0,0,1,0,0
2,46,A Beautiful Mind,8.2,1,1,0,0,0,0,0,0
3,62,Good Will Hunting,8.3,0,1,0,0,0,0,0,0
4,97,Forrest Gump,8.8,0,1,0,0,0,0,0,0


In [None]:
# Print the column names, dtypes and non-null counts of `df`.
# Used here as a missing-value check: in the output below every column reports
# 30 non-null values for the 30 rows, so there is nothing to impute.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie ID     30 non-null     int64  
 1   Movie Name   30 non-null     object 
 2   IMDB Rating  30 non-null     float64
 3   Biography    30 non-null     int64  
 4   Drama        30 non-null     int64  
 5   Thriller     30 non-null     int64  
 6   Comedy       30 non-null     int64  
 7   Crime        30 non-null     int64  
 8   Mystery      30 non-null     int64  
 9   History      30 non-null     int64  
 10  Label        30 non-null     int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 2.7+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the text column 'Movie Name' does not appear in the output.
# Worth noting from the output below: 'Label' is 0 for every row (mean, min and max are all 0),
# and the genre columns are 0/1 flags, so their mean is the share of movies in that genre.
df.describe()

Unnamed: 0,Movie ID,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,48.133333,7.696667,0.233333,0.6,0.1,0.1,0.133333,0.1,0.1,0.0
std,29.288969,0.666169,0.430183,0.498273,0.305129,0.305129,0.345746,0.305129,0.305129,0.0
min,1.0,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.75,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.5,7.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,64.25,8.175,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,98.0,8.8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## Exploratory Data Analysis (EDA)

In [None]:
# Histogram of the IMDB ratings (one bar per rating bin, bar height = number of movies).
# The ratings are slightly left-skewed: the mean (about 7.70) sits below the median (7.75)
# and the tail stretches towards the lowest rating of 5.9.
# A title is set so the figure can be understood on its own when the notebook is skimmed.
fig = px.histogram(
    df,
    x='IMDB Rating',
    histfunc='count',
    title='Distribution of IMDB ratings',
)
fig.show()

In [None]:
# Names of the 0/1 genre indicator columns in the dataset, in their column order.
genre_columns = [
    'Biography',
    'Drama',
    'Thriller',
    'Comedy',
    'Crime',
    'Mystery',
    'History',
]

In [None]:
# Bar chart of how many movies have each genre flag set to 0 or 1, one trace per genre,
# with a dropdown that shows either every genre or a single one.
# Drama is by far the most common genre (60% of the movies); every other genre
# applies to only a small minority of the movies.
fig = go.Figure()

# One bar trace per genre; traces are added in the same order as `genre_columns`,
# which is what the visibility masks below rely on.
for genre in genre_columns:
    genre_counts = df[genre].value_counts()
    fig.add_trace(go.Bar(x=genre_counts.index, y=genre_counts.values, name=genre))

# Build the dropdown from `genre_columns` instead of hand-writing one boolean mask per
# genre, so the menu stays correct if a genre is added, removed or reordered.
dropdown_buttons = [
    {
        'label': "All",
        'method': "update",
        'args': [{"visible": [True] * len(genre_columns)}, {"title": "All"}],
    }
]
for selected_genre in genre_columns:
    # Only the trace belonging to the selected genre stays visible.
    visibility_mask = [name == selected_genre for name in genre_columns]
    dropdown_buttons.append(
        {
            'label': selected_genre,
            'method': "update",
            'args': [{"visible": visibility_mask}, {"title": selected_genre}],
        }
    )

# Attach the dropdown. The initial title matches the initially active "All" button,
# and the axes are labelled so the plot can be read without the surrounding code.
fig.update_layout(
    title="All",
    xaxis_title="Genre flag (0 = no, 1 = yes)",
    yaxis_title="Number of movies",
    updatemenus=[{'active': 0, 'buttons': dropdown_buttons}],
)

# Show the plot
fig.show()

## K-Nearest Neighbors (KNN) recommendation

In [None]:
# Fit the k-nearest-neighbours model that is queried further down to find similar movies.
# KNeighborsClassifier is imported in the imports cell at the top of the notebook so that
# all dependencies are visible in one place.

# Features: the IMDB rating plus the seven 0/1 genre flags. The identifier columns
# ('Movie ID', 'Movie Name') and the target ('Label') are left out.
X = df.drop(columns=['Movie ID', 'Label', 'Movie Name'])

# NOTE: 'Label' is 0 for every row of this dataset, so the classifier's predictions carry
# no information. The model is fitted only so that `knn.kneighbors(...)` can be used as a
# nearest-neighbour lookup over the feature rows.
y = df['Label']

# k = 5: each query returns the five closest movies.
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X, y)

In [None]:
# Creating a new dataframe with specific values to predict its label using the trained KNN model
data = {
    'IMDB Rating': [7.2],
    'Biography': ['Yes'],
    'Drama': ['Yes'],
    'Thriller': ['No'],
    'Comedy': ['No'],
    'Crime': ['No'],
    'Mystery': ['No'],
    'History': ['Yes']
}

predict_data = pd.DataFrame(data)
predict_data

Unnamed: 0,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,7.2,Yes,Yes,No,No,No,No,Yes


In [None]:
# Encode the Yes/No answers as the 1/0 flags the model was trained on.
# Each text column is mapped explicitly and cast to int64 rather than relying on
# `DataFrame.replace(..., inplace=True)`: that approach depends on pandas silently
# downcasting object columns (deprecated in recent pandas) and mutates a frame an earlier
# cell already displayed. The cast also fails loudly if an answer is neither 'Yes' nor 'No'.
YES_NO_TO_FLAG = {'Yes': 1, 'No': 0}

# Only the non-numeric columns hold Yes/No answers; 'IMDB Rating' is left untouched.
# On a re-run there are no such columns left, so the cell is idempotent.
answer_columns = predict_data.select_dtypes(exclude='number').columns
predict_data = predict_data.assign(
    **{
        column: predict_data[column].map(YES_NO_TO_FLAG).astype('int64')
        for column in answer_columns
    }
)
predict_data

Unnamed: 0,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History
0,7.2,1,1,0,0,0,0,1


In [None]:
# Using the trained KNN model to find the five closest neighbors to the provided data point
recommend_movie = knn.kneighbors(predict_data)[1]

movie_list = []

for movie in recommend_movie:
    print(df.loc[movie]['Movie Name'])
    rec_movie = df.loc[movie]['Movie Name']
    rec_movie_list = rec_movie.values.tolist()



28    12 Years a Slave
27       Hacksaw Ridge
29      Queen of Katwe
16      The Wind Rises
2     A Beautiful Mind
Name: Movie Name, dtype: object


In [None]:
# Print the recommended movies as a numbered list.
# The count in the heading is taken from the list itself so it cannot disagree with the
# number of neighbours the model actually returned.
print(f"These are the top {len(rec_movie_list)} movies we recommend:")
for i, movie in enumerate(rec_movie_list, start=1):
    print(f"{i}. {movie}")

These are the top 5 movies we recommend:
1. 12 Years a Slave
2. Hacksaw Ridge
3. Queen of Katwe
4. The Wind Rises
5. A Beautiful Mind
