In [101]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [102]:
# Load the tab-separated title ratings table and preview its first rows.
title_ratings_df = pd.read_csv(
    './test-data/title.ratings.tsv',
    sep='\t',
)
title_ratings_df.head()


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,263
2,tt0000003,6.5,1808
3,tt0000004,5.6,178
4,tt0000005,6.2,2605


In [103]:
# Load only the columns needed downstream from the tab-separated basics table.
# The literal marker `\N` in the file is read as a missing value (NaN).
title_basics_df = pd.read_csv(
    './test-data/title.basics.tsv',
    sep='\t',
    usecols=['tconst', 'genres', 'titleType', 'startYear'],
    na_values='\\N',
)
title_basics_df.head()

Unnamed: 0,tconst,titleType,startYear,genres
0,tt0000001,short,1894.0,"Documentary,Short"
1,tt0000002,short,1892.0,"Animation,Short"
2,tt0000003,short,1892.0,"Animation,Comedy,Romance"
3,tt0000004,short,1892.0,"Animation,Short"
4,tt0000005,short,1893.0,"Comedy,Short"


In [104]:
# Join ratings with title metadata on the shared title id. An inner join
# (the pandas default) keeps only titles present in both tables.
merged_df = title_ratings_df.merge(title_basics_df, how='inner', on='tconst')
merged_df.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,startYear,genres
0,tt0000001,5.7,1965,short,1894.0,"Documentary,Short"
1,tt0000002,5.8,263,short,1892.0,"Animation,Short"
2,tt0000003,6.5,1808,short,1892.0,"Animation,Comedy,Romance"
3,tt0000004,5.6,178,short,1892.0,"Animation,Short"
4,tt0000005,6.2,2605,short,1893.0,"Comedy,Short"


In [105]:
# Keep only rows that are movies and have a non-missing genres value.
# Note: this rebinds merged_df to the filtered frame.
merged_df = merged_df.loc[
    merged_df['genres'].notna() & merged_df['titleType'].eq('movie')
]
merged_df.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,startYear,genres
8,tt0000009,5.3,205,movie,1894.0,Romance
144,tt0000147,5.3,469,movie,1897.0,"Documentary,News,Sport"
358,tt0000574,6.0,823,movie,1906.0,"Action,Adventure,Biography"
366,tt0000591,4.4,20,movie,1907.0,Drama
380,tt0000615,4.3,24,movie,1907.0,Drama


In [106]:
# Collapse to one row per (genres, startYear) pair holding the mean rating.
# Note: the result overwrites merged_df; an earlier draft stored it as
# `genre_year_ratings`, which the disabled pivot cell further down still
# refers to.
merged_df = (
    merged_df
    .groupby(['genres', 'startYear'])['averageRating']
    .agg('mean')
    .reset_index()
)
merged_df.head()


Unnamed: 0,genres,startYear,averageRating
0,Action,1914.0,6.566667
1,Action,1916.0,5.45
2,Action,1919.0,4.5
3,Action,1922.0,5.2
4,Action,1925.0,7.8


In [107]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct value of the genres column to an integer class id.
# The whole string is the label, so a comma-joined combination is its own
# class rather than being split into individual genres.
genre_encoder = LabelEncoder().fit(merged_df["genres"])
merged_df["genre_code"] = genre_encoder.transform(merged_df["genres"])
merged_df.head()


Unnamed: 0,genres,startYear,averageRating,genre_code
0,Action,1914.0,6.566667,0
1,Action,1916.0,5.45,0
2,Action,1919.0,4.5,0
3,Action,1922.0,5.2,0
4,Action,1925.0,7.8,0


In [108]:
# pivoted_df = genre_year_ratings.pivot(index='startYear', columns='genres', values='averageRating').fillna(0)


In [109]:
# X = pivoted_df.iloc[:, :-1].values
# y = pivoted_df.iloc[:, -1].values

In [110]:
# bins = [0, 3, 6, 9, 10]

# # Create a new column in the dataframe with the ratings binned
# merged_df['rating_category'] = pd.cut(merged_df['averageRating'], bins, labels=['low', 'medium', 'high', 'very high'])


In [111]:
# Target: the encoded genre label. Features: year and mean rating.
y = merged_df['genre_code']
X = merged_df.loc[:, ['startYear', 'averageRating']]

In [112]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# NOTE(review): X_test / y_test are not used in the cells visible here —
# confirm whether an evaluation step is missing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [113]:
# Fit a 5-nearest-neighbours classifier on the training split; fit() returns
# the estimator itself, so `knn` is the fitted model.
# NOTE(review): k-NN is distance-based and the two features (year, rating)
# are on different scales — consider standardising them; confirm before changing.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn

In [122]:
# Query point: the year and average rating to classify.
year = 2023
avg_rating = 6.7

# Build the query as a one-row DataFrame with the same columns (and order) as
# the feature frame X. The model was fitted on a DataFrame, and passing a bare
# nested list makes scikit-learn warn that the input lacks valid feature names.
query_df = pd.DataFrame([[year, avg_rating]], columns=X.columns)
genre_code = knn.predict(query_df)[0]

# Translate the integer class id back into the original genres string.
genre = genre_encoder.inverse_transform([genre_code])[0]

print(f"The predicted genre for the year {year} with an average rating of {avg_rating} is {genre}.")

The predicted genre for the year 2023 with an average rating of 6.7 is Action,Mystery,Thriller.


