In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


In [5]:
# Loading the bad film data
bad_film_data = pd.read_csv('BadFilmData.csv')
bad_film_data

Unnamed: 0,id,title,genres,averageRating,numVotes,releaseYear
0,tt5988370,Reis,"Biography, Drama",1.0,74166,2017
1,tt7221896,Cumali Ceber: Allah Seni Alsin,Comedy,1.0,39466,2017
2,tt13423846,321 Action,Drama,1.0,10215,2020
3,tt20255706,The Pogmentary,"Biography, Documentary, Sport",1.1,13805,2022
4,tt21403988,The Last Pharaoh,"Documentary, Drama, History",1.1,12019,2023
...,...,...,...,...,...,...
995,tt4334266,The Bad Batch,"Action, Horror, Mystery",5.2,36189,2016
996,tt1712170,Alex Cross,"Action, Crime, Mystery",5.2,35725,2012
997,tt0463872,Asterix at the Olympic Games,"Adventure, Comedy, Family",5.2,33127,2008
998,tt1731697,The Lords of Salem,"Horror, Thriller",5.2,32998,2012


In [6]:
# Summary of the dataset
bad_film_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1000 non-null   object 
 1   title          1000 non-null   object 
 2   genres         1000 non-null   object 
 3   averageRating  1000 non-null   float64
 4   numVotes       1000 non-null   int64  
 5   releaseYear    1000 non-null   int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 47.0+ KB


In [7]:
#Checking for columns with null/NaN values
bad_film_data.isna().sum()

id               0
title            0
genres           0
averageRating    0
numVotes         0
releaseYear      0
dtype: int64

Getting rid of the unneeded id column

In [8]:
bad_film_data.drop(['id'],axis=1,inplace=True)
bad_film_data['averageRating']=(bad_film_data['averageRating']*10)
bad_film_data

Unnamed: 0,title,genres,averageRating,numVotes,releaseYear
0,Reis,"Biography, Drama",10.0,74166,2017
1,Cumali Ceber: Allah Seni Alsin,Comedy,10.0,39466,2017
2,321 Action,Drama,10.0,10215,2020
3,The Pogmentary,"Biography, Documentary, Sport",11.0,13805,2022
4,The Last Pharaoh,"Documentary, Drama, History",11.0,12019,2023
...,...,...,...,...,...
995,The Bad Batch,"Action, Horror, Mystery",52.0,36189,2016
996,Alex Cross,"Action, Crime, Mystery",52.0,35725,2012
997,Asterix at the Olympic Games,"Adventure, Comedy, Family",52.0,33127,2008
998,The Lords of Salem,"Horror, Thriller",52.0,32998,2012


In [9]:
print(bad_film_data['genres'].value_counts())

Comedy                         46
Action, Adventure, Comedy      42
Horror, Mystery, Thriller      40
Horror, Thriller               34
Comedy, Romance                28
                               ..
Action, Adventure               1
Drama, War                      1
Comedy, Reality-TV, Romance     1
Action, Comedy, Music           1
Comedy, Drama, Fantasy          1
Name: genres, Length: 194, dtype: int64


In [10]:
# Looking for a comedy film genre to analyze
ComedyFilm=bad_film_data
ComedyFilm['Indexes']=ComedyFilm['genres'].str.find("Comedy")
ComedyFilm

Unnamed: 0,title,genres,averageRating,numVotes,releaseYear,Indexes
0,Reis,"Biography, Drama",10.0,74166,2017,-1
1,Cumali Ceber: Allah Seni Alsin,Comedy,10.0,39466,2017,0
2,321 Action,Drama,10.0,10215,2020,-1
3,The Pogmentary,"Biography, Documentary, Sport",11.0,13805,2022,-1
4,The Last Pharaoh,"Documentary, Drama, History",11.0,12019,2023,-1
...,...,...,...,...,...,...
995,The Bad Batch,"Action, Horror, Mystery",52.0,36189,2016,-1
996,Alex Cross,"Action, Crime, Mystery",52.0,35725,2012,-1
997,Asterix at the Olympic Games,"Adventure, Comedy, Family",52.0,33127,2008,11
998,The Lords of Salem,"Horror, Thriller",52.0,32998,2012,-1


In [11]:
isComedy=ComedyFilm[ComedyFilm['Indexes'] > 0]
isComedy

Unnamed: 0,title,genres,averageRating,numVotes,releaseYear,Indexes
19,Turks in Space,"Action, Comedy, Sci-Fi",15.0,16829,2006,8
20,Velma,"Adventure, Animation, Comedy",16.0,79537,2023,22
28,Santa Inc.,"Animation, Comedy, Fantasy",17.0,15623,2021,11
42,Student of the Year 2,"Action, Comedy, Drama",22.0,22952,2019,8
49,Heropanti 2,"Action, Comedy, Crime",23.0,27732,2022,8
...,...,...,...,...,...,...
965,Miss Congeniality 2: Armed & Fabulous,"Action, Comedy, Crime",52.0,81428,2005,8
978,Cats & Dogs,"Action, Adventure, Comedy",52.0,63310,2001,19
988,The Medallion,"Action, Comedy, Fantasy",52.0,44504,2003,8
993,Beast,"Action, Comedy, Crime",52.0,37361,2022,8


### Separating Features and Target Variable

In [12]:
x=isComedy.iloc[:,3:-1]
y=isComedy.iloc[:,2]


In [13]:
x

Unnamed: 0,numVotes,releaseYear
19,16829,2006
20,79537,2023
28,15623,2021
42,22952,2019
49,27732,2022
...,...,...
965,81428,2005
978,63310,2001
988,44504,2003
993,37361,2022


In [14]:
y

19     15.0
20     16.0
28     17.0
42     22.0
49     23.0
       ... 
965    52.0
978    52.0
988    52.0
993    52.0
997    52.0
Name: averageRating, Length: 161, dtype: float64

In [15]:
# Splitting the data into training and test sets
x_train , x_test , y_train,y_test=train_test_split(x, y, test_size = 0.2, shuffle = True, random_state = 0)

In [16]:
K=9
knn=KNeighborsClassifier(K)
knn.fit(x_train, y_train)
yPredSklearn=knn.predict(x_test)
print(yPredSklearn)

[46. 46. 46. 46. 46. 27. 46. 40. 16. 46. 52. 32. 32. 32. 48. 47. 40. 51.
 40. 43. 46. 51. 41. 51. 46. 48. 46. 47. 48. 16. 46. 40. 46.]


The accuracy changes in a wave-like pattern as we try different values of k. For example, when 𝑘=3 and k=17, the accuracy is 6.06%. This pattern might repeat if the test size is set to 0.
With a test size of 0.2, accuracy hits roughly about 6% every couple of steps. Testing k=9 with test sizes from 0.1 to 0.9 gives these accuracy results: 0.00, 0.03, 0.02, 0.06, 0.03, 0.04, 0.05, 0.04, 0.02. Accuracy goes up and down, but never goes higher than ~6%.

In [17]:
accuracy_score(y_test, yPredSklearn)

0.030303030303030304

# Goals
1. Finding the Score for a "Bad" Comedy: Use predictions to figure out the average score that makes a comedy film "bad."
2. Predicting Future Bad Comedy Scores: Use the average number of votes and release year to guess the future scores of bad comedy films.

K-fold Testing

In [18]:
X=bad_film_data.iloc[:,4]
Y=bad_film_data.iloc[:,2]

In [19]:
X

0      2017
1      2017
2      2020
3      2022
4      2023
       ... 
995    2016
996    2012
997    2008
998    2012
999    2024
Name: releaseYear, Length: 1000, dtype: int64

In [20]:
Y

0      10.0
1      10.0
2      10.0
3      11.0
4      11.0
       ... 
995    52.0
996    52.0
997    52.0
998    52.0
999    52.0
Name: averageRating, Length: 1000, dtype: float64

In [21]:
KF=KFold(n_splits=3, shuffle=True, random_state=42)

In [22]:
k_neighbors = 3
accuracies = []

for train_index, test_index in KF.split(x):
    X_train, Y_test = X[train_index], Y[test_index]
    X_train, Y_test = X[train_index], Y[test_index]
    
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(x_train, y_train)
    
    y_pred = knn.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print(f"Accuracies for each fold: {accuracies}")
print(f"Average Accuracy across all folds: {average_accuracy:.2f}")

Accuracies for each fold: [0.06060606060606061, 0.06060606060606061, 0.06060606060606061]
Average Accuracy across all folds: 0.06
