In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class Model(nn.Module):
    def __init__(self, in_features = 28, h1 = 8, h2 = 9, out_features = 2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, out_features)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.out(x)

        return x

In [3]:
df_filipe = pd.read_csv('filipe_films_pandas.csv')
df_letterboxd_dump = pd.read_csv('letterboxd_db_pandas.csv')

In [4]:
df_filipe = df_filipe.drop_duplicates(subset = ['letterboxdId'])
df_letterboxd_dump = df_letterboxd_dump.drop_duplicates(subset = ['letterboxdId'])
filipe_films_set = set(df_filipe['letterboxdId'])
filipe_films_set
print(type(df_letterboxd_dump))
# df_letterboxd_dump.head()

<class 'pandas.core.frame.DataFrame'>


In [5]:
# I'm dropping all the films from filipe's set.
df_letterboxd_dump.shape
for film in filipe_films_set:
   df_letterboxd_dump.drop(df_letterboxd_dump.loc[df_letterboxd_dump['letterboxdId'] == film].index, inplace=True)
df_letterboxd_dump.shape

(11105, 18)

In [6]:
# combining them
df_combined = pd.concat([df_filipe, df_letterboxd_dump])
df_combined.shape

(18733, 18)

In [7]:
# check to make sure they're combined
# j = random.randint(0,18732)
# print(df_combined.iloc[j])

In [8]:
# more cleaning
df_combined = df_combined[df_combined['averageRating'] != 0]
df_combined = df_combined[df_combined['runtime'] != 0]
df_combined = df_combined[df_combined['watchedCount'] != 0]
df_combined = df_combined[df_combined['fansCount'] != 0]
df_combined = df_combined[df_combined['likesCount'] != 0]
df_combined = df_combined[df_combined['reviewsCount'] != 0]
df_combined = df_combined[df_combined['listsCount'] != 0]
# df_combined = df_combined[df_combined['tagline'] != 'No tagline found']
# df_combined = df_combined[df_combined['themes'] != 'No themes found']
df_combined.shape

(15785, 18)

In [9]:
# add new rows for each genre
# you need to create new columns, one for each genre
# you need to loop through each row in the dataframe and turn each value to zero or one based on if its genre is in the genre array
# you need to drop the genre column
df_combined['Action'] = 0
df_combined['Adventure'] = 0
df_combined['Animation'] = 0
df_combined['Comedy'] = 0
df_combined['Crime'] = 0
df_combined['Documentary'] = 0
df_combined['Drama'] = 0
df_combined['Family'] = 0
df_combined['Fantasy'] = 0
df_combined['History'] = 0
df_combined['Horror'] = 0
df_combined['Music'] = 0
df_combined['Mystery'] = 0
df_combined['Romance'] = 0
df_combined['ScienceFiction'] = 0
df_combined['Thriller'] = 0
df_combined['TVMovie'] = 0
df_combined['War'] = 0
df_combined['Western'] = 0
df_combined['Unknown'] = 0
df_combined.shape

for index, row in df_combined.iterrows():
    for genre in row['genre'].strip('{}').split(','):
        df_combined.at[index, genre] = 1

In [10]:
# remove unncessesary features dataset
df_combined = df_combined.drop(columns = ['id','name','letterboxdId','created','updated','themes','tagline','fullSummary', 'genre'])
df_combined = df_combined.drop(df_combined.columns[-1], axis=1)

# standardize values
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df_combined)
df_standardized = pd.DataFrame(df_standardized, columns=df_combined.columns)

X = df_standardized.drop(columns = ['filipeHasWatched'])

y = df_combined['filipeHasWatched']
# convert to numpy arrays
X = X.values
y = y.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, test_size=0.2)

In [11]:
# Instantiate model stuff here
torch.manual_seed(41)
# model = Model()
X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [12]:
# Instantiate KNN here
knn = KNeighborsClassifier(n_neighbors = 500)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



              precision    recall  f1-score   support

           0       0.64      0.91      0.76      1786
           1       0.75      0.34      0.47      1371

    accuracy                           0.66      3157
   macro avg       0.70      0.63      0.61      3157
weighted avg       0.69      0.66      0.63      3157



In [13]:
plt.plot(range(epochs),losses)
plt.xlabel('epochs')
plt.ylabel('losses')
plt.savefig('first_training_loss_results.png', bbox_inches='tight')

NameError: name 'epochs' is not defined