# Rating prediction

In [52]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import duckdb

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [53]:
# create a connection to a file called 'file.db'
con = duckdb.connect('imdb_reviews.db')

In [54]:
con.execute("DESCRIBE").fetchdf()

Unnamed: 0,table_name,column_names,column_types,temporary
0,imdb_review_test,"[primaryTitle, originalTitle, startYear, endYe...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
1,imdb_review_train,"[primaryTitle, originalTitle, startYear, endYe...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, BIGINT, D...",False
2,imdb_review_validation,"[primaryTitle, originalTitle, startYear, endYe...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
3,movie_writer,"[movie, writer]","[VARCHAR, VARCHAR]",False
4,profile,"[movie, director]","[VARCHAR, VARCHAR]",False


In [55]:
df = con.execute("SELECT runtimeMinutes, numVotes, label FROM imdb_review_train").fetchdf()
df.head()

Unnamed: 0,runtimeMinutes,numVotes,label
0,91,13679.0,True
1,143,2178.0,True
2,65,10911.0,True
3,63,4312.0,True
4,67,87784.0,True


In [56]:
# Step 0: Prepare dataset and remove rows with NaN values (not possible with decision tree model)
df_no_nan = df.dropna()

# Step 1: Define explanatory and target variables
X = df_no_nan[['numVotes', 'runtimeMinutes']]
y = df_no_nan['label']

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.25, random_state=0)

# Step 3: Normalize the data for numerical stability
ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

# Step 4: Fit a decision tree model to the training data
clf = DecisionTreeClassifier()

# Step 5: Make predictions on the testing data
clf = clf.fit(X_train,y_train)

# Step 6: Calculate the accuracy score by comparing the actual values and predicted values.
predictions = clf.predict(X_test)

cm = confusion_matrix(y_test, predictions)

TN, FP, FN, TP = confusion_matrix(y_test, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

True Positive(TP)  =  512
False Positive(FP) =  270
True Negative(TN)  =  599
False Negative(FN) =  409


In [57]:
df_validation = con.execute("SELECT runtimeMinutes, numVotes FROM imdb_review_validation").fetchdf()
# fill all NaN with 0, THIS NEEDS TO BE CHANGED!! But the model does not accept NaN values
df_validation = df_validation.fillna(0)

ss_train = StandardScaler()
df_validation = ss_train.fit_transform(df_validation)

predictions_validation = clf.predict(df_validation)

predictions_validation

pd.DataFrame(predictions_validation).to_csv('validation_results.csv', index=False, header=False)

In [58]:
df_test = con.execute("SELECT runtimeMinutes, numVotes FROM imdb_review_test").fetchdf()
# fill all NaN with 0, THIS NEEDS TO BE CHANGED!! But the model does not accept NaN values
df_test = df_test.fillna(0)

ss_train = StandardScaler()
df_test = ss_train.fit_transform(df_test)

predictions_test = clf.predict(df_test)

pd.DataFrame(predictions_test).to_csv('test_results.csv', index=False, header=False)

In [59]:
con.close()