# Vote Average Prediction

### Predict a movie's average rating (the `vote_average` column) using a random forest regressor.

### Import libraries

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


### Reading a cleaned dataset

In [12]:
# Load the cleaned TMDB movie dataset (path is relative to the notebook's working directory).
df = pd.read_csv('../data/cleaned/TMDB_movie_dataset_cleaned.csv')

# A bare expression is only rendered when it is the last statement of a cell,
# so the preview has to go through display() to actually show up.
display(df.head())

# info() prints its summary (columns, non-null counts, dtypes) directly to stdout.
df.info()

# Last expression of the cell: rendered as the summary-statistics table.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7060 entries, 0 to 7059
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    7060 non-null   int64  
 1   title                 7060 non-null   object 
 2   vote_average          7060 non-null   float64
 3   vote_count            7060 non-null   int64  
 4   status                7060 non-null   object 
 5   release_date          7060 non-null   object 
 6   revenue               7060 non-null   float64
 7   runtime               7060 non-null   int64  
 8   adult                 7060 non-null   bool   
 9   budget                7060 non-null   float64
 10  original_language     7060 non-null   object 
 11  popularity            7060 non-null   float64
 12  genres                7060 non-null   object 
 13  production_companies  7060 non-null   object 
 14  production_countries  7060 non-null   object 
 15  spoken_languages     

Unnamed: 0,id,vote_average,vote_count,revenue,runtime,budget,popularity,companies_count,release_year,release_month,release_day_of_week
count,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0,7060.0
mean,121201.5,6.540372,2153.80949,91720540.0,109.73102,31705440.0,25.393574,3.543059,2001.889235,7.012323,3.156232
std,191688.0,0.793936,3352.688912,178935800.0,20.685463,41892990.0,70.942102,2.386031,16.482109,3.384672,1.240843
min,5.0,1.908,101.0,1.0,5.0,5.0,0.6,0.0,1915.0,1.0,0.0
25%,9387.75,6.031,335.0,8339707.0,95.0,6000000.0,12.334,2.0,1995.0,4.0,2.0
50%,16284.5,6.566,917.0,30111690.0,106.0,17000000.0,16.646,3.0,2006.0,7.0,3.0
75%,164341.2,7.107,2387.5,94586700.0,120.0,40000000.0,24.8145,5.0,2014.0,10.0,4.0
max,1040148.0,8.707,34495.0,2923706000.0,339.0,460000000.0,2994.357,30.0,2023.0,12.0,6.0


### Feature and target variable selection

In [13]:
# Numeric predictors fed to the model.
numerical_features = [
    'budget', 'revenue', 'runtime', 'popularity', 'vote_count', 'companies_count',
    'release_year', 'release_month', 'release_day_of_week',
]

# Categorical predictors fed to the model.
categorical_features = ['original_language', 'adult']

# Take an explicit copy: X is modified in a later cell, and assigning into a
# slice of `df` would raise SettingWithCopyWarning (and could touch `df` itself).
X = df[numerical_features + categorical_features].copy()

# Regression target: the movie's average rating.
y = df['vote_average']


### Handling missing values and train/test split

In [15]:
# Split first so that the imputation statistics are learned from the training
# rows only; computing them on the full frame would leak test-set information.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-column fill values: median for numeric columns, most frequent value for
# categorical columns.
fill_values = {
    **X_train[numerical_features].median().to_dict(),
    **X_train[categorical_features].mode().iloc[0].to_dict(),
}

# fillna returns new frames, so nothing is written into a slice of another frame.
# The same training-derived values are applied to both splits.
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

### Creating the preprocessing transformers

In [16]:
# Numeric columns are standardized.
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


### Building a pipeline with a model

In [17]:
# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Preprocessing and the regressor are chained so both are fitted together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model),
])

### Model training

In [18]:
# Fit the preprocessing steps and the regressor on the training split.
pipeline.fit(X=X_train, y=y_train)

### Prediction

In [19]:
# Predict ratings for the held-out test split.
y_pred = pipeline.predict(X=X_test)

### Model evaluation

In [20]:
# Coefficient of determination on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# RMSE is the square root of the mean squared error, so it is expressed
# in the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [21]:
# Report both metrics, one per line, rounded to three decimals.
print(f"RMSE: {rmse:.3f}", f"R2 Score: {r2:.3f}", sep="\n")

RMSE: 0.540
R2 Score: 0.520
