# Random Forest

In [5]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
# Build the dataset path from the current working directory, so the notebook
# expects the CSV to sit next to wherever the kernel was started.
dataset_dir = os.getcwd()
TMDB_filename = os.path.join(dataset_dir, "TMDB_tv_dataset_v3.csv")

# Load the CSV into the DataFrame that the modelling cells below read from.
# NOTE(review): later cells select `*_log`, genre and one-hot columns from `df`;
# confirm they are present in this CSV or created by cells not shown here.
df = pd.read_csv(filepath_or_buffer=TMDB_filename)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV

In [13]:
# Baseline experiment: random forest on the untransformed numeric columns,
# predicting the raw `popularity` column.
feature_columns = ['number_of_seasons', 'number_of_episodes', 'vote_count',
                   'vote_average', 'episode_run_time']
X = df[feature_columns]
y = df['popularity']

# Hold out 20% of the rows for testing; the split is seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree forest, also seeded with random_state=42.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on both splits so the train/test gap can be inspected.
rf_predictions_train = rf.predict(X_train)
rf_predictions_test = rf.predict(X_test)

# Evaluation metrics on the training split.
train_mae = mean_absolute_error(y_train, rf_predictions_train)
train_mse = mean_squared_error(y_train, rf_predictions_train)
train_r2 = r2_score(y_train, rf_predictions_train)

# Evaluation metrics on the held-out test split.
test_mae = mean_absolute_error(y_test, rf_predictions_test)
test_mse = mean_squared_error(y_test, rf_predictions_test)
test_r2 = r2_score(y_test, rf_predictions_test)

# Derive the feature count from the column list so the label cannot drift
# from the columns actually used (five here).
print(f"Results with regular dataset, {len(feature_columns)} numerical features:")

# Train metrics first, then test metrics, each to four decimals.
print(f'Train MAE: {train_mae:.4f}')
print(f'Train MSE: {train_mse:.4f}')
print(f'Train R^2: {train_r2:.4f}')

print(f'Test MAE: {test_mae:.4f}')
print(f'Test MSE: {test_mse:.4f}')
print(f'Test R^2: {test_r2:.4f}')

Results with regular dataset, 4 numerical features:
Train MAE: 1.0953
Train MSE: 5.4330
Train R^2: 0.8242
Test MAE: 1.7127
Test MSE: 12.3445
Test R^2: 0.5991


In [14]:
# Log-transform experiment: random forest on the `*_log` feature columns,
# predicting `popularity_log`.
# NOTE(review): `episode_run_time` is used untransformed here, while the
# one-hot experiment further down selects `episode_run_time_log` — confirm
# which is intended before comparing the two runs.
feature_columns = ['number_of_seasons_log', 'number_of_episodes_log', 'vote_count_log',
                   'vote_average_log', 'episode_run_time']
X = df[feature_columns]
y = df['popularity_log']

# Hold out 20% of the rows for testing; the split is seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree forest, also seeded with random_state=42.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on both splits so the train/test gap can be inspected.
rf_predictions_train = rf.predict(X_train)
rf_predictions_test = rf.predict(X_test)

# Metrics are computed against `popularity_log`, so MAE/MSE are in the units
# of that column and are not directly comparable with errors measured on
# the raw `popularity` target.
train_mae = mean_absolute_error(y_train, rf_predictions_train)
train_mse = mean_squared_error(y_train, rf_predictions_train)
train_r2 = r2_score(y_train, rf_predictions_train)

test_mae = mean_absolute_error(y_test, rf_predictions_test)
test_mse = mean_squared_error(y_test, rf_predictions_test)
test_r2 = r2_score(y_test, rf_predictions_test)

# Derive the feature count from the column list so the label cannot drift
# from the columns actually used (five here).
print(f"Results with Log Transformation, {len(feature_columns)} features:")

# Train metrics first, then test metrics, each to four decimals.
print(f'Train MAE: {train_mae:.4f}')
print(f'Train MSE: {train_mse:.4f}')
print(f'Train R^2: {train_r2:.4f}')

print(f'Test MAE: {test_mae:.4f}')
print(f'Test MSE: {test_mse:.4f}')
print(f'Test R^2: {test_r2:.4f}')

Results with Log Transformation, 4 features:
Train MAE: 0.2227
Train MSE: 0.1292
Train R^2: 0.7903
Test MAE: 0.3047
Test MSE: 0.2261
Test R^2: 0.6313


In [16]:
# Genre experiment: the untransformed numeric columns plus the genre columns,
# predicting the raw `popularity` column.
numeric_cols = [
    'number_of_seasons', 'number_of_episodes', 'vote_count', 'vote_average',
    'episode_run_time',
]
genre_cols = [
    'Action & Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Family', 'History', 'Kids', 'Music', 'Musical', 'Mystery',
    'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk',
    'Unknown', 'War & Politics', 'Western',
]
X = df[numeric_cols + genre_cols]
y = df['popularity']

# 80/20 train/test split, seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the 100-tree forest and fit it in one step (`fit` returns the estimator).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the training rows and for the held-out rows.
rf_predictions_train = rf.predict(X_train)
rf_predictions_test = rf.predict(X_test)

# Training-split metrics.
train_mae = mean_absolute_error(y_true=y_train, y_pred=rf_predictions_train)
train_mse = mean_squared_error(y_true=y_train, y_pred=rf_predictions_train)
train_r2 = r2_score(y_true=y_train, y_pred=rf_predictions_train)

# Test-split metrics.
test_mae = mean_absolute_error(y_true=y_test, y_pred=rf_predictions_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions_test)
test_r2 = r2_score(y_true=y_test, y_pred=rf_predictions_test)

# Report: header, then train metrics, then test metrics (one per line).
print("Results with Genres Features:")
print(
    f'Train MAE: {train_mae:.4f}',
    f'Train MSE: {train_mse:.4f}',
    f'Train R^2: {train_r2:.4f}',
    sep='\n',
)
print(
    f'Test MAE: {test_mae:.4f}',
    f'Test MSE: {test_mse:.4f}',
    f'Test R^2: {test_r2:.4f}',
    sep='\n',
)

Results with Genres Features:
Train MAE: 0.8937
Train MSE: 3.7112
Train R^2: 0.8799
Test MAE: 1.6614
Test MSE: 11.7101
Test R^2: 0.6197


In [17]:
# One-hot experiment: `*_log` numeric columns, genre columns and the one-hot
# indicator columns, predicting `popularity_log`. The groups below are
# concatenated in this order to form the feature matrix.
log_numeric_cols = [
    'number_of_seasons_log', 'number_of_episodes_log', 'vote_count_log',
    'vote_average_log', 'episode_run_time_log',
]
genre_cols = [
    'Action & Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Family', 'History', 'Kids', 'Music', 'Musical', 'Mystery',
    'News', 'Reality', 'Romance', 'Sci-Fi & Fantasy', 'Soap', 'Talk',
    'Unknown', 'War & Politics', 'Western',
]
language_cols = [
    'original-language_en', 'original-language_zh', 'original-language_ja',
    'original-language_ko', 'original-language_de', 'original-language_fr',
    'original-language_es', 'original-language_pt', 'original-language_ru',
    'original-language_nl', 'original-language_ar', 'original-language_da',
    'original-language_cn', 'original-language_th', 'original-language_tr',
    'original-language_it', 'original-language_hi', 'original-language_sv',
    'original-language_cs', 'original-language_no',
]
creator_cols = [
    'created-by_Shotaro Ishinomori', 'created-by_John de Mol',
    'created-by_Adrián Suar', 'created-by_Simon Fuller',
    'created-by_Ekta Kapoor', 'created-by_Na Young-seok',
    'created-by_Yang Li-Hua', 'created-by_Joseph Barbera, William Hanna',
    'created-by_R.J. Nuevas', 'created-by_Mark Burnett',
]
network_cols = [
    'networks_BBC One', 'networks_YouTube', 'networks_Netflix',
    'networks_ITV1', 'networks_BBC Two', 'networks_ABC', 'networks_NBC',
    'networks_TVB Jade', 'networks_CBS', 'networks_Channel 4', 'networks_ZDF',
]
origin_country_cols = [
    'origin-country_US', 'origin-country_JP', 'origin-country_GB',
    'origin-country_CN', 'origin-country_DE', 'origin-country_KR',
    'origin-country_CA', 'origin-country_FR', 'origin-country_AU',
    'origin-country_BR', 'origin-country_NL', 'origin-country_RU',
    'origin-country_ES', 'origin-country_TH', 'origin-country_HK',
    'origin-country_IN', 'origin-country_DK', 'origin-country_PH',
    'origin-country_IT', 'origin-country_TR', 'origin-country_SE',
    'origin-country_NO', 'origin-country_TW', 'origin-country_BE',
    'origin-country_CZ', 'origin-country_MX',
]
# NOTE(review): these columns carry the same country-code suffixes as the
# `origin-country_*` group — confirm that is what the encoding step intended.
production_company_cols = [
    'production-companies_US', 'production-companies_JP', 'production-companies_GB',
    'production-companies_CN', 'production-companies_DE', 'production-companies_KR',
    'production-companies_CA', 'production-companies_FR', 'production-companies_AU',
    'production-companies_BR', 'production-companies_NL', 'production-companies_RU',
    'production-companies_ES', 'production-companies_TH', 'production-companies_HK',
    'production-companies_IN', 'production-companies_DK', 'production-companies_PH',
    'production-companies_IT', 'production-companies_TR', 'production-companies_SE',
    'production-companies_NO', 'production-companies_TW', 'production-companies_BE',
    'production-companies_CZ', 'production-companies_MX',
]

X = df[
    log_numeric_cols
    + genre_cols
    + language_cols
    + creator_cols
    + network_cols
    + origin_country_cols
    + production_company_cols
]
y = df['popularity_log']

# 80/20 train/test split, seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the 100-tree forest and fit it in one step (`fit` returns the estimator).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions for the training rows and for the held-out rows.
rf_predictions_train = rf.predict(X_train)
rf_predictions_test = rf.predict(X_test)

# Training-split metrics (computed against `popularity_log`).
train_mae = mean_absolute_error(y_true=y_train, y_pred=rf_predictions_train)
train_mse = mean_squared_error(y_true=y_train, y_pred=rf_predictions_train)
train_r2 = r2_score(y_true=y_train, y_pred=rf_predictions_train)

# Test-split metrics (computed against `popularity_log`).
test_mae = mean_absolute_error(y_true=y_test, y_pred=rf_predictions_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions_test)
test_r2 = r2_score(y_true=y_test, y_pred=rf_predictions_test)

# Report: header, then train metrics, then test metrics (one per line).
print("Results with One-Hot Encoded Features:")
print(
    f'Train MAE: {train_mae:.4f}',
    f'Train MSE: {train_mse:.4f}',
    f'Train R^2: {train_r2:.4f}',
    sep='\n',
)
print(
    f'Test MAE: {test_mae:.4f}',
    f'Test MSE: {test_mse:.4f}',
    f'Test R^2: {test_r2:.4f}',
    sep='\n',
)

Results with One-Hot Encoded Features:
Train MAE: 0.1208
Train MSE: 0.0401
Train R^2: 0.9348
Test MAE: 0.2751
Test MSE: 0.1962
Test R^2: 0.6800
