# Preciting IMDB Scores
## Build a regression model to predict the IMDb score of a TV show or movie based on features such as release year, runtime, genres, and production countries.

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import json
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pathlib import Path
import pandas as pd
from pathlib import Path
import pandas as pd
import ast

In [2]:


# Define the path to the directory containing the CSV files
data_dir = Path("C:/Users/Qazi Fabia Hoq/OneDrive/Documents/GitHub/netflix-rating-project/resources")

# Read the showData.csv file into a DataFrame
show_data_path = data_dir / "showData.csv"
show_data = pd.read_csv(show_data_path)

# Display the first few rows of the DataFrame to verify that it was read correctly
show_data.head()


Unnamed: 0.1,Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score
0,0,ts300399,Five Came Back: The Reference Films,1945,TV-MA,51,['documentation'],['US'],1.0,
1,1,ts22164,Monty Python's Flying Circus,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,8.8
2,2,ts45948,Monty Python's Fliegender Zirkus,1972,TV-MA,43,['comedy'],[],1.0,8.1
3,3,ts20681,Seinfeld,1989,TV-PG,24,['comedy'],['US'],9.0,8.9
4,4,ts22082,Knight Rider,1982,TV-PG,51,"['scifi', 'action', 'crime', 'drama']",['US'],4.0,6.9


In [3]:
# Read the movieData.csv file into a DataFrame
movie_data_path = data_dir / "movieData.csv"
movie_data = pd.read_csv(movie_data_path)

# Display the first few rows of the DataFrame to verify that it was read correctly
movie_data.head()


Unnamed: 0.1,Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,imdb_score
0,0,tm84618,Taxi Driver,1976,R,114,"['drama', 'crime']",['US'],8.2
1,1,tm154986,Deliverance,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],7.7
2,2,tm127384,Monty Python and the Holy Grail,1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],8.2
3,3,tm120801,The Dirty Dozen,1967,,150,"['war', 'action']","['GB', 'US']",7.7
4,4,tm70993,Life of Brian,1979,R,94,['comedy'],['GB'],8.0


# Data Preprocessing

In [4]:
combined_data = pd.concat([show_data, movie_data], ignore_index=True)


In [5]:
combined_data.to_csv('C:/Users/Qazi Fabia Hoq/OneDrive/Documents/GitHub/netflix-rating-project/resources/combinedmovieshow_data.csv', index=False)


In [6]:
combined_data = pd.read_csv('C:/Users/Qazi Fabia Hoq/OneDrive/Documents/GitHub/netflix-rating-project/resources/combinedmovieshow_data.csv')


In [7]:
combined_data.head()


Unnamed: 0.1,Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score
0,0,ts300399,Five Came Back: The Reference Films,1945,TV-MA,51,['documentation'],['US'],1.0,
1,1,ts22164,Monty Python's Flying Circus,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,8.8
2,2,ts45948,Monty Python's Fliegender Zirkus,1972,TV-MA,43,['comedy'],[],1.0,8.1
3,3,ts20681,Seinfeld,1989,TV-PG,24,['comedy'],['US'],9.0,8.9
4,4,ts22082,Knight Rider,1982,TV-PG,51,"['scifi', 'action', 'crime', 'drama']",['US'],4.0,6.9


In [8]:
# Initialize an empty set to store unique genres
unique_genres = set()

# Iterate over the 'genres' column and add each genre to the set
combined_data['genres'].apply(lambda x: unique_genres.update([genre.strip().strip(" '[]") for genre in x.split(',')]))

# Print the unique genres
print(unique_genres)


{'', 'crime', 'sport', 'war', 'european', 'animation', 'drama', 'fantasy', 'history', 'reality', 'music', 'western', 'thriller', 'horror', 'family', 'documentation', 'scifi', 'action', 'romance', 'comedy'}


In [9]:
# Clean up the 'genres' column
combined_data['genres'] = combined_data['genres'].apply(lambda x: [genre.strip().strip(" '[]") for genre in x.split(',')])

# Extract unique genres
unique_genres = set()
combined_data['genres'].apply(lambda x: unique_genres.update(x))

# Encode genres
for genre in unique_genres:
    combined_data[genre] = combined_data['genres'].apply(lambda x: 1 if genre in x else 0)

# Print unique genres
print(unique_genres)


{'', 'crime', 'sport', 'war', 'european', 'animation', 'drama', 'fantasy', 'history', 'reality', 'music', 'western', 'thriller', 'horror', 'family', 'documentation', 'scifi', 'action', 'romance', 'comedy'}


In [10]:
# Remove rows with missing values in the target variable
combined_data = combined_data.dropna(subset=['imdb_score'])

In [11]:
combined_data = combined_data.dropna(subset=['release_year', 'runtime', 'production_countries'])
# Impute missing values in 'release_year' with the median
release_year_median = combined_data['release_year'].median()
combined_data['release_year'].fillna(release_year_median, inplace=True)

# Impute missing values in 'runtime' with the median
runtime_median = combined_data['runtime'].median()
combined_data['runtime'].fillna(runtime_median, inplace=True)
# Impute missing values in 'production_countries' with a new category 'Unknown'
combined_data['production_countries'].fillna('Unknown', inplace=True)


# Feature Selection

In [12]:
X = combined_data[['release_year', 'runtime', 'western', 'fantasy', 'history', 'romance', 'crime', 'european', 'comedy', 'action', 'sport', 'thriller', 'music', 'animation', 'war', 'reality', 'horror', 'scifi', 'family', 'drama', 'documentation']]
y = combined_data['imdb_score']


# Split The Data

In [13]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and test sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (4294, 21)
X_test shape: (1074, 21)
y_train shape: (4294,)
y_test shape: (1074,)


# Model Selection

In [14]:

# Initialize the models
models = {
    'Linear Regression': LinearRegression()
}


# Train The Model

In [15]:
for name, model in models.items():
    print(f"Training {name}...")
    # Train the model
    model.fit(X_train, y_train)
    print(f"{name} trained.")


Training Linear Regression...
Linear Regression trained.


In [16]:
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)


Coefficients: [-0.02773994 -0.00526004 -0.11579916  0.18058725  0.41580719 -0.12664494
  0.14586746 -0.01860548 -0.00803708 -0.20385965  0.03635262 -0.07652127
  0.00420428  0.44966579  0.22323952 -0.09041945 -0.48724668 -0.00947496
 -0.43177198  0.51540985  0.72176608]
Intercept: 62.5086948263234


# Make Prediction

In [17]:
# Make predictions
y_pred = model.predict(X_test)

# Print predictions
print("Predictions:", y_pred)

Predictions: [6.42671293 6.4073848  6.61414361 ... 6.68420992 6.50555588 6.14995543]


# Evaluate The Performance

In [18]:
# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Print the metrics
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Absolute Error: 0.823485196119407
Mean Squared Error: 1.1243918647340323
R-squared: 0.15549410044771972


# Analysis of IMDb Score Prediction Model

## Model Overview:

The regression model was built to predict IMDb scores for TV shows and movies. It considers various features such as release year, runtime, genres, and production countries. This analysis aims to understand the factors influencing IMDb scores and predict scores for new titles.

## Model Coefficients and Intercept:


### Coefficients: 
The coefficients represent the impact of each feature on the IMDb score. For example, a positive coefficient for a feature indicates that an increase in that feature is associated with a higher IMDb score, while a negative coefficient suggests the opposite.
### Release Year: 
Coefficient of -0.0277 implies that newer titles tend to have slightly lower scores.This coefficient suggests that for each unit increase in the release year of a title, the IMDb score is expected to decrease by approximately 0.0277 points. In other words, newer titles tend to have slightly lower scores according to the model. This could be due to changing audience preferences, evolving standards in filmmaking, or other factors that influence how newer content is received compared to older content.


Runtime: A negative coefficient (-0.0053) suggests that longer runtimes are associated with slightly lower IMDb scores.The negative coefficient for runtime implies that for each additional unit of runtime (e.g., minute), the IMDb score is expected to decrease by approximately 0.0053 points. This suggests that longer runtimes are associated with slightly lower IMDb scores. This relationship might reflect audience preferences for shorter, more concise content, where longer runtimes could lead to viewer fatigue or a perception of pacing issues.

### Genres: 
Coefficients for individual genres indicate their impact on IMDb scores. Positive coefficients suggest a positive influence, while negative coefficients suggest a negative influence.
Production Countries: Similarly, coefficients for production countries indicate their impact on IMDb scores.
### Intercept: 
The intercept (62.5087) represents the IMDb score when all other features are zero. It can be interpreted as the base score for a movie or TV show with no specific characteristics.

## Model Performance Metrics:

### Mean Absolute Error (MAE):
The MAE of 0.8235 indicates that, on average, the model's predictions differ from the actual IMDb scores by approximately 0.8235 points.A lower MAE suggests that the model's predictions are closer to the actual scores, indicating better performance.
### Mean Squared Error (MSE): 
The MSE of 1.1244 measures the average squared difference between predicted and actual scores, indicating the model's overall accuracy.It gives more weight to large errors, making it useful for understanding the magnitude of the errors in the model's predictions.
### R-squared (R²): 

The R² value of 0.1555 suggests that the model explains 15.55% of the variance in the IMDb scores, indicating a moderate level of predictive power.It indicates the proportion of the variance in the dependent variable (IMDb scores) that is predictable from the independent variables (features) in the model.This means that the model captures some but not all of the factors that influence IMDb scores.In this case, an R² of 0.1555 suggests that there is room for improvement in the model's predictive power. Further refinement of the model, such as adding more features or using more advanced modeling techniques, may help improve its performance.


## Predictions:

The model's predictions range from 6.4267 to 6.1499, indicating the IMDb scores it assigned to the test dataset.

## Conclusion:
The linear regression model provides valuable insights into the factors influencing IMDb scores. However, its performance, as indicated by MAE, MSE, and R², suggests that further refinement may be needed to improve prediction accuracy. Future iterations could include additional features or use more advanced modeling techniques to enhance predictive performance.

Additionally, considering the presence of missing values in the dataset, exploring other models such as deep neural networks could be beneficial. Neural networks are capable of capturing complex relationships in the data and may provide more accurate predictions. Integrating such models into the analysis could lead to a more comprehensive understanding of IMDb score prediction for TV shows and movies.

Note that detailed analysis and implementation of neural networks are presented in the "Large_model_training" and "small_model_notebook" notebooks.