In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pickle
import requests
import json

# Dataset:  IMDb movies.csv

Source: Stefano Leone (2020). "IMDb movies extensive dataset", Kaggle, https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset/version/2?select=IMDb+movies.csv

Description: 81k+ movies and 175k+ cast members scraped from IMDb.


In [2]:
# Load the IMDb movies CSV into a pandas DataFrame, then print the frame's
# summary (columns, dtypes, non-null counts) via DataFrame.info().

movies_csv_path = 'Resources/IMDb movies.csv'
movies = pd.read_csv(movies_csv_path)
movies.info()

FileNotFoundError: [Errno 2] No such file or directory: 'Resources/IMDb movies.csv'

In [None]:
# Build the modelling table from the raw `movies` frame.
#
# Everything is done in one method chain so each step works on a fresh
# DataFrame. The previous version assigned new column values into a frame
# that had just been produced by boolean filtering, which is the pattern
# that can raise pandas' SettingWithCopyWarning.
#
# Steps, in order:
#   1. Keep the columns "title", "year", "genre", "country", "avg_vote",
#      "budget" & "worlwide_gross_income" (that spelling is the column name
#      used in the frame, so it must not be "corrected" here).
#   2. Drop every row that has a NaN in any of those columns.
#   3. Drop rows whose "year" is the stray string "TV Movie 2019".
#   4. Strip the leading currency token from "budget" and
#      "worlwide_gross_income" by keeping the second space-separated piece.
#   5. Cast "year", "budget" & "worlwide_gross_income" to int64.
#
# NOTE(review): step 4 discards the currency token without looking at it.
# If any row carries a currency other than dollars, its amount ends up in
# the same column unconverted -- confirm whether such rows should be
# filtered out or converted before modelling.
movies_tygcvbi = (
    movies[["title", "year", "genre", "country", "avg_vote", "budget", "worlwide_gross_income"]]
    .dropna()
    .loc[lambda df: df["year"] != "TV Movie 2019"]
    .assign(
        budget=lambda df: df["budget"].str.split(" ").str[1],
        worlwide_gross_income=lambda df: df["worlwide_gross_income"].str.split(" ").str[1],
    )
    .astype({"year": "int64", "budget": "int64", "worlwide_gross_income": "int64"})
)

movies_tygcvbi.head()

In [None]:
# Display the dtype of every column in movies_tygcvbi.
movies_tygcvbi.dtypes

In [None]:
# "country": "USA"
# "year": 2010 ~ 2019
# "budget": => $ 1,000,000 (1149 Data Points)

# One boolean mask per criterion; a row is kept only when all four hold.
is_usa = movies_tygcvbi["country"] == "USA"
from_2010 = movies_tygcvbi["year"] >= 2010
through_2019 = movies_tygcvbi["year"] <= 2019
min_budget = movies_tygcvbi["budget"] >= 1000000

movies_tygcvbi = movies_tygcvbi[is_usa & from_2010 & through_2019 & min_budget]

# Preview the first rows of the filtered frame.
movies_tygcvbi.head()

In [None]:
# Build the feature array X (budget) and the target array y (worldwide gross
# income). Each column's values are reshaped to (-1, 1), i.e. a single-column
# two-dimensional array, which is the layout sklearn is given below.

budget_values = movies_tygcvbi["budget"].values
income_values = movies_tygcvbi["worlwide_gross_income"].values

X = budget_values.reshape(-1, 1)
y = income_values.reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

In [None]:
# Plot the data to see if a linear trend exists.
# An explicit Figure/Axes pair is used, and the axes and title are labelled
# so the chart can be understood on its own when the notebook is skimmed.

fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set_xlabel("Budget")
ax.set_ylabel("Worldwide gross income")
# Trailing semicolon keeps the Text repr of the last call out of the output.
ax.set_title("Budget vs. worldwide gross income");

In [None]:
# Split X and y into training and testing parts with sklearn's
# `train_test_split`; random_state is pinned to 42.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create the model: a LinearRegression estimator constructed with no
# arguments. It is not fitted in this cell.

from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [None]:
# Fit the model to the training data
# (X_train as the features, y_train as the targets).

model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted model on the testing data: predict from X_test, then
# compare the predictions with y_test using mean squared error and r-squared.

from sklearn.metrics import mean_squared_error, r2_score

# Predictions for the held-out feature values
predicted = model.predict(X_test)

# Compute both metrics first ...
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

# ... then report them
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

In [None]:
# Call the `score` method on the model to show the r2 score,
# evaluated on the testing data (X_test, y_test).

model.score(X_test, y_test)

In [None]:
# Save the model as "movies_model_overall.pkl".
# The file is opened in a `with` block so it is closed as soon as the dump
# finishes; the bare open() used before never closed the handle, which
# leaks it and can leave the pickle unflushed when it is read back.

with open('movies_model_overall.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load model and predict income based on $ 200 million.
# The file is opened in a `with` block so the handle is closed right after
# loading (the bare open() used before was never closed).
# NOTE: pickle.load can run arbitrary code contained in the file, so only
# load pickle files that come from a trusted source.

with open('movies_model_overall.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# predict() is given a 2-D input: one row with one feature (the budget).
print(model.predict([[200000000]]))