# Setup and data cleaning

In [277]:
# Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [119]:
# Import IMDB movie data
title_basics = pd.read_csv('../data/title.basics.tsv',delimiter='\t',encoding='utf-8-sig',low_memory=False)

# Import ratings
title_ratings = pd.read_csv('../data/title.ratings.tsv',delimiter='\t',encoding='utf-8-sig')

# Import crew
title_crew = pd.read_csv('../data/title.crew.tsv',delimiter='\t',encoding='utf-8-sig')

# Import name basics
name_basics = pd.read_csv('../data/name.basics.tsv',delimiter='\t',encoding='utf-8-sig')

# Import box office data from BoxOfficeMojo
box_office = pd.read_csv('../data/boxoffice.csv')

# Import Oscar data from Wikipedia: https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films
oscars = pd.read_csv('../data/oscars_cleaned.csv')

In [120]:
# Filter non-movies, adult movies
title_filtered = title_basics[title_basics['titleType']=='movie']
title_filtered = title_filtered[title_filtered['isAdult']==0]

In [129]:
# Split out genres and join
genres_split = title_filtered["genres"].str.split(",", n=2, expand=True)
joined = title_filtered.join(genres_split)

In [131]:
# Rename and drop some columns
cleaned = joined.rename(columns = {'tconst':'IMDB ID', 'titleType': 'Type', 'primaryTitle': 'Title', 'originalTitle': 'Title (original)', 'startYear': 'Year', 'runtimeMinutes': 'Runtime (min)', 0:'Genre (main)', 1:'Genre (sub 1)', 2:'Genre (sub 2)'})
cleaned = cleaned.drop(columns=['endYear', 'genres', 'Genre (sub 1)', 'Genre (sub 2)', 'Title (original)'])

In [274]:
# Drop empty rows
cleaned = cleaned.dropna(axis='rows')

In [140]:
# Merge basic set and rating
with_ratings = cleaned.set_index('IMDB ID').join(title_ratings.set_index('tconst'))
with_ratings = with_ratings.rename(columns = {'averageRating': 'Rating (avg.)', 'numVotes': 'Votes'})

# Merge box office and Oscars
merged = pd.merge(box_office, oscars, left_on='title', right_on='Film', how='outer')

In [263]:
# Merge both of above to make combined set
combined = with_ratings.merge(merged,how='left', left_on='Title', right_on='title')

In [266]:
# Drop, rename
combined = combined.drop(['Type', 'isAdult', 'Year_y', 'year', 'rank', 'title', 'Film'], axis=1)
combined = combined.rename(columns = {'Year_x': 'Year', 'studio': 'Studio', 'lifetime_gross': 'Gross (lifetime)'})

In [276]:
combined.count()

Title               540781
Year                540781
Runtime (min)       540781
Genre (main)        540781
Rating (avg.)       243681
Votes               243681
Studio               27293
Gross (lifetime)     27357
Awards                1312
Nominations           1312
dtype: int64

## More data etc. we could integrate

In [None]:
# https://towardsdatascience.com/collecting-movie-data-445ca1ead8e5
# Budget
# Country (e.g. most popular genres by country)

# Calculate avg gross ($) per movie
#box_plot['Avg. lifetime gross'] = box_plot['lifetime_gross']/box_plot['title']

# Reset index to get year in column
#box_plot.reset_index()

# Machine Learning (WIP)

In [None]:
# Assign the data to X and y
# Note: Sklearn requires a two-dimensional array of values so we use reshape to create this
X = brains["weight"].values.reshape(-1, 1)
y = brains["size"].values.reshape(-1, 1)
print("Shape: ", X.shape, y.shape)

# Assign the data to X and y
#X = foam[["foam", "beer"]]
#y = foam["time"].values.reshape(-1, 1)
#print(X.shape, y.shape)

In [None]:
# Plot the data to see if a linear trend exists
plt.scatter(X, y)
plt.xlabel("Brain Weight")
plt.ylabel("Head Size")

In [None]:
# Use sklearn's `train_test_split` to split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Use train_test_split to create training and testing data
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create the model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Create the model using LinearRegression
#from sklearn.linear_model import LinearRegression
#model = LinearRegression()

In [None]:
# Fit the model to the training data. 
model.fit(X_train, y_train)

# Fit the model to the training data and calculate the scores for the training and testing data
#model.fit(X_train, y_train)

In [None]:
# Calculate the mean_squared_error and the r-squared value for the testing data
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to make predictions
predicted = model.predict(X_test)

# Score the predictions with mse and r2
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

#training_score = model.score(X_train, y_train)
#testing_score = model.score(X_test, y_test)

#print(f"Training Score: {training_score}")
#print(f"Testing Score: {testing_score}")

In [None]:
# Call the `score` method on the model to show the r2 score
model.score(X_test, y_test)

In [None]:
# Plot the Residuals for the Training and Testing data
#plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c="blue", label="Training Data")
#plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c="orange", label="Testing Data")
#plt.legend()
#plt.hlines(y=0, xmin=y.min(), xmax=y.max())
#plt.title("Residual Plot")