In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from scipy.stats import linregress
import datetime

In [2]:
# Load the two source tables from CSV. Both paths are relative to the
# notebook's working directory.
movies_meta_path = "../ImportData/movies_metadata.csv"
movies_updated_path = "../ImportData/MoviesOnStreamingPlatforms_updated.csv"

# Movie metadata (runtime, votes, release date, ...)
movies_meta_df = pd.read_csv(filepath_or_buffer=movies_meta_path)

# Streaming availability per movie
movies_streaming_df = pd.read_csv(filepath_or_buffer=movies_updated_path)

In [3]:
# Parse release_date into pandas datetimes so the .dt accessor can be used on it
movies_meta_df = movies_meta_df.assign(
    release_date=pd.to_datetime(movies_meta_df['release_date'])
)

In [4]:
# Check the column dtypes; release_date should now be reported as datetime64[ns]
movies_meta_df.dtypes

id                              object
production_countries            object
release_date            datetime64[ns]
revenue                        float64
runtime                        float64
spoken_languages                object
status                          object
tagline                         object
title                           object
video                           object
vote_average                   float64
vote_count                     float64
dtype: object

In [5]:
# Build the trimmed metadata table in a single chained expression:
#   1. add a 'Year' column holding the four-digit year (as a string) taken
#      from release_date,
#   2. drop the columns that are not needed for the analysis,
#   3. rename 'title' to 'Title'.
movies_meta_df = (
    movies_meta_df
    .assign(Year=movies_meta_df['release_date'].dt.strftime('%Y'))
    .drop(
        columns=[
            'production_countries',
            'release_date',
            'revenue',
            'spoken_languages',
            'status',
            'tagline',
            'video',
        ]
    )
    .rename(columns={'title': 'Title'})
)

# Preview the first rows of the trimmed table
movies_meta_df.head()

Unnamed: 0,id,runtime,Title,vote_average,vote_count,Year
0,862,81.0,Toy Story,7.7,5415.0,1995
1,8844,104.0,Jumanji,6.9,2413.0,1995
2,15602,101.0,Grumpier Old Men,6.5,92.0,1995
3,31357,127.0,Waiting to Exhale,6.1,34.0,1995
4,11862,106.0,Father of the Bride Part II,5.7,173.0,1995


In [6]:
# Keep only the metadata rows that have a Year value
movies_meta_df = movies_meta_df.dropna(subset=["Year"])

In [7]:
# Remove the streaming-table columns that are not needed for the analysis
movies_streaming_df = movies_streaming_df.drop(
    ['Unnamed: 0', 'Type', 'Directors', 'Country', 'Language', 'Runtime'],
    axis="columns",
)

In [8]:
# Re-check the metadata dtypes after trimming; Year is still an object (string) column here
movies_meta_df.dtypes

id               object
runtime         float64
Title            object
vote_average    float64
vote_count      float64
Year             object
dtype: object

In [9]:
# Inspect the streaming-table dtypes; its Year column is int64, unlike the metadata Year
movies_streaming_df.dtypes

ID                   int64
Title               object
Year                 int64
Age                 object
IMDb               float64
Rotten Tomatoes     object
Netflix              int64
Hulu                 int64
Prime Video          int64
Disney+              int64
Genres              object
dtype: object

In [10]:
# Cast Year from string to integer so it has the same kind of values as the
# integer Year column of the streaming table
movies_meta_df = movies_meta_df.astype({'Year': int})

In [11]:
# Combine the two data sets: enrich each streaming title with its metadata by
# matching on Title and Year. how='left' keeps every streaming row; titles with
# no metadata match get NaN in the metadata columns.
#
# The metadata side is de-duplicated on the join keys first. With a left join,
# every repeated (Title, Year) pair in movies_meta_df would otherwise emit an
# extra copy of the matching streaming row and inflate the merged table.
# drop_duplicates keeps the first occurrence of each key pair.
# NOTE(review): if two genuinely different films share a title and year, only
# the first one's metadata is kept -- confirm this is acceptable.
#
# validate='many_to_one' makes pandas raise a MergeError if the right-hand keys
# are ever not unique, instead of silently duplicating rows.
movies_merged_df = pd.merge(
    movies_streaming_df,
    movies_meta_df.drop_duplicates(subset=['Title', 'Year']),
    how='left',
    on=['Title', 'Year'],
    validate='many_to_one',
)

In [12]:
# Save the merged table as a CSV file in the notebook's working directory.
# pandas defaults apply, so the row index is written as the first column.
movies_merged_df.to_csv(path_or_buf="movies_merged_df.csv")