In [18]:
import numpy as np
import pandas as pd
import csv
from sqlalchemy import create_engine

# Extract

### Extract CSVs into DataFrames

In [19]:
imdb_path = "resources/imdb_top_1000.csv"
academy_award_path = "resources/academy_awards.csv"
imdb_df = pd.read_csv(imdb_path)
academy_award_df = pd.read_csv(academy_award_path)

# Transform

### Transform IMDB dataframe

In [20]:
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


Data Cleaning

In [21]:
#change Runtime to remove "min" text and make it easier to query runtimes later on
#converts the column to integer as well
imdb_df["Runtime"]=imdb_df["Runtime"].str.replace(" min","",case=False).astype(int)

imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [22]:
#clean gross column to remove commas to make database easy to query
# transform gross column: remove commas, fillna 
imdb_df['Gross'] = imdb_df['Gross'].str.replace(',', '')
# transform gross column: convert to int
imdb_df['Gross'] = imdb_df['Gross'].astype(int,errors='ignore')
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [23]:
#convert Series Titles to string. This will help with merging on the academy award table later
#it also makes sure all series titles can be queried as strings in the database
#check column types to investigate
imdb_df=imdb_df.astype({"Series_Title":"string"})
imdb_df.dtypes

Poster_Link       object
Series_Title      string
Released_Year     object
Certificate       object
Runtime            int32
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [24]:
#change released_year to integer and find any problem rows
imdb_df["Released_Year"]=imdb_df["Released_Year"].apply(pd.to_numeric,errors='coerce')
imdb_df.loc[imdb_df["Released_Year"].isnull()]
#there was one result, Apollo 13, which had an incorrect release year in the data.
#we have decided to manually enter the release year for our dataset

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,,U,140,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [25]:
#manually adjust entry
imdb_df.at[966,"Released_Year"]=1995
imdb_df.loc[imdb_df["Series_Title"]=="Apollo 13"]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,1995.0,U,140,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [26]:
#convert all release years from float to integer
imdb_df["Released_Year"]=imdb_df["Released_Year"].astype(int)
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


### Transform Academy Award dataframe

In [27]:
# Selecting columns in academy award dataframe
academy_award_df2 = academy_award_df[['Award','Winner','Name','Film']]

# drop empty film & Name cells
academy_award_df2 = academy_award_df2.dropna(subset=['Film'])
academy_award_df2 = academy_award_df2.dropna(subset=['Name'])

# drop empty winner cells as they are only nominees 
academy_award_df2 = academy_award_df2.dropna(subset=['Winner'])

# # dropping times where a movie is credited with an award twice (for example, a movie wins best cinematography, and there's 2 cinematographers)
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Film'])
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Name'])

print(f"Number of rows in Clean Academy Award table: {len(academy_award_df2.index)}")

academy_award_df2.head()


Number of rows in Clean Academy Award table: 1775


Unnamed: 0,Award,Winner,Name,Film
1,Actor,1.0,Emil Jannings,The Last Command
3,Actress,1.0,Janet Gaynor,7th Heaven
6,Art Direction,1.0,William Cameron Menzies,The Dove; Tempest
9,Cinematography,1.0,Charles Rosher,Sunrise
11,Directing (Comedy Picture),1.0,Lewis Milestone,Two Arabian Knights


In [28]:
#convert the name and film columns to strings. 
#this will help with merging later on to the movies dataframe
#it also ensures all movie titles can be queried as strings in the database
academy_award_df2=academy_award_df2.astype({"Film":"string"})
academy_award_df2=academy_award_df2.astype({"Name":"string"})
academy_award_df2.dtypes

Award      object
Winner    float64
Name       string
Film       string
dtype: object

### ERD TABLES

generate unique movie_id and director_id

In [29]:
#create unique identifiers for movie titles and directors
unique_rows=imdb_df[["Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross"]]
#first create a column where each unique director is given an id
#the limitation of this is if there are two directors with the same name, they will be given the same unique id
unique_rows["director_id"]=unique_rows.groupby(["Director"]).ngroup()
#next create unique movie id. This method will give a unique id to each movie with a different title, director and release year.
#this should stop two different movies with the same title being given the same id
unique_rows["movie_id"]=unique_rows.groupby(["Series_Title","Director","Released_Year"]).ngroup()
unique_rows.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
0,The Shawshank Redemption,1994,142,Drama,9.3,80.0,Frank Darabont,28341469,141,876
1,The Godfather,1972,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,134966411,137,787
2,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,534858444,83,767
3,The Godfather: Part II,1974,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,57300000,137,788
4,12 Angry Men,1957,96,"Crime, Drama",9.0,96.0,Sidney Lumet,4360000,456,1


movies table

In [30]:
#movies table
#get column needed to load into database
movies_table=unique_rows[["movie_id","Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Gross"]]
movies_table=movies_table.rename(columns={"Series_Title":"movie_title","Released_Year":"release_year","Runtime":"runtime","Genre":"genre","IMDB_Rating":"imdb_score","Meta_score":"meta_score","Gross":"gross"})
movies_table.head()

Unnamed: 0,movie_id,movie_title,release_year,runtime,genre,imdb_score,meta_score,gross
0,876,The Shawshank Redemption,1994,142,Drama,9.3,80.0,28341469
1,787,The Godfather,1972,175,"Crime, Drama",9.2,100.0,134966411
2,767,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84.0,534858444
3,788,The Godfather: Part II,1974,202,"Crime, Drama",9.0,90.0,57300000
4,1,12 Angry Men,1957,96,"Crime, Drama",9.0,96.0,4360000


directors and directed_by tables

In [31]:
#directors table
#all directors who have directed movies in the IMDB top 1000
directors=unique_rows[["director_id","Director"]]
directors=directors.rename(columns={"Director":"director_name"})
directors=directors.drop_duplicates()
directors.reset_index(drop=True,inplace=True)
directors.head()

Unnamed: 0,director_id,director_name
0,141,Frank Darabont
1,137,Francis Ford Coppola
2,83,Christopher Nolan
3,456,Sidney Lumet
4,383,Peter Jackson


In [32]:
#directed_by table
#all movies 
directored_by=unique_rows[["movie_id","director_id"]]
directored_by.head()

Unnamed: 0,movie_id,director_id
0,876,141
1,787,137
2,767,83
3,788,137
4,1,456


oscar_winners and oscar_categories tables

In [33]:
academy_award_merge=academy_award_df2.copy()
unique_rows_merge=unique_rows.copy()
academy_award_merge["Film"]=academy_award_merge["Film"].str.strip()
academy_award_merge["Name"]=academy_award_merge["Name"].str.strip()
unique_rows_merge["Series_Title"]=unique_rows_merge["Series_Title"].str.strip()


In [44]:
#Merging IMDB table with Academy Award table so only academy awards to top 1000 movies remain
#Some film titles are in the Film column of Academy Award table, these are merged here
oscar_on_film=pd.merge(academy_award_merge,unique_rows_merge,how='left',left_on="Film",right_on="Series_Title")
oscar_on_film=oscar_on_film.dropna(subset=["IMDB_Rating"])
oscar_on_film=oscar_on_film.drop(columns=["Film","Name"])
oscar_on_film.head()

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
12,Special Award,1.0,The Circus,1928.0,72.0,"Comedy, Romance",8.1,90.0,Charles Chaplin,,74.0,762.0
24,Directing,1.0,All Quiet on the Western Front,1930.0,152.0,"Drama, War",8.0,91.0,Lewis Milestone,3270000.0,292.0,42.0
60,Actor,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000.0,140.0,388.0
61,Actress,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000.0,140.0,388.0
122,Special Award,1.0,A Star Is Born,2018.0,136.0,"Drama, Music, Romance",7.6,88.0,Bradley Cooper,215288866.0,59.0,23.0


In [45]:
#some film titles are in the Name column of Academy Award table, these are merged here

oscar_on_name=pd.merge(academy_award_merge,unique_rows_merge,how="left",left_on="Name",right_on="Series_Title")
oscar_on_name=oscar_on_name.dropna(subset=["IMDB_Rating"])
oscar_on_name=oscar_on_name.drop(columns=["Film","Name"])
oscar_on_name.head()

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
25,Outstanding Production,1.0,All Quiet on the Western Front,1930.0,152.0,"Drama, War",8.0,91.0,Lewis Milestone,3270000,292.0,42.0
58,Writing (Adaptation),1.0,Little Women,2019.0,135.0,"Drama, Romance",7.8,91.0,Greta Gerwig,108101214,171.0,486.0
65,Directing,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
69,Outstanding Production,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
73,Writing (Adaptation),1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0


In [36]:
#oscar_winners table
#both dataframes above are merged for a complete list of academy awards given to films in iMBD top 1000
#a movie cannot win in the same category more than once therefore the category and movie_id make a composite key
oscar_winners=pd.merge(oscar_on_film,oscar_on_name,how='outer')
oscar_winners=oscar_winners.drop(columns=["Winner","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross","director_id","Series_Title"])
#return movie id column back to int
oscar_winners["movie_id"]=oscar_winners["movie_id"].astype(int)
oscar_winners=oscar_winners.rename(columns={"Award":"category"})
oscar_winners.head()

Unnamed: 0,category,movie_id
0,Special Award,762
1,Directing,42
2,Actor,388
3,Actress,388
4,Special Award,23


In [46]:
oscar_winners.to_csv("oscies.csv")

In [37]:
# oscar_categories table
# all unique award categories from the academy award dataframe

oscar_categories = pd.DataFrame(academy_award_df2['Award'].unique())

oscar_categories=oscar_categories.rename(columns={0:"category_name"})

oscar_categories.head()

Unnamed: 0,category_name
0,Actor
1,Actress
2,Art Direction
3,Cinematography
4,Directing (Comedy Picture)


movie_genres and genres tables

In [38]:
## create movie_genres table
unique_rows_copy = unique_rows.copy()
genre_list = unique_rows_copy['Genre'].str.split(',')
movie_genres = pd.DataFrame({'movie_id': unique_rows_copy['movie_id'], 'Genre': genre_list})
len_list = list(map(len, movie_genres['Genre']))
movie_genres = pd.DataFrame({'movie_id': np.repeat(movie_genres['movie_id'], len_list), 'Genre': np.concatenate(movie_genres['Genre'].values)})
#striping leading and ending spaces in genre
movie_genres["Genre"] = movie_genres["Genre"].str.strip()
movie_genres=movie_genres.reset_index(drop=True)
#it is impossible for a single movie to have duplicates of genre therefore movie_id and genre columns are a composite key
movie_genres=movie_genres.rename(columns={"Genre":"genre"})
movie_genres.head()

Unnamed: 0,movie_id,genre
0,876,Drama
1,787,Crime
2,787,Drama
3,767,Action
4,767,Crime


In [39]:
#genres table
#all genres are unique therefore it is a single column table of primary keys
genres = pd.DataFrame({'genre': movie_genres['genre'].unique()})
genres.head()


Unnamed: 0,genre
0,Drama
1,Crime
2,Action
3,Adventure
4,Biography


# Load

### Create database connection

In [40]:
connection_string = "postgres:postgres@localhost:5432/movies_project_db"
engine = create_engine(f'postgresql://{connection_string}')

### Load DataFrames into database

In [41]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

NameError: name 'premise_transformed' is not defined

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)