In [1]:
import numpy as np
import pandas as pd
import csv

from sqlalchemy import create_engine

# Extract

### Extract CSVs into DataFrames

In [2]:
imdb_path = "resources/imdb_top_1000.csv"
academy_award_path = "resources/academy_awards.csv"

imdb_df = pd.read_csv(imdb_path)
academy_award_df = pd.read_csv(academy_award_path)

# Transform

### Transform IMDB dataframe

In [3]:
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Testing stuff, IMDB movie names

imdb_df['Series_Title'].count()

imdb_df['Series_Title'].nunique()

999

### Transform Academy Award dataframe

In [5]:
academy_award_df.head()

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
0,1927/1928,1,Actor,,Richard Barthelmess,The Noose
1,1927/1928,1,Actor,1.0,Emil Jannings,The Last Command
2,1927/1928,1,Actress,,Louise Dresser,A Ship Comes In
3,1927/1928,1,Actress,1.0,Janet Gaynor,7th Heaven
4,1927/1928,1,Actress,,Gloria Swanson,Sadie Thompson


In [6]:
# Selecting columns in academy award dataframe
academy_award_df2 = academy_award_df[['Award','Winner','Name','Film']]

# drop empty film & Name cells
academy_award_df2 = academy_award_df2.dropna(subset=['Film'])
academy_award_df2 = academy_award_df2.dropna(subset=['Name'])

# drop empty winner cells as they are only nominees 
academy_award_df2 = academy_award_df2.dropna(subset=['Winner'])

# # dropping times where a movie is credited with an award twice (for example, a movie wins best cinematography, and there's 2 cinematographers)
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Film'])
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Name'])

academy_award_df2


Unnamed: 0,Award,Winner,Name,Film
1,Actor,1.0,Emil Jannings,The Last Command
3,Actress,1.0,Janet Gaynor,7th Heaven
6,Art Direction,1.0,William Cameron Menzies,The Dove; Tempest
9,Cinematography,1.0,Charles Rosher,Sunrise
11,Directing (Comedy Picture),1.0,Lewis Milestone,Two Arabian Knights
...,...,...,...,...
9936,Sound Editing,1.0,Mad Max: Fury Road,Mark Mangini and David White
9942,Sound Mixing,1.0,Mad Max: Fury Road,"Chris Jenkins, Gregg Rudloff and Ben Osmo"
9946,Visual Effects,1.0,Ex Machina,"Andrew Whitehurst, Paul Norris, Mark Ardington..."
9951,Writing (Adapted Screenplay),1.0,The Big Short,Screenplay by Charles Randolph and Adam McKay


### ERD TABLES

In [27]:
#create unique identifiers for movie titles and directors
unique_rows=imdb_df[["Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross"]]
#first create a column where each unique director is given an id
#the limitation of this is if there are two directors with the same name, they will be given the same unique id
unique_rows["director_id"]=unique_rows.groupby(["Director"]).ngroup()
#next create unique movie id. This method will give a unique id to each movie with a different title, director and release year.
#this should stop two different movies with the same title being given the same id
unique_rows["movie_id"]=unique_rows.groupby(["Series_Title","Director","Released_Year"]).ngroup()
unique_rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
0,The Shawshank Redemption,1994,142 min,Drama,9.3,80.0,Frank Darabont,28341469,141,876
1,The Godfather,1972,175 min,"Crime, Drama",9.2,100.0,Francis Ford Coppola,134966411,137,787
2,The Dark Knight,2008,152 min,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,534858444,83,767
3,The Godfather: Part II,1974,202 min,"Crime, Drama",9.0,90.0,Francis Ford Coppola,57300000,137,788
4,12 Angry Men,1957,96 min,"Crime, Drama",9.0,96.0,Sidney Lumet,4360000,456,1
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115 min,"Comedy, Drama, Romance",7.6,76.0,Blake Edwards,,50,131
996,Giant,1956,201 min,"Drama, Western",7.6,84.0,George Stevens,,164,299
997,From Here to Eternity,1953,118 min,"Drama, Romance, War",7.6,85.0,Fred Zinnemann,30500000,145,287
998,Lifeboat,1944,97 min,"Drama, War",7.6,78.0,Alfred Hitchcock,,22,482


In [30]:
print(unique_rows["movie_id"].sort_values(ascending=True))
print(unique_rows["director_id"].sort_values(ascending=True))

754      0
4        1
215      2
84       3
114      4
      ... 
330    995
860    996
772    997
708    998
211    999
Name: movie_id, Length: 1000, dtype: int64
65       0
612      1
734      2
577      3
735      4
      ... 
138    544
334    545
221    545
92     546
366    547
Name: director_id, Length: 1000, dtype: int64


In [34]:
#movies table
movies_table=unique_rows[["movie_id","Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Gross"]]
movies_table

Unnamed: 0,movie_id,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Gross
0,876,The Shawshank Redemption,1994,142 min,Drama,9.3,80.0,28341469
1,787,The Godfather,1972,175 min,"Crime, Drama",9.2,100.0,134966411
2,767,The Dark Knight,2008,152 min,"Action, Crime, Drama",9.0,84.0,534858444
3,788,The Godfather: Part II,1974,202 min,"Crime, Drama",9.0,90.0,57300000
4,1,12 Angry Men,1957,96 min,"Crime, Drama",9.0,96.0,4360000
...,...,...,...,...,...,...,...,...
995,131,Breakfast at Tiffany's,1961,115 min,"Comedy, Drama, Romance",7.6,76.0,
996,299,Giant,1956,201 min,"Drama, Western",7.6,84.0,
997,287,From Here to Eternity,1953,118 min,"Drama, Romance, War",7.6,85.0,30500000
998,482,Lifeboat,1944,97 min,"Drama, War",7.6,78.0,


In [33]:
#directors table
#all directors who have directed movies in the IMDB top 1000
directors=unique_rows[["director_id","Director"]]
directors

Unnamed: 0,director_id,Director
0,141,Frank Darabont
1,137,Francis Ford Coppola
2,83,Christopher Nolan
3,137,Francis Ford Coppola
4,456,Sidney Lumet
...,...,...
995,50,Blake Edwards
996,164,George Stevens
997,145,Fred Zinnemann
998,22,Alfred Hitchcock


In [35]:
#directed_by table
#all movies 
directored_by=unique_rows[["movie_id","director_id"]]
directored_by

Unnamed: 0,movie_id,director_id
0,876,141
1,787,137
2,767,83
3,788,137
4,1,456
...,...,...
995,131,50
996,299,164
997,287,145
998,482,22


Oscars Tables

In [36]:
#Merging IMDB table with Academy Award table so only academy awards to top 1000 movies remain
#Some film titles are in the Film column of Academy Award table, these are merged here

oscar_on_film=pd.merge(academy_award_df2,unique_rows,how='left',left_on="Film",right_on="Series_Title")
oscar_on_film=oscar_on_film.dropna(subset=["IMDB_Rating"])
oscar_on_film=oscar_on_film.drop(columns=["Film","Name"])
oscar_on_film

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
12,Special Award,1.0,The Circus,1928,72 min,"Comedy, Romance",8.1,90.0,Charles Chaplin,,74.0,762.0
24,Directing,1.0,All Quiet on the Western Front,1930,152 min,"Drama, War",8.0,91.0,Lewis Milestone,3270000,292.0,42.0
60,Actor,1.0,It Happened One Night,1934,105 min,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
61,Actress,1.0,It Happened One Night,1934,105 min,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
122,Special Award,1.0,A Star Is Born,2018,136 min,"Drama, Music, Romance",7.6,88.0,Bradley Cooper,215288866,59.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1734,Actor in a Supporting Role,1.0,Whiplash,2014,106 min,"Drama, Music",8.5,88.0,Damien Chazelle,13092000,90.0,971.0
1736,Actress in a Supporting Role,1.0,Boyhood,2014,165 min,Drama,7.9,100.0,Richard Linklater,25379975,408.0,127.0
1754,Actor in a Leading Role,1.0,The Revenant,2015,156 min,"Action, Adventure, Drama",8.0,76.0,Alejandro G. Iñárritu,183637894,14.0,870.0
1755,Actor in a Supporting Role,1.0,Bridge of Spies,2015,142 min,"Drama, History, Thriller",7.6,81.0,Steven Spielberg,72313754,470.0,134.0


In [37]:
#some film titles are in the Name column of Academy Award table, these are merged here

oscar_on_name=pd.merge(academy_award_df2,unique_rows,how="left",left_on="Name",right_on="Series_Title")
oscar_on_name=oscar_on_name.dropna(subset=["IMDB_Rating"])
oscar_on_name=oscar_on_name.drop(columns=["Film","Name"])
oscar_on_name

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
1758,Animated Feature Film,1.0,Inside Out,2015,95 min,"Animation, Adventure, Comedy",8.1,94.0,Pete Docter,356461711,380.0,381.0
1759,Costume Design,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1762,Film Editing,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1763,Makeup and Hairstyling,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1764,Music (Original Score),1.0,The Hateful Eight,2015,168 min,"Crime, Drama, Mystery",7.8,68.0,Quentin Tarantino,54117416,391.0,799.0
1766,Best Picture,1.0,Spotlight,2015,129 min,"Biography, Crime, Drama",8.1,93.0,Tom McCarthy,45055776,502.0,710.0
1767,Production Design,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1770,Sound Editing,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1771,Sound Mixing,1.0,Mad Max: Fury Road,2015,120 min,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1772,Visual Effects,1.0,Ex Machina,2014,108 min,"Drama, Sci-Fi, Thriller",7.7,78.0,Alex Garland,25442958,16.0,264.0


In [42]:
#oscar_winners table
#both dataframes above are merged for a complete list of academy awards given to films in iMBD top 1000

oscar_winners=pd.merge(oscar_on_film,oscar_on_name,how='outer')
oscar_winners=oscar_winners.drop(columns=["Winner","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross","director_id"])
#return movie id column back to int
oscar_winners["movie_id"]=oscar_winners["movie_id"].astype(int)
oscar_winners

Unnamed: 0,Award,Series_Title,movie_id
0,Special Award,The Circus,762
1,Directing,All Quiet on the Western Front,42
2,Actor,It Happened One Night,388
3,Actress,It Happened One Night,388
4,Special Award,A Star Is Born,23
...,...,...,...
130,Sound Editing,Mad Max: Fury Road,501
131,Sound Mixing,Mad Max: Fury Road,501
132,Visual Effects,Ex Machina,264
133,Writing (Adapted Screenplay),The Big Short,747


In [13]:
# oscar_categories table
# all unique award categories from the academy award dataframe

oscar_categories_df = pd.DataFrame(academy_award_df2['Award'].unique())

oscar_categories_df=oscar_categories_df.rename(columns={0:"Category Name"})

oscar_categories_df

Unnamed: 0,Category Name
0,Actor
1,Actress
2,Art Direction
3,Cinematography
4,Directing (Comedy Picture)
...,...
100,Animated Feature Film
101,Writing (Adapted Screenplay)
102,Sound Mixing
103,Makeup and Hairstyling


Movie Genre Tables

In [14]:
#movie_genres table
#1NF table of columns: (1) movie titles and (2) genres 

In [15]:
#genres tables
#unique values from movie_genres["genres"]

# Load

### Create database connection

In [16]:
connection_string = "postgres:postgres@localhost:5432/movies_project_db"
engine = create_engine(f'postgresql://{connection_string}')

### Load DataFrames into database

In [17]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

NameError: name 'premise_transformed' is not defined

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)