In [1]:
import numpy as np
import pandas as pd
import csv

from sqlalchemy import create_engine

# Extract

### Extract CSVs into DataFrames

In [2]:
imdb_path = "resources/imdb_top_1000.csv"
academy_award_path = "resources/academy_awards.csv"

imdb_df = pd.read_csv(imdb_path)
academy_award_df = pd.read_csv(academy_award_path)

# Transform

### Transform IMDB dataframe

In [3]:
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


Data Cleaning

In [4]:
#change Runtime to remove "min" text and make it easier to query runtimes later on
#converts the column to integer as well
imdb_df["Runtime"]=imdb_df["Runtime"].str.replace(" min","",case=False).astype(int)
imdb_df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,


In [5]:
#check column types to investigate
imdb_df.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime            int32
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [6]:
#change released_year to integer and find any problem rows
imdb_df["Released_Year"]=imdb_df["Released_Year"].apply(pd.to_numeric,errors='coerce')
imdb_df.loc[imdb_df["Released_Year"].isnull()]
#there was one result, Apollo 13, which had an incorrect release year in the data.
#we have decided to manually enter the release year for our dataset

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,,U,140,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [7]:
#manually adjust entry
imdb_df.at[966,"Released_Year"]=1995
imdb_df.loc[imdb_df["Series_Title"]=="Apollo 13"]

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,https://m.media-amazon.com/images/M/MV5BNjEzYj...,Apollo 13,1995.0,U,140,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [8]:
#convert all release years from float to integer
imdb_df["Released_Year"]=imdb_df["Released_Year"].astype(int)
imdb_df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,,97,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,


### Transform Academy Award dataframe

In [9]:
academy_award_df.head()

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
0,1927/1928,1,Actor,,Richard Barthelmess,The Noose
1,1927/1928,1,Actor,1.0,Emil Jannings,The Last Command
2,1927/1928,1,Actress,,Louise Dresser,A Ship Comes In
3,1927/1928,1,Actress,1.0,Janet Gaynor,7th Heaven
4,1927/1928,1,Actress,,Gloria Swanson,Sadie Thompson


In [10]:
# Selecting columns in academy award dataframe
academy_award_df2 = academy_award_df[['Award','Winner','Name','Film']]

# drop empty film & Name cells
academy_award_df2 = academy_award_df2.dropna(subset=['Film'])
academy_award_df2 = academy_award_df2.dropna(subset=['Name'])

# drop empty winner cells as they are only nominees 
academy_award_df2 = academy_award_df2.dropna(subset=['Winner'])

# # dropping times where a movie is credited with an award twice (for example, a movie wins best cinematography, and there's 2 cinematographers)
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Film'])
academy_award_df2 = academy_award_df2.drop_duplicates(subset=['Award','Name'])

academy_award_df2


Unnamed: 0,Award,Winner,Name,Film
1,Actor,1.0,Emil Jannings,The Last Command
3,Actress,1.0,Janet Gaynor,7th Heaven
6,Art Direction,1.0,William Cameron Menzies,The Dove; Tempest
9,Cinematography,1.0,Charles Rosher,Sunrise
11,Directing (Comedy Picture),1.0,Lewis Milestone,Two Arabian Knights
...,...,...,...,...
9936,Sound Editing,1.0,Mad Max: Fury Road,Mark Mangini and David White
9942,Sound Mixing,1.0,Mad Max: Fury Road,"Chris Jenkins, Gregg Rudloff and Ben Osmo"
9946,Visual Effects,1.0,Ex Machina,"Andrew Whitehurst, Paul Norris, Mark Ardington..."
9951,Writing (Adapted Screenplay),1.0,The Big Short,Screenplay by Charles Randolph and Adam McKay


### ERD TABLES

generate unique movie_id and director_id

In [11]:
#create unique identifiers for movie titles and directors
unique_rows=imdb_df[["Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross"]]
#first create a column where each unique director is given an id
#the limitation of this is if there are two directors with the same name, they will be given the same unique id
unique_rows["director_id"]=unique_rows.groupby(["Director"]).ngroup()
#next create unique movie id. This method will give a unique id to each movie with a different title, director and release year.
#this should stop two different movies with the same title being given the same id
unique_rows["movie_id"]=unique_rows.groupby(["Series_Title","Director","Released_Year"]).ngroup()
unique_rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
0,The Shawshank Redemption,1994,142,Drama,9.3,80.0,Frank Darabont,28341469,141,876
1,The Godfather,1972,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,134966411,137,787
2,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,534858444,83,767
3,The Godfather: Part II,1974,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,57300000,137,788
4,12 Angry Men,1957,96,"Crime, Drama",9.0,96.0,Sidney Lumet,4360000,456,1
...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,"Comedy, Drama, Romance",7.6,76.0,Blake Edwards,,50,131
996,Giant,1956,201,"Drama, Western",7.6,84.0,George Stevens,,164,299
997,From Here to Eternity,1953,118,"Drama, Romance, War",7.6,85.0,Fred Zinnemann,30500000,145,287
998,Lifeboat,1944,97,"Drama, War",7.6,78.0,Alfred Hitchcock,,22,482


In [12]:
print(unique_rows["movie_id"].sort_values(ascending=True))
print(unique_rows["director_id"].sort_values(ascending=True))

754      0
4        1
215      2
84       3
114      4
      ... 
330    995
860    996
772    997
708    998
211    999
Name: movie_id, Length: 1000, dtype: int64
65       0
612      1
734      2
577      3
735      4
      ... 
138    544
334    545
221    545
92     546
366    547
Name: director_id, Length: 1000, dtype: int64


movies table

In [44]:
#movies table
#get column needed to load into database
movies_table=unique_rows[["movie_id","Series_Title","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Gross"]]
movies_table=movies_table.rename(columns={"Series_Title":"movie_title","Released_Year":"release_year","Runtime":"runtime","Genre":"genre","IMDB_Rating":"imdb_score","Meta_score":"meta_score","Gross":"gross"})
movies_table

Unnamed: 0,movie_id,movie_title,release_year,runtime,genre,imdb_score,meta_score,gross
0,876,The Shawshank Redemption,1994,142,Drama,9.3,80.0,28341469
1,787,The Godfather,1972,175,"Crime, Drama",9.2,100.0,134966411
2,767,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84.0,534858444
3,788,The Godfather: Part II,1974,202,"Crime, Drama",9.0,90.0,57300000
4,1,12 Angry Men,1957,96,"Crime, Drama",9.0,96.0,4360000
...,...,...,...,...,...,...,...,...
995,131,Breakfast at Tiffany's,1961,115,"Comedy, Drama, Romance",7.6,76.0,
996,299,Giant,1956,201,"Drama, Western",7.6,84.0,
997,287,From Here to Eternity,1953,118,"Drama, Romance, War",7.6,85.0,30500000
998,482,Lifeboat,1944,97,"Drama, War",7.6,78.0,


directors and directed_by tables

In [42]:
#directors table
#all directors who have directed movies in the IMDB top 1000
directors=unique_rows[["director_id","Director"]]
directors=directors.rename(columns={"Director":"director_name"})
directors=directors.drop_duplicates()
directors.reset_index(drop=True,inplace=True)
directors

Unnamed: 0,director_id,director_name
0,141,Frank Darabont
1,137,Francis Ford Coppola
2,83,Christopher Nolan
3,456,Sidney Lumet
4,383,Peter Jackson
...,...,...
543,312,Martin Rosen
544,531,Wolfgang Reitherman
545,407,Richard Lester
546,50,Blake Edwards


In [15]:
#directed_by table
#all movies 
directored_by=unique_rows[["movie_id","director_id"]]
directored_by

Unnamed: 0,movie_id,director_id
0,876,141
1,787,137
2,767,83
3,788,137
4,1,456
...,...,...
995,131,50
996,299,164
997,287,145
998,482,22


oscar_winners and oscar_categories tables

In [16]:
#Merging IMDB table with Academy Award table so only academy awards to top 1000 movies remain
#Some film titles are in the Film column of Academy Award table, these are merged here

oscar_on_film=pd.merge(academy_award_df2,unique_rows,how='left',left_on="Film",right_on="Series_Title")
oscar_on_film=oscar_on_film.dropna(subset=["IMDB_Rating"])
oscar_on_film=oscar_on_film.drop(columns=["Film","Name"])
oscar_on_film

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
12,Special Award,1.0,The Circus,1928.0,72.0,"Comedy, Romance",8.1,90.0,Charles Chaplin,,74.0,762.0
24,Directing,1.0,All Quiet on the Western Front,1930.0,152.0,"Drama, War",8.0,91.0,Lewis Milestone,3270000,292.0,42.0
60,Actor,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
61,Actress,1.0,It Happened One Night,1934.0,105.0,"Comedy, Romance",8.1,87.0,Frank Capra,4360000,140.0,388.0
122,Special Award,1.0,A Star Is Born,2018.0,136.0,"Drama, Music, Romance",7.6,88.0,Bradley Cooper,215288866,59.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1734,Actor in a Supporting Role,1.0,Whiplash,2014.0,106.0,"Drama, Music",8.5,88.0,Damien Chazelle,13092000,90.0,971.0
1736,Actress in a Supporting Role,1.0,Boyhood,2014.0,165.0,Drama,7.9,100.0,Richard Linklater,25379975,408.0,127.0
1754,Actor in a Leading Role,1.0,The Revenant,2015.0,156.0,"Action, Adventure, Drama",8.0,76.0,Alejandro G. Iñárritu,183637894,14.0,870.0
1755,Actor in a Supporting Role,1.0,Bridge of Spies,2015.0,142.0,"Drama, History, Thriller",7.6,81.0,Steven Spielberg,72313754,470.0,134.0


In [17]:
#some film titles are in the Name column of Academy Award table, these are merged here

oscar_on_name=pd.merge(academy_award_df2,unique_rows,how="left",left_on="Name",right_on="Series_Title")
oscar_on_name=oscar_on_name.dropna(subset=["IMDB_Rating"])
oscar_on_name=oscar_on_name.drop(columns=["Film","Name"])
oscar_on_name

Unnamed: 0,Award,Winner,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Meta_score,Director,Gross,director_id,movie_id
1758,Animated Feature Film,1.0,Inside Out,2015.0,95.0,"Animation, Adventure, Comedy",8.1,94.0,Pete Docter,356461711,380.0,381.0
1759,Costume Design,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1762,Film Editing,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1763,Makeup and Hairstyling,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1764,Music (Original Score),1.0,The Hateful Eight,2015.0,168.0,"Crime, Drama, Mystery",7.8,68.0,Quentin Tarantino,54117416,391.0,799.0
1766,Best Picture,1.0,Spotlight,2015.0,129.0,"Biography, Crime, Drama",8.1,93.0,Tom McCarthy,45055776,502.0,710.0
1767,Production Design,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1770,Sound Editing,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1771,Sound Mixing,1.0,Mad Max: Fury Road,2015.0,120.0,"Action, Adventure, Sci-Fi",8.1,90.0,George Miller,154058340,159.0,501.0
1772,Visual Effects,1.0,Ex Machina,2014.0,108.0,"Drama, Sci-Fi, Thriller",7.7,78.0,Alex Garland,25442958,16.0,264.0


In [26]:
#oscar_winners table
#both dataframes above are merged for a complete list of academy awards given to films in iMBD top 1000
#a movie cannot win in the same category more than once therefore the category and movie_id make a composite key
oscar_winners=pd.merge(oscar_on_film,oscar_on_name,how='outer')
oscar_winners=oscar_winners.drop(columns=["Winner","Released_Year","Runtime","Genre","IMDB_Rating","Meta_score","Director","Gross","director_id","Series_Title"])
#return movie id column back to int
oscar_winners["movie_id"]=oscar_winners["movie_id"].astype(int)
oscar_winners=oscar_winners.rename(columns={"Award":"category"})
oscar_winners

Unnamed: 0,category,movie_id
0,Special Award,762
1,Directing,42
2,Actor,388
3,Actress,388
4,Special Award,23
...,...,...
130,Sound Editing,501
131,Sound Mixing,501
132,Visual Effects,264
133,Writing (Adapted Screenplay),747


In [27]:
# oscar_categories table
# all unique award categories from the academy award dataframe

oscar_categories = pd.DataFrame(academy_award_df2['Award'].unique())

oscar_categories=oscar_categories.rename(columns={0:"category_name"})

oscar_categories

Unnamed: 0,category_name
0,Actor
1,Actress
2,Art Direction
3,Cinematography
4,Directing (Comedy Picture)
...,...
100,Animated Feature Film
101,Writing (Adapted Screenplay)
102,Sound Mixing
103,Makeup and Hairstyling


movie_genres and genres tables

In [28]:
## create movie_genres table
unique_rows_copy = unique_rows.copy()
genre_list = unique_rows_copy['Genre'].str.split(',')
movie_genres = pd.DataFrame({'movie_id': unique_rows_copy['movie_id'], 'Genre': genre_list})
len_list = list(map(len, movie_genres['Genre']))
movie_genres = pd.DataFrame({'movie_id': np.repeat(movie_genres['movie_id'], len_list), 'Genre': np.concatenate(movie_genres['Genre'].values)})
movie_genres=movie_genres.reset_index(drop=True)
#it is impossible for a single movie to have duplicates of genre therefore movie_id and genre columns are a composite key
movie_genres=movie_genres.rename(columns={"Genre":"genre"})
movie_genres

Unnamed: 0,movie_id,genre
0,876,Drama
1,787,Crime
2,787,Drama
3,767,Action
4,767,Crime
...,...,...
2536,482,Drama
2537,482,War
2538,739,Crime
2539,739,Mystery


In [30]:
#genres table
#all genres are unique therefore it is a single column table of primary keys
genres = pd.DataFrame({'genre': movie_genres['genre'].unique()})
genres


Unnamed: 0,genre
0,Drama
1,Crime
2,Drama
3,Action
4,Crime
5,Adventure
6,Biography
7,History
8,Sci-Fi
9,Romance


# Load

### Create database connection

In [22]:
connection_string = "postgres:postgres@localhost:5432/movies_project_db"
engine = create_engine(f'postgresql://{connection_string}')

### Load DataFrames into database

In [23]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

NameError: name 'premise_transformed' is not defined

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)