In [2]:
import numpy as np
import pandas as pd
import csv
import datetime as dt
from sqlalchemy import create_engine

# Extract

### Extract CSVs into DataFrames

In [3]:
imdb_path = "resources/imdb_top_1000.csv"
academy_award_path = "resources/academy_awards.csv"

imdb_df = pd.read_csv(imdb_path)
academy_award_df = pd.read_csv(academy_award_path)

# Transform

### Transform IMDB dataframe

In [4]:
imdb_df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [5]:
imdb_df_copy = imdb_df.copy()


In [18]:
## create movie and genre table

genre_list = imdb_df_copy['Genre'].str.split(',')
movie_genres = pd.DataFrame({'Series_Title': imdb_df_copy['Series_Title'], 'Genre': genre_list})
len_list = list(map(len, movie_genres['Genre']))
movie_genres = pd.DataFrame({'Series_Title': np.repeat(movie_genres['Series_Title'], len_list), 'Genre': np.concatenate(movie_genres['Genre'].values)})
movie_genres


Unnamed: 0,Series_Title,Genre
0,The Shawshank Redemption,Drama
1,The Godfather,Crime
1,The Godfather,Drama
2,The Dark Knight,Action
2,The Dark Knight,Crime
...,...,...
998,Lifeboat,Drama
998,Lifeboat,War
999,The 39 Steps,Crime
999,The 39 Steps,Mystery


In [19]:
genres = pd.Series(movie_genres['Genre'].unique())
genres

0          Drama
1          Crime
2          Drama
3         Action
4          Crime
5      Adventure
6      Biography
7        History
8         Sci-Fi
9        Romance
10       Western
11       Fantasy
12        Comedy
13      Thriller
14     Adventure
15     Animation
16        Family
17           War
18       Mystery
19         Music
20        Comedy
21        Horror
22     Biography
23        Action
24       Western
25       Mystery
26        Horror
27       Musical
28     Film-Noir
29         Sport
30     Film-Noir
31       Fantasy
32        Family
33      Thriller
dtype: object

In [28]:
# create movies table

movies = imdb_df_copy[['Series_Title', 'Runtime', 'Released_Year', 'IMDB_Rating', 'Meta_score', 'Gross']].copy()

#transform 'Runtime'
movies['Runtime'] = movies['Runtime'].str.replace(' min', '').astype(int)


movies.dtypes

Series_Title      object
Runtime            int64
Released_Year     object
IMDB_Rating      float64
Meta_score       float64
Gross             object
dtype: object

In [26]:
# transform gross

movies['Gross'] = movies['Gross'].str.replace(',', '').fillna(0)


AttributeError: Can only use .str accessor with string values!

In [None]:
# transform released year
movies['Released_Year'] = movies.astype({"Released_Year": "int64"}, errors="raise")

movies['Released_Year'] = movies['Released_Year'].replace('PG','1995').astype(int)

In [13]:
movies_table.astype({"Released_Year": "int64"}, errors="ignore")

movies_table['Released_Year'] = movies_table['Released_Year'].replace('PG','1995').astype(int)

0      1994
1      1972
2      2008
3      1974
4      1957
       ... 
995    1961
996    1956
997    1953
998    1944
999    1935
Name: Released_Year, Length: 1000, dtype: object

In [15]:
movies_table['Released_Year'].to_numeric(errors="coerce")
movies_table.loc[movies_table['Released_Year'] != int]

AttributeError: 'Series' object has no attribute 'to_numeric'

In [None]:
movies_table['Gross'] = movies_table['Gross'].str.replace(',', '').fillna(0).astype(int)
movies_table['Released_Year'] = movies_table['Released_Year'].replace('PG','1995').astype(int)
movies_table.dtypes


### Transform Academy Award dataframe

In [6]:
academy_award_df.head()

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
0,1927/1928,1,Actor,,Richard Barthelmess,The Noose
1,1927/1928,1,Actor,1.0,Emil Jannings,The Last Command
2,1927/1928,1,Actress,,Louise Dresser,A Ship Comes In
3,1927/1928,1,Actress,1.0,Janet Gaynor,7th Heaven
4,1927/1928,1,Actress,,Gloria Swanson,Sadie Thompson


# Load

### Create database connection

In [None]:
connection_string = "postgres:postgres@localhost:5432/movies_project_db"
engine = create_engine(f'postgresql://{connection_string}')

### Load DataFrames into database

In [None]:
premise_transformed.to_sql(name='premise', con=engine, if_exists='append', index=True)

In [None]:
county_transformed.to_sql(name='county', con=engine, if_exists='append', index=True)