In [1]:
import csv
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect


In [2]:
# Read the cleaned, merged Netflix/SAG CSV. The path is relative to the
# notebook's working directory.
data = os.path.join('Resources', 'cleaned_merged', 'cleaned_netflix_sag_data.csv')
data1 = pd.read_csv(data)

# 'Unnamed: 0' is discarded before any table is derived
# (presumably a leftover index column from an earlier to_csv -- TODO confirm).
data_df = data1.drop('Unnamed: 0', axis=1)
data_df

Unnamed: 0,id,show_id,type,director,full_name,title,release_year,description,category,won
0,279,80201906,Movie,Ryan Coogler,Chadwick Boseman,Black Panther,2018,"T'Challa, the superpowered new leader of the h...",CAST IN A MOTION PICTURE,True
1,280,80201906,Movie,Ryan Coogler,Michael B. Jordan,Black Panther,2018,"T'Challa, the superpowered new leader of the h...",CAST IN A MOTION PICTURE,True
2,282,80201906,Movie,Ryan Coogler,Danai Gurira,Black Panther,2018,"T'Challa, the superpowered new leader of the h...",CAST IN A MOTION PICTURE,True
3,283,80201906,Movie,Ryan Coogler,Martin Freeman,Black Panther,2018,"T'Challa, the superpowered new leader of the h...",CAST IN A MOTION PICTURE,True
4,284,80201906,Movie,Ryan Coogler,Daniel Kaluuya,Black Panther,2018,"T'Challa, the superpowered new leader of the h...",CAST IN A MOTION PICTURE,True
...,...,...,...,...,...,...,...,...,...,...
274,28777,607931,Movie,Jocelyn Moorhouse,Kate Nelligan,How To Make An American Quilt,1995,A conflicted young woman spends the summer wit...,CAST IN A MOTION PICTURE,False
275,28778,607931,Movie,Jocelyn Moorhouse,Alfre Woodard,How To Make An American Quilt,1995,A conflicted young woman spends the summer wit...,CAST IN A MOTION PICTURE,False
276,28784,607931,Movie,Jocelyn Moorhouse,Maya Angelou,How To Make An American Quilt,1995,A conflicted young woman spends the summer wit...,CAST IN A MOTION PICTURE,False
277,28788,607931,Movie,Jocelyn Moorhouse,Jean Simmons,How To Make An American Quilt,1995,A conflicted young woman spends the summer wit...,CAST IN A MOTION PICTURE,False


In [3]:
# Split the flat `data_df` frame into the frames that are later written to the
# database: entity tables (actors, titles, directors, awards) and junction
# tables (act_ttl, dir_ttl, act_award, dir_award).
#
# DataFrame.drop_duplicates() never looks at the index. Every de-duplication
# below therefore happens while the key is still a regular column; calling it
# after set_index() would compare only the remaining columns (for act_ttl that
# collapsed the table to a single row per show_id).

# --- entity tables keyed by an id that already exists in the data -----------
# One row per id, so every id used by act_ttl / act_award is present here.
actors = (
    data_df[['id', 'full_name']]
    .drop_duplicates(subset='id')
    .set_index('id')
)

titles = (
    data_df[['show_id', 'title', 'type', 'release_year', 'description']]
    .drop_duplicates(subset='show_id')
    .set_index('show_id')
)

# --- actor <-> title junction: one row per distinct (id, show_id) pair ------
act_ttl = (
    data_df[['id', 'show_id']]
    .drop_duplicates()
    .set_index('id')
)

# --- directors: surrogate key assigned in order of first appearance ---------
# The original row labels are kept on purpose: dir_award below aligns on them.
directors = (
    data_df[['director']]
    .rename(columns={'director': 'directors_names'})
    .drop_duplicates()
    .copy()
)
directors['director_id'] = range(len(directors))

# --- director <-> title junction --------------------------------------------
# Look the id up by director name instead of relying on row-label alignment.
# Alignment left NaN for every title whose director had already appeared on an
# earlier title; those rows were then dropped and the follow-up list
# assignment failed with a length mismatch.
dir_ttl = (
    data_df[['show_id', 'director']]
    .drop_duplicates()
    .merge(directors, left_on='director', right_on='directors_names', how='left')
    .loc[:, ['show_id', 'director_id']]
    .astype({'director_id': 'int64'})
    .set_index('show_id')
)

# --- awards: one award record per source row --------------------------------
awards = data_df[['category', 'won']].copy()
awards['award_id'] = range(len(awards))

# Same row labels as data_df, so each row's id lines up with its own award_id.
act_award = (
    data_df[['id']]
    .assign(award_id=awards['award_id'])
    .set_index('id')
)

# Aligns on row labels: each director receives the award_id of the source row
# in which that director first appears.
# NOTE(review): this links a director to a single award row only -- confirm
# that is the intended relationship.
dir_award = (
    directors[['director_id']]
    .assign(award_id=awards['award_id'])
    .set_index('director_id')
)

# Promote the surrogate keys to the index last; the steps above need them as
# ordinary columns.
awards = awards.set_index('award_id')
directors = directors.set_index('director_id')

In [4]:
# Collapse the differently-worded award categories onto one canonical label
# each. Replacement matches whole cell values exactly, which is why variants
# with a leading space are listed separately from those without.
category_aliases = {
    'MALE SUPPORTING ROLE': [
        'MALE SUPPORT',
        ' MALE ACTOR IN A SUPPORTING ROLE',
    ],
    'FEMALE SUPPORTING ROLE': [
        'FEMALE SUPPORT',
        ' FEMALE ACTOR IN A SUPPORTING ROLE',
    ],
    'FEMALE LEAD ROLE': [
        'FEMALE LEAD',
        'FEMALE ACTOR IN A LEADING ROLE',
        ' FEMALE ACTOR IN A LEADING ROLE',
        'FEMALE LEAD IN A MOTION PICTURE',
    ],
    'MALE LEAD ROLE': [
        'MALE LEAD',
        ' MALE ACTOR IN A LEADING ROLE',
        'MALE ACTOR IN A LEADING ROLE',
    ],
}

# Invert to the {alias: canonical} shape that Series.replace expects.
canonical_by_alias = {
    alias: canonical
    for canonical, aliases in category_aliases.items()
    for alias in aliases
}

awards['category'] = awards['category'].replace(canonical_by_alias)

In [5]:
# Tally how many award rows fall under each category label.
awards.loc[:, 'category'].value_counts()

CAST IN A MOTION PICTURE       193
MALE SUPPORTING ROLE            28
FEMALE SUPPORTING ROLE          20
FEMALE LEAD ROLE                19
MALE LEAD ROLE                  18
MALE TV MOVIE OR MINISERIES      1
Name: category, dtype: int64

In [6]:
# Connection details in "user:password@host:port/database" form.
# Read from the NETFLIX_SAG_DB_CONN environment variable so real credentials
# never have to be committed with the notebook; the fallback is the local
# development database this notebook was originally written against.
conn = os.environ.get(
    'NETFLIX_SAG_DB_CONN',
    "postgres:postgres@localhost:5432/netflix_sag_db",
)
engine = create_engine(f'postgresql://{conn}')

In [7]:
# List the tables that already exist in the target database.
# Engine.table_names() is deprecated as of SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement.
inspect(engine).get_table_names()

['actors',
 'titles',
 'act_ttl',
 'directors',
 'dir_ttl',
 'awards',
 'act_award',
 'dir_award']

In [8]:
# Append every frame to its database table, in the same order the individual
# cells used (entity tables ahead of the junction tables that presumably
# reference them -- confirm against the schema).
tables_in_load_order = [
    ('actors', actors),
    ('titles', titles),
    ('act_ttl', act_ttl),
    ('directors', directors),
    ('dir_ttl', dir_ttl),
    ('awards', awards),
    ('act_award', act_award),
    ('dir_award', dir_award),
]

# A single transaction: if any insert fails (for example a key violation when
# the notebook is re-run against an already-populated database) everything is
# rolled back instead of leaving the database half-loaded.
with engine.begin() as connection:
    for table_name, frame in tables_in_load_order:
        frame.to_sql(name=table_name, con=connection, if_exists='append', index=True)