#### In this notebook, the movie data is reworked and new data about crew, director, writers, and actors is included. Then this data will be prepared as SQL tables that can be used in Tableau and Power BI. 

# Create Project

## Load Libraries and Functions

In [1]:
# import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


## Load Data

In [2]:
# imdb urls for datasets
names_url = "https://datasets.imdbws.com/name.basics.tsv.gz"
crew_url = "https://datasets.imdbws.com/title.crew.tsv.gz"
principals_url = "https://datasets.imdbws.com/title.principals.tsv.gz"


In [3]:
# loading the data
names = pd.read_csv(names_url, sep='\t', low_memory=False)
crew = pd.read_csv(crew_url, sep='\t', low_memory=False)
principals = pd.read_csv(principals_url, sep='\t', low_memory=False)

# viewing the data
display(names.head(), crew.head(), principals.head())


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


## Get previous data 

### AKAs

In [6]:
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

akas_df = pd.read_csv(akas_url, sep = "\t", low_memory = False)
akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [7]:
# get US movies
akas_filter = akas_df["region"] == "US"

akas_df = akas_df[akas_filter]

akas_df["region"].value_counts()

US    1450671
Name: region, dtype: int64

In [8]:
#Removing all \N values
akas_df = akas_df.replace({"\\N":np.nan})

akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


### Ratings

In [9]:
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

ratings_df = pd.read_csv(ratings_url, sep = "\t", low_memory = False)
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
2,tt0000003,6.5,1845
3,tt0000004,5.5,178
4,tt0000005,6.2,2627


In [10]:
#Filtering out non-US ratings
ratings_in_US_filter = ratings_df["tconst"].isin(akas_df["titleId"])

ratings_df = ratings_df[ratings_in_US_filter]
ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [11]:
#Removing all \N values
ratings_df = ratings_df.replace({"\\N":np.nan})

ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1986
1,tt0000002,5.8,265
4,tt0000005,6.2,2627
5,tt0000006,5.1,182
6,tt0000007,5.4,820


In [12]:
#Checking for duplicated values
ratings_df.duplicated().sum()

0

In [4]:
# also need the basics to work this new data
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

basics_df = pd.read_csv(basics_url, sep = "\t", low_memory = False)
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


### Basics

In [13]:
#Filtering out non-US movies
movies_in_US_filter = basics_df["tconst"].isin(akas_df["titleId"])

basics_df = basics_df[movies_in_US_filter]
basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"


In [14]:
#Checking for duplicated values
ratings_df.duplicated().sum()

0

In [18]:
# Removing all \N values
basics_df = basics_df.replace({"\\N": np.nan})

basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45.0,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100.0,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70.0,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90.0,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,,,Drama


In [15]:
#Filtering out non-movies
isMovie = basics_df["titleType"] == "movie"
basics_df = basics_df[isMovie]

basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [19]:
#Changing startYear to be an int
basics_df["startYear"] = basics_df["startYear"].astype(float)

#Filters for movies from 2000-2021 inclusive
isOlderThan2000 = basics_df["startYear"] >= 2000
isYoungerThan2022 = basics_df["startYear"] <= 2021

basics_df = basics_df[isOlderThan2000 & isYoungerThan2022]

#Checking to make sure filters work
basics_df.describe()

Unnamed: 0,startYear,endYear
count,135056.0,0.0
mean,2013.186515,
std,5.608006,
min,2000.0,
25%,2009.0,
50%,2014.0,
75%,2018.0,
max,2021.0,


### Crew

In [21]:
#Filtering out crew that are not in basics
movies_in_basics_filter = crew["tconst"].isin(basics_df["tconst"])

crew = crew[movies_in_basics_filter]
crew.head()

Unnamed: 0,tconst,directors,writers
34803,tt0035423,nm0003506,"nm0737216,nm0003506"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177"
67667,tt0069049,nm0000080,"nm0000080,nm0462648"
86794,tt0088751,"nm0078540,nm0628399",nm0628399
93931,tt0096056,nm0324875,"nm0234502,nm0324875"


In [22]:
# Removing all \N values
crew = crew.replace({"\\N": np.nan})

crew.head()

Unnamed: 0,tconst,directors,writers
34803,tt0035423,nm0003506,"nm0737216,nm0003506"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177"
67667,tt0069049,nm0000080,"nm0000080,nm0462648"
86794,tt0088751,"nm0078540,nm0628399",nm0628399
93931,tt0096056,nm0324875,"nm0234502,nm0324875"


In [24]:
#check for duplicates
crew.duplicated().sum()

0

In [25]:
#Splitting writers and directors into lists
crew["directors_split"] = crew["directors"].str.split(',')
crew["writers_split"] = crew["writers"].str.split(',')
crew.head()

Unnamed: 0,tconst,directors,writers,directors_split,writers_split
34803,tt0035423,nm0003506,"nm0737216,nm0003506",[nm0003506],"[nm0737216, nm0003506]"
61115,tt0062336,"nm0749914,nm0765384","nm0749914,nm1146177","[nm0749914, nm0765384]","[nm0749914, nm1146177]"
67667,tt0069049,nm0000080,"nm0000080,nm0462648",[nm0000080],"[nm0000080, nm0462648]"
86794,tt0088751,"nm0078540,nm0628399",nm0628399,"[nm0078540, nm0628399]",[nm0628399]
93931,tt0096056,nm0324875,"nm0234502,nm0324875",[nm0324875],"[nm0234502, nm0324875]"


In [26]:
#Removing unnecessary directors and writers columns
crew = crew.drop(columns = ["directors", "writers"])
crew.head()

Unnamed: 0,tconst,directors_split,writers_split
34803,tt0035423,[nm0003506],"[nm0737216, nm0003506]"
61115,tt0062336,"[nm0749914, nm0765384]","[nm0749914, nm1146177]"
67667,tt0069049,[nm0000080],"[nm0000080, nm0462648]"
86794,tt0088751,"[nm0078540, nm0628399]",[nm0628399]
93931,tt0096056,[nm0324875],"[nm0234502, nm0324875]"


In [28]:
#Exploding directors and writers
crew = crew.explode("directors_split")
crew = crew.explode("writers_split")
crew.head()

Unnamed: 0,tconst,directors_split,writers_split
34803,tt0035423,nm0003506,nm0737216
34803,tt0035423,nm0003506,nm0003506
61115,tt0062336,nm0749914,nm0749914
61115,tt0062336,nm0749914,nm1146177
61115,tt0062336,nm0765384,nm0749914


In [29]:
#Renaming directors and writers columns
crew = crew .rename(columns = {"directors_split": "director", "writers_split": "writer"})
crew.head()

Unnamed: 0,tconst,director,writer
34803,tt0035423,nm0003506,nm0737216
34803,tt0035423,nm0003506,nm0003506
61115,tt0062336,nm0749914,nm0749914
61115,tt0062336,nm0749914,nm1146177
61115,tt0062336,nm0765384,nm0749914


In [35]:
#create unique writers and directors
unique_director = crew["director"].unique()
unique_director

unique_writer = crew["writer"].unique()
unique_writer

array(['nm0737216', 'nm0003506', 'nm0749914', ..., 'nm5412267',
       'nm6743460', 'nm3471432'], dtype=object)

### Principals

In [32]:
#Filtering out principals that are not in basics
movies_in_basics_filter = principals["tconst"].isin(basics_df["tconst"])

principals = principals[movies_in_basics_filter]
principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
283491,tt0035423,10,nm0107463,editor,\N,\N
283492,tt0035423,1,nm0000212,actress,\N,"[""Kate McKay""]"
283493,tt0035423,2,nm0413168,actor,\N,"[""Leopold""]"
283494,tt0035423,3,nm0000630,actor,\N,"[""Stuart Besser""]"
283495,tt0035423,4,nm0005227,actor,\N,"[""Charlie McKay""]"


In [33]:
# Removing all \N values
principals= principals_df.replace({"\\N": np.nan})

principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
283491,tt0035423,10,nm0107463,editor,,
283492,tt0035423,1,nm0000212,actress,,"[""Kate McKay""]"
283493,tt0035423,2,nm0413168,actor,,"[""Leopold""]"
283494,tt0035423,3,nm0000630,actor,,"[""Stuart Besser""]"
283495,tt0035423,4,nm0005227,actor,,"[""Charlie McKay""]"


In [34]:
#Checking for duplicates
principals_df.duplicated().sum()

0

### Names

In [36]:
#Filtering out crew
names_in_principals_filter = names["nconst"].isin(principals["nconst"])
names_in_directors_filter = names["nconst"].isin(unique_director)
names_in_writers_filter = names["nconst"].isin(unique_writer)

names = names[names_in_principals_filter | 
                    names_in_directors_filter |
                    names_in_writers_filter]
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [38]:
# Removing all \N values
names = names.replace({"\\N": np.nan})

names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987.0,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014.0,"actress,soundtrack","tt0117057,tt0037382,tt0038355,tt0075213"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982.0,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [39]:
#Checking for duplicates
names.duplicated().sum()

0

# Save Data to file

In [46]:
#rename akas and basics
akas=akas_df
basics=basics_df
ratings=ratings_df

In [47]:
# viewing the data
display(names.info(), crew.info(), principals.info(), akas.info(), 
        ratings.info(), basics.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 592641 entries, 0 to 12675796
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   nconst             592641 non-null  object
 1   primaryName        592641 non-null  object
 2   birthYear          101013 non-null  object
 3   deathYear          13356 non-null   object
 4   primaryProfession  549839 non-null  object
 5   knownForTitles     591626 non-null  object
dtypes: object(6)
memory usage: 31.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 301845 entries, 34803 to 9997605
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    301845 non-null  object
 1   director  297604 non-null  object
 2   writer    272743 non-null  object
dtypes: object(3)
memory usage: 9.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1061963 entries, 283491 to 57095674
Data columns (total 6 colum

None

None

None

None

None

None

In [30]:
#Making data folder if one does not already exist
FOLDER = "New_Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

[]

In [48]:
## Save current dataframes to file.
akas.to_csv("New_Data/title_akas.csv.gz",compression='gzip',index=False)
ratings.to_csv("New_Data/title_ratings.csv.gz",compression='gzip',index=False)
basics.to_csv("New_Data/title_basics.csv.gz",compression='gzip',index=False)
crew.to_csv("New_Data/title_crew.csv.gz",compression='gzip',index=False)
principals.to_csv("New_Data/title_principals.csv.gz",compression='gzip',index=False)
names.to_csv("New_Data/title_names.csv.gz",compression='gzip',index=False)

# SQL Prep

In [9]:
######## CODE TO TEST LOGIN CREDENTIALS
import os, json
os.makedirs(folder, exist_ok=True)

with open(MYSQL_LOGIN) as f:
	login = json.load(f)

if (USER_KEY not in login):
    raise Exception(f"[!] The json file did not have a {USER_KEY} key.")
    
if (PASSWORD_KEY not in login):
    raise Exception(f"[!] The json file did not have a {PASSWORD_KEY} key.")

In [20]:
## UPDATE THESE VARIABLES TO MATCH YOUR OWN PC/DATABASE
# MySQL Database to export 
DB_NAME = "new_movie"

# Json file with mysql login credentials
MYSQL_LOGIN = "C:\\Users\\Elizabeth Spreng\\.secret\\mysql.json"
USER_KEY = "user"
PASSWORD_KEY = "password"

## (Optional) - Change folder
folder = "New_Data_Viz/"

In [29]:
# open the json
with open("C:\\Users\\Elizabeth Spreng\\.secret\\mysql.json", 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['user', 'password'])

In [33]:
## Change username and password to match your personal MySQL Server settings
username = "****"
password = "************!"

new_movie = f'mysql+pymysql://{username}:{password}@localhost/movies'
engine = create_engine(new_movie)

## Libraries for SQL

In [55]:
import pandas as pd
import os
import numpy as np

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

import pymysql
pymysql.install_as_MySQLdb()

In [34]:
# Check if the database exists. If not, create it.
if database_exists(new_movie) == False:
  create_database(new_movie)
else:
  print('The database already exists!')

The database already exists!


##  Reload the data

In [37]:
# load the  cleaned data
basics = pd.read_csv('New_Data/title_basics.csv.gz')
aka = pd.read_csv('New_Data/title_akas.csv.gz')
rating = pd.read_csv('New_Data/title_ratings.csv.gz')
crew = pd.read_csv('New_Data/title_crew.csv.gz')
principals = pd.read_csv('New_Data/title_principals.csv.gz')
names = pd.read_csv('New_Data/title_names.csv.gz')

### Basics with Genres

In [38]:
## create a col with a list of genres
basics['genres_split'] = basics['genres'].str.split(',')
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118.0,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70.0,Drama,[Drama]
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122.0,Drama,[Drama]
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100.0,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126.0,Drama,[Drama]
...,...,...,...,...,...,...,...,...,...,...
135051,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97.0,"Comedy,Drama,Fantasy","[Comedy, Drama, Fantasy]"
135052,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51.0,Drama,[Drama]
135053,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95.0,"Action,Adventure,Thriller","[Action, Adventure, Thriller]"
135054,tt9916362,movie,Coven,Akelarre,0,2020.0,,92.0,"Drama,History","[Drama, History]"


#### Genres need to be exploded

In [39]:
#create a column and indentify types
genres_split =basics['genres'].str.split(',')

unique_genres=genres_split.explode().unique()
unique_genres

array(['Comedy', 'Fantasy', 'Romance', 'Drama', 'Horror', 'Sci-Fi',
       'Documentary', 'Biography', 'Mystery', 'Thriller', 'Musical',
       'Action', 'Adventure', 'Crime', nan, 'Music', 'Animation',
       'Family', 'War', 'History', 'Adult', 'Sport', 'Western', 'News',
       'Reality-TV', 'Talk-Show', 'Game-Show'], dtype=object)

In [42]:
exploded_genres = basics.explode('genres_split')
exploded_genres

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118.0,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118.0,"Comedy,Fantasy,Romance",Fantasy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118.0,"Comedy,Fantasy,Romance",Romance
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70.0,Drama,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122.0,Drama,Drama
...,...,...,...,...,...,...,...,...,...,...
135054,tt9916362,movie,Coven,Akelarre,0,2020.0,,92.0,"Drama,History",Drama
135054,tt9916362,movie,Coven,Akelarre,0,2020.0,,92.0,"Drama,History",History
135055,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019.0,,,"Adventure,History,War",Adventure
135055,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019.0,,,"Adventure,History,War",History


In [44]:
genres_split.value_counts()

[Documentary]                     24647
[Drama]                           19248
[Comedy]                           8571
[Horror]                           4744
[Comedy, Drama]                    3994
                                  ...  
[Action, Fantasy, War]                1
[Fantasy, Sci-Fi, Western]            1
[Adventure, Musical, Romance]         1
[Biography, History, Thriller]        1
[Comedy, Sci-Fi, Western]             1
Name: genres, Length: 1075, dtype: int64

In [45]:
#create the genres_split column
title_genres = exploded_genres[['tconst', 'genres_split']].copy() 
title_genres.head(10)

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0069049,Drama
3,tt0088751,Comedy
3,tt0088751,Horror
3,tt0088751,Sci-Fi
4,tt0096056,Drama
5,tt0097304,Documentary


In [46]:
## Making the genre mapper dictionary
genre_id_map = dict(zip(unique_genres, range(len(unique_genres))))
genre_id_map

{'Comedy': 0,
 'Fantasy': 1,
 'Romance': 2,
 'Drama': 3,
 'Horror': 4,
 'Sci-Fi': 5,
 'Documentary': 6,
 'Biography': 7,
 'Mystery': 8,
 'Thriller': 9,
 'Musical': 10,
 'Action': 11,
 'Adventure': 12,
 'Crime': 13,
 nan: 14,
 'Music': 15,
 'Animation': 16,
 'Family': 17,
 'War': 18,
 'History': 19,
 'Adult': 20,
 'Sport': 21,
 'Western': 22,
 'News': 23,
 'Reality-TV': 24,
 'Talk-Show': 25,
 'Game-Show': 26}

In [47]:
# create a new genre_id column using the map and drop string genres
title_genres['genre_id'] = title_genres['genres_split'].map(genre_id_map)

In [48]:
#drop the old column
title_genres = title_genres.drop(columns='genres_split')

In [49]:
#double check
title_genres.head(2)

Unnamed: 0,tconst,genre_id
0,tt0035423,0
0,tt0035423,1


In [53]:
#convert the genre dictionary into a dataframe
genre_lookup=pd.DataFrame({'Genre_name': genre_id_map.keys(),
                          'Genre_id': genre_id_map.values()})

In [56]:
## Calculate max string lengths for object columns
key_len = basics['tconst'].fillna('').map(len).max()
title_len = basics['primaryTitle'].fillna('').map(len).max()
## Create a schema dictonary using Sqlalchemy datatype objects
df_schema = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    'startYear':Float(),
    'runtimeMinutes':Integer()}

NameError: name 'String' is not defined

In [52]:
#double check the dataframe
print(df_schema)

NameError: name 'df_schema' is not defined