In [1]:
import pandas as pd
from functions import check_nan
# Turn off pandas' display truncation: None removes the cap on shown columns and rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)


In [2]:
# Load the three cleaned CSV tables; paths are relative to the notebook's folder.
cleaned_paths = (
    "../data/cleaned/actors_clean.csv",
    "../data/cleaned/films_clean.csv",
    "../data/cleaned/old_HDD_clean.csv",
)
actors, films, old_HDD = map(pd.read_csv, cleaned_paths)


I need to build the junction table that links actors and films in the MySQL database. To do that I will use the actors, films and old_HDD data. old_HDD is the only way to join them: actors and old_HDD share first_name and last_name, while films and old_HDD share title. The resulting table, actors_films, needs to contain actor_id (from actors) and film_id (from films).

In [3]:
# Keep only the join keys plus the id column each lookup table contributes.
actors_subset = actors.loc[:, ["first_name", "last_name", "actor_id"]]
films_subset = films.loc[:, ["title", "film_id"]]

# old_HDD is the bridge: it is matched to actors on (first_name, last_name) and
# to films on title. Inner joins keep only the rows that find a match.
# NOTE(review): joining on names assumes a matched name pair maps to a single
# actor_id; two actors sharing a name would duplicate rows — confirm.
actorss_filmss = (
    old_HDD
    .merge(actors_subset, how="inner", on=["first_name", "last_name"])
    .merge(films_subset, how="inner", on="title")
)




In [4]:
# Preview the first five merged rows to check that actor_id and film_id are present.
actorss_filmss.head(n=5)

Unnamed: 0,first_name,last_name,title,category_id,actor_id,film_id
0,PENELOPE,GUINESS,ACADEMY DINOSAUR,6,1,1
1,CHRISTIAN,GABLE,ACADEMY DINOSAUR,6,10,1
2,LUCILLE,TRACY,ACADEMY DINOSAUR,6,20,1
3,SANDRA,PECK,ACADEMY DINOSAUR,6,30,1
4,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2,1,23


In [5]:
# Reduce the merged frame to the two id columns that make up the actor–film link table.
selected_columns = ["actor_id", "film_id"]

# drop_duplicates() keeps each (actor_id, film_id) pair once. Repeated rows in the
# merged data would otherwise become duplicate links (and would break a composite
# primary key, if the target table defines one). Rows that are already unique are
# left untouched.
actors_films = actorss_filmss[selected_columns].drop_duplicates()
actors_films.head()

Unnamed: 0,actor_id,film_id
0,1,1
1,10,1
2,20,1
3,30,1
4,1,23


I merged the three and selected only the columns that I was interested in. 

In [6]:
# Rename the id columns; presumably these are the foreign-key names used in the MySQL schema.
fk_column_names = {"actor_id": "actor_actor_id", "film_id": "film_film_id"}
actors_films = actors_films.rename(columns=fk_column_names)

In [7]:
# Print the column dtypes, non-null counts and memory usage of the link table.
actors_films.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   actor_actor_id  1000 non-null   int64
 1   film_film_id    1000 non-null   int64
dtypes: int64(2)
memory usage: 23.4 KB


In [8]:
# Shrink every integer column to the smallest integer dtype that can hold its values.
integer_columns = actors_films.select_dtypes(include="integer").columns
for column in integer_columns:
    actors_films[column] = pd.to_numeric(actors_films[column], downcast="integer")

Downcast the integer columns.

In [9]:
# Write the link table to disk; index=False keeps the DataFrame index out of the file.
actors_films_csv = "../data/cleaned/actors_films_clean.csv"
actors_films.to_csv(actors_films_csv, index=False)


Saved new file

In [10]:
# Attach old_HDD's columns (including category_id) to each film with a matching title.
# A film is repeated once for every old_HDD row that shares its title.
films_withcategory = pd.merge(films, old_HDD, how="inner", on="title")
films_withcategory.head(n=5)


Unnamed: 0,film_id,title,description,rental_duration_days,rental_rate,length,replacement_cost,rating,special_features,first_name,last_name,category_id
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",PENELOPE,GUINESS,6
1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",CHRISTIAN,GABLE,6
2,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",LUCILLE,TRACY,6
3,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",SANDRA,PECK,6
4,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",BOB,FAWCETT,11


I also need to add category_id to films. I will again use old_HDD for this, merging on title as in the cell above.

In [11]:
# Keep the film attributes plus category_id; the remaining merged columns are dropped.
film_columns = [
    "film_id",
    "title",
    "description",
    "rental_duration_days",
    "rental_rate",
    "length",
    "replacement_cost",
    "rating",
    "special_features",
    "category_id",
]
films_withcategory = films_withcategory[film_columns]

Selecting only the columns I am interested in. 

In [12]:
# True when at least one row repeats an earlier row across all columns.
films_withcategory.duplicated().any()

True

In [13]:
# Remove rows that are identical in every column, keeping the first occurrence of each.
films_withcategory = films_withcategory.drop_duplicates(subset=None, keep="first")

In [14]:
# (rows, columns) of the frame at this point.
films_withcategory.shape

(614, 10)

Dropped duplicates.

In [15]:
# Rename category_id; presumably this is the foreign-key column name used in the MySQL schema.
category_fk_name = {"category_id": "category_category_id"}
films_withcategory = films_withcategory.rename(columns=category_fk_name)

In [16]:
# Print the column dtypes, non-null counts and memory usage of the films table.
films_withcategory.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   film_id               614 non-null    int64  
 1   title                 614 non-null    object 
 2   description           614 non-null    object 
 3   rental_duration_days  614 non-null    int64  
 4   rental_rate           614 non-null    float64
 5   length                614 non-null    int64  
 6   replacement_cost      614 non-null    float64
 7   rating                614 non-null    object 
 8   special_features      614 non-null    object 
 9   category_category_id  614 non-null    int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 52.8+ KB


In [17]:
# Shrink every integer column to the smallest integer dtype that can hold its values.
integer_columns = films_withcategory.select_dtypes(include="integer").columns
for column in integer_columns:
    films_withcategory[column] = pd.to_numeric(films_withcategory[column], downcast="integer")

In [18]:
# Shrink every float column to the smallest float dtype pandas will downcast to.
float_columns = films_withcategory.select_dtypes(include="float").columns
for column in float_columns:
    films_withcategory[column] = pd.to_numeric(films_withcategory[column], downcast="float")

In [19]:
# Store rating as a pandas categorical instead of generic object strings.
films_withcategory = films_withcategory.astype({"rating": "category"})

In [20]:
# Print the dtypes and memory usage again to see the effect of the dtype changes.
films_withcategory.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   film_id               614 non-null    int16   
 1   title                 614 non-null    object  
 2   description           614 non-null    object  
 3   rental_duration_days  614 non-null    int8    
 4   rental_rate           614 non-null    float32 
 5   length                614 non-null    int16   
 6   replacement_cost      614 non-null    float32 
 7   rating                614 non-null    category
 8   special_features      614 non-null    object  
 9   category_category_id  614 non-null    int8    
dtypes: category(1), float32(2), int16(2), int8(2), object(3)
memory usage: 28.4+ KB


Downcast the integer and float columns, and converted rating to the category dtype.

In [21]:
# Write the films table with its category column to disk; index=False keeps the index out of the file.
films_category_csv = "../data/cleaned/films_category_clean.csv"
films_withcategory.to_csv(films_category_csv, index=False)


Saved the file