In [1]:
#import dependencies
import json 
import pandas as pd
import numpy as np
import re
from sqlalchemy import create_engine
#from config import db_password
import time

### Links table cleaning

In [2]:
#load file
raw_links_df = pd.read_csv("../raw_data_tables/raw_links.csv")
print(f'number of records: {len(raw_links_df)}')
print(raw_links_df.head())

number of records: 62423
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [3]:
#check for null values 
[[column,raw_links_df[column].isnull().sum()] for column in raw_links_df.columns]

[['movieId', 0], ['imdbId', 0], ['tmdbId', 107]]

In [4]:
#null tmbdID's are acceptable in the data

In [5]:
#check data types
raw_links_df.dtypes

movieId      int64
imdbId       int64
tmdbId     float64
dtype: object

In [6]:
#convert tmdbId to int type and fill NaN's with 0
raw_links_df['tmdbId'] = raw_links_df['tmdbId'].fillna(0).astype(int)
raw_links_df.dtypes

movieId    int64
imdbId     int64
tmdbId     int32
dtype: object

In [7]:
#create clean df and confirm 
clean_links_df = raw_links_df
print(f'number of records: {len(clean_links_df)}')
print(clean_links_df.head())
print(clean_links_df.dtypes)

number of records: 62423
   movieId  imdbId  tmdbId
0        1  114709     862
1        2  113497    8844
2        3  113228   15602
3        4  114885   31357
4        5  113041   11862
movieId    int64
imdbId     int64
tmdbId     int32
dtype: object


### Tags table cleaning

In [8]:
#load file
raw_tags_df = pd.read_csv("../raw_data_tables/raw_tags.csv")
print(f'number of records: {len(raw_tags_df)}')
print(raw_tags_df.head())

number of records: 1093360
   userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472256
2       4     1732       dark comedy  1573943598
3       4     1732    great dialogue  1573943604
4       4     7569  so bad it's good  1573943455


In [9]:
#check for null values 
[[column,raw_tags_df[column].isnull().sum()] for column in raw_tags_df.columns]

[['userId', 0], ['movieId', 0], ['tag', 16], ['timestamp', 0]]

In [10]:
#null tags are acceptable in the data

In [11]:
#convert Unix timestap to standard format
raw_tags_df['timestamp'] = pd.to_datetime(raw_tags_df['timestamp'], unit='s')
raw_tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,2015-08-13 13:25:55
1,3,260,sci-fi,2015-08-13 13:24:16
2,4,1732,dark comedy,2019-11-16 22:33:18
3,4,1732,great dialogue,2019-11-16 22:33:24
4,4,7569,so bad it's good,2019-11-16 22:30:55


In [12]:
#check data types
raw_tags_df.dtypes

userId                int64
movieId               int64
tag                  object
timestamp    datetime64[ns]
dtype: object

In [13]:
#create clean df and confirm 
clean_tags_df = raw_tags_df
print(f'number of records: {len(clean_tags_df)}')
print(clean_tags_df.head())
print(clean_tags_df.dtypes)

number of records: 1093360
   userId  movieId               tag           timestamp
0       3      260           classic 2015-08-13 13:25:55
1       3      260            sci-fi 2015-08-13 13:24:16
2       4     1732       dark comedy 2019-11-16 22:33:18
3       4     1732    great dialogue 2019-11-16 22:33:24
4       4     7569  so bad it's good 2019-11-16 22:30:55
userId                int64
movieId               int64
tag                  object
timestamp    datetime64[ns]
dtype: object


### Movies table cleaning

In [14]:
#load file
raw_movies_df = pd.read_csv("../raw_data_tables/raw_movies.csv")
print(f'number of records: {len(raw_movies_df)}')
print(raw_movies_df.head())

number of records: 62423
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [15]:
#check for null values 
[[column,raw_movies_df[column].isnull().sum()] for column in raw_movies_df.columns]

[['movieId', 0], ['title', 0], ['genres', 0]]

In [16]:
#regex to split year from title into separate column and format stings in columns
raw_movies_df['year'] = raw_movies_df['title'].str.extract(r'(\(\d{4}\))')
raw_movies_df['title'] = raw_movies_df['title'].str.replace(r'(\(\d{4}\))',"")
raw_movies_df['year'] = raw_movies_df['year'].str.replace('(',"")
raw_movies_df['year'] = raw_movies_df['year'].str.replace(')',"")
raw_movies_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [17]:
#recheck for null values 
[[column,raw_movies_df[column].isnull().sum()] for column in raw_movies_df.columns]

[['movieId', 0], ['title', 0], ['genres', 0], ['year', 410]]

In [18]:
#check data types
raw_movies_df.dtypes

movieId     int64
title      object
genres     object
year       object
dtype: object

In [19]:
#convert year to int type and fill NaN's with 0
raw_movies_df['year'] = raw_movies_df['year'].fillna(0).astype(int)

In [20]:
#check data types
raw_movies_df.dtypes

movieId     int64
title      object
genres     object
year        int32
dtype: object

In [21]:
#create clean df and confirm 
clean_movies_df = raw_movies_df
print(f'number of records: {len(clean_movies_df)}')
print(clean_movies_df.head())
print(clean_movies_df.dtypes)

number of records: 62423
   movieId                         title  \
0        1                    Toy Story    
1        2                      Jumanji    
2        3             Grumpier Old Men    
3        4            Waiting to Exhale    
4        5  Father of the Bride Part II    

                                        genres  year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995  
1                   Adventure|Children|Fantasy  1995  
2                               Comedy|Romance  1995  
3                         Comedy|Drama|Romance  1995  
4                                       Comedy  1995  
movieId     int64
title      object
genres     object
year        int32
dtype: object


### Ratings table cleaning

In [22]:
#load file
raw_ratings_df = pd.read_csv("../raw_data_tables/raw_ratings.csv")
print(f'number of records: {len(raw_ratings_df)}')
print(raw_ratings_df.head())

number of records: 25000095
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


In [23]:
#check for null values 
[[column,raw_ratings_df[column].isnull().sum()] for column in raw_ratings_df.columns]

[['userId', 0], ['movieId', 0], ['rating', 0], ['timestamp', 0]]

In [24]:
#convert Unix timestap to standard format
raw_ratings_df['timestamp'] = pd.to_datetime(raw_ratings_df['timestamp'], unit='s')
raw_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 15:34:04
1,1,306,3.5,2006-05-17 12:26:57
2,1,307,5.0,2006-05-17 12:27:08
3,1,665,5.0,2006-05-17 15:13:40
4,1,899,3.5,2006-05-17 12:21:50


In [25]:
#check data types
raw_ratings_df.dtypes

userId                int64
movieId               int64
rating              float64
timestamp    datetime64[ns]
dtype: object

In [26]:
#create clean df and confirm 
clean_ratings_df = raw_ratings_df
print(f'number of records: {len(clean_ratings_df)}')
print(clean_ratings_df.head())
print(clean_ratings_df.dtypes)

number of records: 25000095
   userId  movieId  rating           timestamp
0       1      296     5.0 2006-05-17 15:34:04
1       1      306     3.5 2006-05-17 12:26:57
2       1      307     5.0 2006-05-17 12:27:08
3       1      665     5.0 2006-05-17 15:13:40
4       1      899     3.5 2006-05-17 12:21:50
userId                int64
movieId               int64
rating              float64
timestamp    datetime64[ns]
dtype: object


### Genome Scores table cleaning

In [27]:
#load file
raw_genome_scores_df = pd.read_csv("../raw_data_tables/raw_genome_scores.csv")
print(f'number of records: {len(raw_genome_scores_df)}')
print(raw_genome_scores_df.head())

number of records: 15584448
   movieId  tagId  relevance
0        1      1    0.02875
1        1      2    0.02375
2        1      3    0.06250
3        1      4    0.07575
4        1      5    0.14075


In [28]:
#check for null values 
[[column,raw_genome_scores_df[column].isnull().sum()] for column in raw_genome_scores_df.columns]

[['movieId', 0], ['tagId', 0], ['relevance', 0]]

In [29]:
#check data types
raw_genome_scores_df.dtypes

movieId        int64
tagId          int64
relevance    float64
dtype: object

In [30]:
#create clean df and confirm 
clean_genome_scores_df = raw_genome_scores_df
print(f'number of records: {len(clean_genome_scores_df)}')
print(clean_genome_scores_df.head())
print(clean_genome_scores_df.dtypes)

number of records: 15584448
   movieId  tagId  relevance
0        1      1    0.02875
1        1      2    0.02375
2        1      3    0.06250
3        1      4    0.07575
4        1      5    0.14075
movieId        int64
tagId          int64
relevance    float64
dtype: object


### Genome Tags table cleaning

In [31]:
#load file
raw_genome_tags_df = pd.read_csv("../raw_data_tables/raw_genome_tags.csv")
print(f'number of records: {len(raw_genome_tags_df)}')
print(raw_genome_tags_df.head())

number of records: 1128
   tagId           tag
0      1           007
1      2  007 (series)
2      3  18th century
3      4         1920s
4      5         1930s


In [32]:
#check for null values 
[[column,raw_genome_tags_df[column].isnull().sum()] for column in raw_genome_tags_df.columns]

[['tagId', 0], ['tag', 0]]

In [33]:
#check data types
raw_genome_tags_df.dtypes

tagId     int64
tag      object
dtype: object

In [34]:
#create clean df and confirm 
clean_genome_tags_df = raw_genome_tags_df
print(f'number of records: {len(clean_genome_tags_df)}')
print(clean_genome_tags_df.head())
print(clean_genome_tags_df.dtypes)

number of records: 1128
   tagId           tag
0      1           007
1      2  007 (series)
2      3  18th century
3      4         1920s
4      5         1930s
tagId     int64
tag      object
dtype: object


### Load cleaned tables to database