# Perform data quality checks

In [1]:
# Import libraries
import configparser
import os
import pandas as pd
import psycopg2

## Connect to DB

In [2]:
# Read the Redshift credentials from the [DWH] section of dwh.cfg.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail early with a clear message instead of hitting
# a less obvious NoSectionError on the first config.get() below.
if not config.read('dwh.cfg'):
    raise FileNotFoundError("Could not read config file 'dwh.cfg'")

host = config.get("DWH", "dwh_endpoint")
dbname = config.get("DWH", "dwh_db")
user = config.get("DWH", "dwh_db_user")
password = config.get("DWH", "dwh_db_password")
port = config.get("DWH", "dwh_port")

# Connect to Redshift.
# The parameters are passed as keyword arguments instead of being formatted into a
# "key=value ..." connection string: psycopg2 then escapes each value itself, so a
# password containing spaces, quotes or backslashes no longer corrupts the string.
conn = psycopg2.connect(
    host=host,
    dbname=dbname,
    user=user,
    password=password,
    port=port,
)
cur = conn.cursor()

## Data quality checks

## Row count quality check

In [10]:
# Every table that must contain at least one row.
table_names = [
    'IMDB_TITLE_AKAS',
    'IMDB_TITLE_PRINCIPALS',
    'IMDB_TITLE_BASICS',
    'IMDB_TITLE_CREW',
    'IMDB_TITLE_EPISODE',
    'IMDB_TITLE_RATINGS',
    'IMDB_NAME_BASICS',
    'KAGGLE_MOVIES_METADATA',
    'KAGGLE_KEYWORDS',
    'KAGGLE_CREDITS',
    'KAGGLE_LINKS',
    'KAGGLE_RATINGS',
]

# Largest table seen so far (name and row count).
max_table = ""
max_rows = 0

for table_name in table_names:
    # Count the rows of the current table.
    cur.execute(f"SELECT COUNT(*) FROM {table_name};")
    conn.commit()
    row_count = cur.fetchone()[0]

    # An empty table fails the quality check outright.
    if row_count <= 0:
        raise ValueError(f"ERROR: table {table_name} has 0 rows.")

    print(table_name, row_count)

    # Strict comparison: on a tie the table seen first is kept.
    if row_count > max_rows:
        max_table, max_rows = table_name, row_count

# Report the largest table.
print(max_table, max_rows)

IMDB_TITLE_AKAS 24978357
IMDB_TITLE_PRINCIPALS 42750666
IMDB_TITLE_BASICS 7554298
IMDB_TITLE_CREW 7554298
IMDB_TITLE_EPISODE 5471901
IMDB_TITLE_RATINGS 1116071
IMDB_NAME_BASICS 10671837
KAGGLE_MOVIES_METADATA 45363
KAGGLE_KEYWORDS 46419
KAGGLE_CREDITS 30457
KAGGLE_LINKS 45843
KAGGLE_RATINGS 26024289
IMDB_TITLE_PRINCIPALS 42750666


## Check that the majority of IMDB title IDs in TMDB exist within the IMDB dataset.

Set the error threshold to 0.2 % (i.e. a proportion of 0.002) of the TMDB title IDs.

In [11]:
# Scratch calculation of a proportion: 76 out of 45843, where 45843 is the
# KAGGLE_LINKS row count printed by the row-count check. Presumably 76 is the
# number of KAGGLE_LINKS IMDB IDs missing from IMDB_TITLE_PRINCIPALS -- the result
# equals the proportion printed by the check in the next cell (TODO confirm).
76 / 45843

0.0016578321663067426

In [14]:
# Maximum tolerated proportion of KAGGLE_LINKS rows whose IMDB_ID is not found in
# IMDB_TITLE_PRINCIPALS (0.002 == 0.2 %).
ERROR_THRESHOLD = 0.002

# Count the KAGGLE_LINKS rows whose IMDB_ID has no matching TITLE_ID.
# NOTE(review): with NOT IN, a single NULL TITLE_ID in the subquery makes the
# predicate match nothing, and rows with a NULL IMDB_ID are never counted --
# confirm both columns are NULL-free, otherwise this check passes vacuously.
query = "SELECT COUNT(*) FROM KAGGLE_LINKS WHERE KAGGLE_LINKS.IMDB_ID NOT IN (SELECT IMDB_TITLE_PRINCIPALS.TITLE_ID FROM IMDB_TITLE_PRINCIPALS );"
cur.execute(query)
conn.commit()
row = cur.fetchone()
non_existing_imdb_ids = row[0]

# Total number of KAGGLE_LINKS rows: the denominator of the proportion.
query = "SELECT COUNT(*) FROM KAGGLE_LINKS"
cur.execute(query)
conn.commit()
row = cur.fetchone()
kaggle_imdb_counts = row[0]

# Fail with a clear message instead of a ZeroDivisionError if the table is empty.
if kaggle_imdb_counts == 0:
    raise ValueError('ERROR: table KAGGLE_LINKS has 0 rows.')

proportion_non_existing = non_existing_imdb_ids / kaggle_imdb_counts
print(proportion_non_existing)

# Compare the proportion with the threshold itself. The previous condition
# compared it with 0.002 * kaggle_imdb_counts (a row count), which a proportion
# <= 1 can never reach once the table has 500+ rows, so the check never fired.
if proportion_non_existing >= ERROR_THRESHOLD:
    print("Error threshold exceeded, pls check data quality")

0.0016578321663067426


In [15]:
# Disconnect from Redshift: close the psycopg2 connection `conn` created in the
# "Connect to DB" cell. After this, neither `conn` nor its cursor `cur` can be
# used for further queries; re-run the connection cell to reconnect.
conn.close()