# Data Used in this course

Much of the data used in this course is downloaded using the code in this notebook.


In [15]:
import requests
import pandas as pd
import numpy as np

## IMDB Ratings

In [16]:
# Fetch data from IMDB

# Basics contains things like the name and year of the film/tv show
r = requests.get('https://datasets.imdbws.com/title.basics.tsv.gz')
with open('../data/raw/title.basics.tsv.gz', 'wb') as f:
    f.write(r.content)

    
# Ratings contains average ratingss and number of reviews
r = requests.get('https://datasets.imdbws.com/title.ratings.tsv.gz')
with open('../data/raw/title.ratings.tsv.gz', 'wb') as f:
    f.write(r.content)
    

# AKAS contains info about what regions titles were released in
r = requests.get('https://datasets.imdbws.com/title.akas.tsv.gz')
with open('../data/raw/title.akas.tsv.gz', 'wb') as f:
    f.write(r.content)

# Episode contains data about tv episodes, e.g. the parent show.
r = requests.get('https://datasets.imdbws.com/title.episode.tsv.gz')
with open('../data/raw/title.episode.tsv.gz', 'wb') as f:
    f.write(r.content)


In [17]:
!gunzip -f '../data/raw/title.basics.tsv.gz'
!gunzip -f '../data/raw/title.ratings.tsv.gz'
!gunzip -f '../data/raw/title.akas.tsv.gz'
!gunzip -f '../data/raw/title.episode.tsv.gz'

In [18]:
# These are fairly large datasets. So lets create a small one thats easier to use for some training.
basics = pd.read_table('../data/raw/title.basics.tsv', index_col='tconst')
ratings = pd.read_table('../data/raw/title.ratings.tsv', index_col='tconst')
akas = pd.read_table('../data/raw/title.akas.tsv', index_col=['titleId','ordering'])
episodes = pd.read_table('../data/raw/title.episode.tsv', index_col='tconst')

# Add parent title to episodes
episodes['parentTitle'] = episodes['parentTconst'].map(basics['primaryTitle'])

print('basics:', basics.shape)
print('ratings:', ratings.shape)
print('akas:', akas.shape)
print('episodes:', episodes.shape)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


basics: (7067813, 8)
ratings: (1063478, 2)
akas: (22699783, 6)
episodes: (5067022, 4)


In [19]:
# Shrink the data down to just data for the GB region.

# Create a new index from just the GB region records listed in the akas dataframe 
gb_index = akas.loc[akas['region'] == 'GB'].index
# This data frame has multiple entries per title per region, so we need to take the index level for just the id.
gb_index_titleid = gb_index.levels[0].drop_duplicates()
gb_index_titleid

Index(['tt0000001', 'tt0000002', 'tt0000003', 'tt0000004', 'tt0000005',
       'tt0000006', 'tt0000007', 'tt0000008', 'tt0000009', 'tt0000010',
       ...
       'tt9916834', 'tt9916836', 'tt9916838', 'tt9916842', 'tt9916844',
       'tt9916846', 'tt9916848', 'tt9916850', 'tt9916852', 'tt9916856'],
      dtype='object', name='titleId', length=4891629)

In [20]:

# Reindexing the other dataframes with this index
akas = akas.reindex(gb_index)
basics = basics.reindex(gb_index_titleid).dropna(how='all')
ratings = ratings.reindex(gb_index_titleid).dropna(how='all')
episodes = episodes.reindex(gb_index_titleid).dropna(how='all')

print('basics:', basics.shape)
print('ratings:', ratings.shape)
print('akas:', akas.shape)
print('episodes:', episodes.shape)

basics: (4884336, 8)
ratings: (761696, 2)
akas: (278912, 6)
episodes: (2972520, 4)


In [21]:
akas.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,region,language,types,attributes,isOriginalTitle
titleId,ordering,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0000003,7,Poor Pierrot,GB,\N,imdbDisplay,\N,0
tt0000005,11,The Blacksmith's Forge,GB,\N,\N,informal alternative title,0
tt0000005,4,Blacksmith Shop,GB,\N,\N,informal alternative title,0
tt0000012,18,The Arrival of a Train,GB,\N,imdbDisplay,\N,0
tt0000012,19,Train Pulling into a Station,GB,\N,alternative,\N,0


In [22]:
ratings.head()

Unnamed: 0_level_0,averageRating,numVotes
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,5.6,1643.0
tt0000002,6.1,198.0
tt0000003,6.5,1337.0
tt0000004,6.2,120.0
tt0000005,6.1,2122.0


In [23]:
basics.head()

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000001,short,Carmencita,Carmencita,0.0,1894,\N,1,"Documentary,Short"
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892,\N,5,"Animation,Short"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892,\N,4,"Animation,Comedy,Romance"
tt0000004,short,Un bon bock,Un bon bock,0.0,1892,\N,12,"Animation,Short"
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893,\N,1,"Comedy,Short"


In [24]:
episodes.head()

Unnamed: 0_level_0,parentTconst,seasonNumber,episodeNumber,parentTitle
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0041951,tt0041038,1,9,The Lone Ranger
tt0042816,tt0989125,1,17,BBC Sunday-Night Theatre
tt0042889,tt0989125,\N,\N,BBC Sunday-Night Theatre
tt0043426,tt0040051,3,42,Studio One in Hollywood
tt0043631,tt0989125,2,16,BBC Sunday-Night Theatre


In [25]:
# Lets join the dataframes for titles, reviews, and episodes
imdb = pd.concat([basics, ratings, episodes], axis=1)

# Keep only the results that have ratings as we're not interested in the rest.
imdb = imdb[~imdb['averageRating'].isna()]

# Set the index name to 'id'
imdb.index.name = 'id'

# Save a raw file that we can demonstrate cleanup with
imdb.fillna(r'\N').to_csv('../data/imdb.csv')

# You might spot above that nulls are represented in the data by \N. we can replace that here.
imdb = imdb.replace(r'\N', np.nan)

# Fix data types
imdb['startYear'] = pd.to_numeric(imdb['startYear'], errors='coerce')
imdb['endYear'] = pd.to_numeric(imdb['endYear'], errors='coerce')
imdb['decade'] = np.floor((imdb['startYear'] / 10)) * 10
imdb['runtimeMinutes'] = pd.to_numeric(imdb['runtimeMinutes'], errors='coerce')
imdb['seasonNumber'] = pd.to_numeric(imdb['seasonNumber'], errors='coerce')
imdb['episodeNumber'] = pd.to_numeric(imdb['episodeNumber'], errors='coerce')


# Save a clean file that we can demonstrate exploration with
imdb.to_csv('../data/imdb_clean.csv')

display(imdb.info())
display(imdb.head())

<class 'pandas.core.frame.DataFrame'>
Index: 761696 entries, tt0000001 to tt9916766
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   titleType       761696 non-null  object 
 1   primaryTitle    761696 non-null  object 
 2   originalTitle   761696 non-null  object 
 3   isAdult         761696 non-null  float64
 4   startYear       761629 non-null  float64
 5   endYear         23836 non-null   float64
 6   runtimeMinutes  591050 non-null  float64
 7   genres          743736 non-null  object 
 8   averageRating   761696 non-null  float64
 9   numVotes        761696 non-null  float64
 10  parentTconst    213296 non-null  object 
 11  seasonNumber    205781 non-null  float64
 12  episodeNumber   205781 non-null  float64
 13  parentTitle     213296 non-null  object 
 14  decade          761629 non-null  float64
dtypes: float64(9), object(6)
memory usage: 93.0+ MB


None

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,parentTitle,decade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short",5.6,1643.0,,,,,1890.0
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short",6.1,198.0,,,,,1890.0
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance",6.5,1337.0,,,,,1890.0
tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short",6.2,120.0,,,,,1890.0
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short",6.1,2122.0,,,,,1890.0


## Fake Users

This uses a nice python library called [faker](https://pypi.org/project/Faker/). Not real people...


In [26]:
# Json Document
from faker import Faker
from faker.providers import credit_card
import numpy as np



fake = Faker()
fake.add_provider(credit_card)

data = [{
    "name": fake.name(),
    "address": fake.address(),
    "email": fake.email(),
    "age": np.random.randint(15,100),
    "credit_card_number": fake.credit_card_number()

} for i in range(100)]

import json

with open('data/users.json', 'w') as f:
    json.dump(data, f, indent=4, separators=[',',':'])

FileNotFoundError: [Errno 2] No such file or directory: 'data/users.json'