# Data Used in this course

All the data used in this course is downloaded using the code in this notebook.


In [67]:
import requests
import pandas as pd

## IMDB Ratings

In [68]:
# Fetch data from IMDB
r = requests.get('https://datasets.imdbws.com/title.basics.tsv.gz')

with open('data/raw/title.basics.tsv.gz', 'wb') as f:
    f.write(r.content)

r = requests.get('https://datasets.imdbws.com/title.ratings.tsv.gz')

with open('data/raw/title.ratings.tsv.gz', 'wb') as f:
    f.write(r.content)
    
    
r = requests.get('https://datasets.imdbws.com/title.akas.tsv.gz')

with open('data/raw/title.akas.tsv.gz', 'wb') as f:
    f.write(r.content)

In [69]:
!gunzip -f 'data/raw/title.basics.tsv.gz'
!gunzip -f 'data/raw/title.ratings.tsv.gz'
!gunzip -f 'data/raw/title.akas.tsv.gz'

In [70]:
# These are fairly large datasets. So lets create a small one thats easier to use for some training.
basics = pd.read_table('data/raw/title.basics.tsv', index_col='tconst')
ratings = pd.read_table('data/raw/title.ratings.tsv', index_col='tconst')
akas = pd.read_table('data/raw/title.akas.tsv', index_col=['titleId','ordering'])

print('basics:', basics.shape)
print('ratings:', ratings.shape)
print('akas:', akas.shape)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


basics: (7064751, 8)
ratings: (1062730, 2)
akas: (22676639, 6)


In [71]:
# Shrink the data down to just data for the GB region.

# Create a new index from just the GB region records listed in the akas dataframe 
gb_index = akas.loc[akas['region'] == 'GB'].index
# This data frame has multiple entries per title per region, so we need to take the index level for just the id.
gb_index_titleid = gb_index.levels[0].drop_duplicates()
gb_index_titleid

Index(['tt0000001', 'tt0000002', 'tt0000003', 'tt0000004', 'tt0000005',
       'tt0000006', 'tt0000007', 'tt0000008', 'tt0000009', 'tt0000010',
       ...
       'tt9916834', 'tt9916836', 'tt9916838', 'tt9916842', 'tt9916844',
       'tt9916846', 'tt9916848', 'tt9916850', 'tt9916852', 'tt9916856'],
      dtype='object', name='titleId', length=4887435)

In [72]:

# Reindexing the other dataframes with this index
akas = akas.reindex(gb_index)
basics = basics.reindex(gb_index_titleid)
raings = ratings.reindex(gb_index_titleid)

print('basics:', basics.shape)
print('ratings:', ratings.shape)
print('akas:', akas.shape)

basics: (4887435, 8)
ratings: (1062730, 2)
akas: (278436, 6)


In [73]:
akas.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,region,language,types,attributes,isOriginalTitle
titleId,ordering,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0000003,7,Poor Pierrot,GB,\N,imdbDisplay,\N,0
tt0000005,11,The Blacksmith's Forge,GB,\N,\N,informal alternative title,0
tt0000005,4,Blacksmith Shop,GB,\N,\N,informal alternative title,0
tt0000012,18,The Arrival of a Train,GB,\N,imdbDisplay,\N,0
tt0000012,19,Train Pulling into a Station,GB,\N,alternative,\N,0


In [74]:
ratings.head()

Unnamed: 0_level_0,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,5.6,1643
tt0000002,6.1,198
tt0000003,6.5,1336
tt0000004,6.2,120
tt0000005,6.1,2119


In [75]:
basics.head()

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
titleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0000001,short,Carmencita,Carmencita,0.0,1894,\N,1,"Documentary,Short"
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892,\N,5,"Animation,Short"
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892,\N,4,"Animation,Comedy,Romance"
tt0000004,short,Un bon bock,Un bon bock,0.0,1892,\N,12,"Animation,Short"
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893,\N,1,"Comedy,Short"


In [78]:
# Lets join the dataframes for titles and reviews.
imdb = pd.merge(basics, ratings, left_index=True, right_index=True)
imdb.index.name = 'id'
imdb.head()

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0000001,short,Carmencita,Carmencita,0.0,1894,\N,1,"Documentary,Short",5.6,1643
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892,\N,5,"Animation,Short",6.1,198
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892,\N,4,"Animation,Comedy,Romance",6.5,1336
tt0000004,short,Un bon bock,Un bon bock,0.0,1892,\N,12,"Animation,Short",6.2,120
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893,\N,1,"Comedy,Short",6.1,2119


In [79]:
# And finally out to a file.
imdb.to_csv('data/imdb.csv')

## Fake Users

This uses a nice python library called [faker](https://pypi.org/project/Faker/).


In [86]:
# Json Document
from faker import Faker
from faker.providers import credit_card
import numpy as np



fake = Faker()
fake.add_provider(credit_card)

data = [{
    "name": fake.name(),
    "address": fake.address(),
    "email": fake.email(),
    "age": np.random.randint(15,100),
    "credit_card_number": fake.credit_card_number()

} for i in range(100)]

import json

with open('data/users.json', 'w') as f:
    json.dump(data, f, indent=4, separators=[',',':'])