In [1]:
#Libraries and Imports
import pandas as pd
import numpy as np


In [2]:
#set up the url's from IMDB
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

In [3]:
#load the files with Pandas
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [4]:
#info for initial dataframes
print("Initial Basics Data")
print(basics.info())
print("\n")
print("Initial Ratings Data")
print(ratings.info())
print("\n")
print("Initial AKA Data")
print(akas.info())

Initial Basics Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9091590 entries, 0 to 9091589
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 624.3+ MB
None


Initial Ratings Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250020 entries, 0 to 1250019
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1250020 non-null  object 
 1   averageRating  1250020 non-null  float64
 2   numVotes       1250020 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.6+ MB
None


Initial AKA Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32616573 en

# Specifications

Your stakeholder only wants you to include information for movies based on the following specifications:

    Exclude any movie with missing values for genre or runtime
    Include only full-length movies (titleType = "movie").
    Include only fictional movies (not from documentary genre)
    Include only movies that were released 2000 - 2021 (include 2000 and 2021)
    Include only movies that were released in the United States

## Title Preproccessing

### Replace "\N" with np.nan

In [5]:
basics = basics.replace({'\\N':np.nan})

### Eliminate movies that are null for runtimeMinutes

In [6]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1205499
endYear           8997337
runtimeMinutes    6643446
genres             414109
dtype: int64

In [7]:
#select the rows where the is a value for run time
basics = basics[basics['runtimeMinutes'].notna()]

In [8]:
#verify the null runtime rows were dropped
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37747
endYear           2402010
runtimeMinutes          0
genres              67197
dtype: int64

### Eliminate movies that are null for genre

In [9]:
#select the rows where there is a vlue for genre
basics = basics[basics['genres'].notna()]

In [10]:
#verify the the null genre rows were dropped
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36401
endYear           2336378
runtimeMinutes          0
genres                  0
dtype: int64

### keep only titleType==Movie

In [11]:
#unfilterd list of title types
basics['titleType'].value_counts()

tvEpisode       1055323
short            570724
movie            364788
video            174191
tvMovie           87988
tvSeries          86056
tvSpecial         16325
tvMiniSeries      15901
tvShort            9358
videoGame           293
Name: titleType, dtype: int64

In [12]:
#create filter to select only movies
movie_filter = basics['titleType']=='movie'

In [13]:
#apply filter and confirm
basics = basics[movie_filter]
basics['titleType'].value_counts()

movie    364788
Name: titleType, dtype: int64

### keep startYear 2000-2022

In [14]:
#unfilter list of start year
basics['startYear'].value_counts()

2017    14200
2018    14135
2016    13845
2019    13699
2015    13342
        ...  
1894        1
1899        1
1904        1
1906        1
1896        1
Name: startYear, Length: 129, dtype: int64

In [15]:
#create year filter for 2000 thru 2022
basics = basics[(basics['startYear'] >= '2000') & (basics['startYear'] <= '2022')]

In [16]:
#confirm the year filter has been applied
basics['startYear'].value_counts()

2017    14200
2018    14135
2016    13845
2019    13699
2015    13342
2014    12988
2013    12259
2021    11622
2012    11531
2020    11082
2011    10675
2010    10107
2009     9252
2008     8063
2007     6875
2022     6573
2006     6419
2005     5753
2004     5115
2003     4519
2002     4082
2001     3808
2000     3589
Name: startYear, dtype: int64

### Eliminate movies that include  "Documentary" in genre

In [17]:
#unfiltered genre list
basics['genres'].value_counts()

Documentary                  50487
Drama                        34467
Comedy                       13014
Comedy,Drama                  6216
Horror                        5557
                             ...  
Adventure,Romance,Sport          1
Adult,Documentary,History        1
Action,Music,Sci-Fi              1
Adventure,Fantasy,Western        1
Action,History,Western           1
Name: genres, Length: 1182, dtype: int64

In [18]:
#create filter for Documentary and then apply the invers of the filter 
#Code modified from the LEARN platform
documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics[~documentary]

In [19]:
#confirm the filter was applied
basics['genres'].value_counts()

Drama                          34467
Comedy                         13014
Comedy,Drama                    6216
Horror                          5557
Drama,Romance                   4129
                               ...  
Horror,Reality-TV,Talk-Show        1
Family,Musical,Sport               1
Comedy,Game-Show                   1
Horror,Music,Mystery               1
Action,History,Western             1
Name: genres, Length: 964, dtype: int64

## AKAs Preprocessing

### Replace "\N" with np.nan

In [20]:
akas = akas.replace({'\\N':np.nan})

### Keep only US entries

In [21]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32616573 entries, 0 to 32616572
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 1.9+ GB


In [22]:
#dataset using two letter code for countries
akas['region'].value_counts()

FR    3892587
JP    3891256
DE    3874980
IN    3818952
ES    3815712
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 246, dtype: int64

In [23]:
#based on the exporlation abovel, create and apply a region filter for US only
region_filter = akas['region']=='US'
akas = akas[region_filter]

In [24]:
#confirm US filter was applied
akas['region'].value_counts()

US    1336620
Name: region, dtype: int64

## Ratings Preprocessing

### Replace "\N" with np.nan (if any)

In [25]:
ratings = ratings.replace({'\\N':np.nan})

## Final Preproccessing and Completed Dataframes

In [26]:
#We will need to filter the basics data with the US only AKA data as the filter
##Code modified from the LEARN platform
keeps = basics['tconst'].isin(akas['titleId'])
basics = basics[keeps]

In [27]:
#display info for the processed data frames
print("Processed Basics Data")
print(basics.info())
print("\n")
print("Processed Ratings Data")
print(ratings.info())
print("\n")
print("Processed AKA Data")
print(akas.info())

Processed Basics Data
<class 'pandas.core.frame.DataFrame'>
Int64Index: 81777 entries, 34789 to 9091355
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          81777 non-null  object
 1   titleType       81777 non-null  object
 2   primaryTitle    81777 non-null  object
 3   originalTitle   81777 non-null  object
 4   isAdult         81777 non-null  object
 5   startYear       81777 non-null  object
 6   endYear         0 non-null      object
 7   runtimeMinutes  81777 non-null  object
 8   genres          81777 non-null  object
dtypes: object(9)
memory usage: 6.2+ MB
None


Processed Ratings Data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250020 entries, 0 to 1250019
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1250020 non-null  object 
 1   averageRating  1250020 non-null  float64
 2   numVotes 

>The data frames have changed by the following amount of entries:
Basics - From 9,091,590 to 81,777. A change of -9,009,813
Ratings - From 1,250,020 to 1,250,020. No change.
AKA's - From 32,616,573 to 13,36,620. A change of -31,279,953

## Saving-Completed-Dataframes-to-the-Repository

In [30]:
#create Data folder using os
import os
os.makedirs('Data/', exist_ok=True)
#confirm folder creations
os.listdir("Data/")

[]

In [31]:
#save each completed data frame to the Data folder
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [32]:
#confirm files have been created
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']