# IMDB2 CORE

## Import and Load Data

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Download the three IMDb title tables (gzip-compressed, tab-separated).
# pd.read_table uses a tab separator by default; low_memory=False parses each
# file in a single pass instead of in chunks.

# Title basics
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
df_basics = pd.read_table(basics_url, low_memory=False)

# Title ratings
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
df_ratings = pd.read_table(ratings_url, low_memory=False)

# Title akas (alternate titles per region)
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
df_akas = pd.read_table(akas_url, low_memory=False)

## Exploring and Cleaning

**AKAS Dataset**

In [5]:
# Preview the first rows of the akas table.
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [7]:
# Column dtypes, row count and memory footprint of the akas table.
df_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35723981 entries, 0 to 35723980
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [9]:
# The file marks missing values with the literal string '\N'; turn them into NaN.
df_akas = df_akas.replace('\\N', np.nan)

In [11]:
# Count missing values per column now that '\N' has been replaced with NaN.
df_akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1888398
language            6583607
types              30184309
attributes         35465560
isOriginalTitle        2109
dtype: int64

In [13]:
# Keep only the rows whose region is 'US'; rows with a missing region are removed too.
# A boolean mask selects the rows directly instead of collecting the index labels
# of every non-US row and dropping them in place, so it does not depend on the
# index being unique. .copy() makes the result an independent frame.
df_akas = df_akas.loc[df_akas['region'] == 'US'].copy()

In [15]:
# Verify that 'US' is the only region left after the filter.
df_akas['region'].value_counts()

US    1432658
Name: region, dtype: int64

**BASICS Dataset**

In [68]:
# Preview the first rows of the basics table.
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,94,Documentary
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
76059,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,0,2022,,46,Documentary


In [69]:
# Column dtypes, non-null counts and memory footprint of the basics table.
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223079 entries, 13082 to 9806247
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          223079 non-null  object
 1   titleType       223079 non-null  object
 2   primaryTitle    223079 non-null  object
 3   originalTitle   223079 non-null  object
 4   isAdult         223079 non-null  object
 5   startYear       223079 non-null  int64 
 6   endYear         0 non-null       object
 7   runtimeMinutes  223079 non-null  object
 8   genres          223079 non-null  object
dtypes: int64(1), object(8)
memory usage: 17.0+ MB


In [70]:
# Convert the literal '\N' placeholder strings into NaN.
df_basics = df_basics.replace('\\N', np.nan)

In [71]:
# Count missing values per column after the '\N' -> NaN replacement.
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           223079
runtimeMinutes         0
genres                 0
dtype: int64

In [72]:
# Discard every basics row that has no runtimeMinutes or no genres value.
df_basics = df_basics.dropna(subset=['runtimeMinutes', 'genres'], how='any')

In [73]:
# Re-check null counts after dropping rows without runtimeMinutes or genres.
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           223079
runtimeMinutes         0
genres                 0
dtype: int64

In [74]:
# Distribution of title types before filtering down to movies.
df_basics['titleType'].value_counts()

movie    223079
Name: titleType, dtype: int64

In [75]:
# Keep only rows whose titleType is 'movie' (lowercase, as stored in the data).
# A boolean mask selects the rows directly rather than dropping index labels in
# place, so it does not depend on the index being unique. .copy() makes the
# result an independent frame for the column assignments that follow.
df_basics = df_basics.loc[df_basics['titleType'] == 'movie'].copy()

In [76]:
# Verify the drop: 'movie' should be the only titleType remaining.
df_basics['titleType'].value_counts()

movie    223079
Name: titleType, dtype: int64

In [77]:
# Lift the row-display limit so every startYear value is listed, then count
# titles per year.
pd.options.display.max_rows = None
df_basics['startYear'].value_counts()

2017    14366
2018    14321
2019    14054
2016    13949
2015    13475
2014    13100
2022    12730
2013    12380
2021    12327
2012    11625
2020    11561
2011    10773
2010    10200
2009     9351
2008     8147
2007     6962
2006     6512
2005     5828
2004     5201
2003     4587
2002     4131
2001     3861
2000     3638
Name: startYear, dtype: int64

In [78]:
# Remove rows that have no startYear.
df_basics = df_basics.dropna(subset=['startYear'])

In [79]:
# Re-check null counts after dropping rows without a startYear.
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           223079
runtimeMinutes         0
genres                 0
dtype: int64

In [80]:
# Store startYear as a 64-bit integer so it can be compared numerically.
df_basics['startYear'] = df_basics['startYear'].astype(np.int64)

In [81]:
# Keep only titles with startYear in 2000-2022 (both bounds inclusive).
# startYear is cast to int64 in the preceding cell, so the comparison is numeric.
# A boolean mask replaces the in-place drop of index labels; .copy() makes the
# result an independent frame.
df_basics = df_basics.loc[df_basics['startYear'].between(2000, 2022)].copy()


In [82]:
# Remove titles released before 2000 (lower bound of the 2000-2022 window).
# A boolean mask replaces the in-place drop of index labels; .copy() makes the
# result an independent frame.
df_basics = df_basics.loc[df_basics['startYear'] >= 2000].copy()

In [83]:
# Show every row, then confirm that only the years 2000-2022 remain.
pd.options.display.max_rows = None
df_basics['startYear'].value_counts()

2017    14366
2018    14321
2019    14054
2016    13949
2015    13475
2014    13100
2022    12730
2013    12380
2021    12327
2012    11625
2020    11561
2011    10773
2010    10200
2009     9351
2008     8147
2007     6962
2006     6512
2005     5828
2004     5201
2003     4587
2002     4131
2001     3861
2000     3638
Name: startYear, dtype: int64