# IMDB2 CORE

## Import and Load Data

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load basics dataset
basics_url = ('https://datasets.imdbws.com/title.basics.tsv.gz')
df_basics = pd.read_csv(basics_url, sep='\t', low_memory=False)

#Load ratings dataset
ratings_url = ('https://datasets.imdbws.com/title.ratings.tsv.gz')
df_ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

# Load akas dataset
akas_url = ('https://datasets.imdbws.com/title.akas.tsv.gz')
df_akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

## Exploring and Cleaning

**AKAS Dataset**

In [4]:
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [5]:
df_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35723981 entries, 0 to 35723980
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [6]:
df_akas = df_akas.replace({'\\N':np.nan})

In [7]:
df_akas.isna().sum()

titleId                   0
ordering                  0
title                     5
region              1888398
language            6583607
types              30184309
attributes         35465560
isOriginalTitle        2109
dtype: int64

In [8]:
# Keeping only US movies
df_akas.drop(df_akas.loc[df_akas['region'] != 'US'].index, inplace =True)

In [9]:
df_akas['region'].value_counts()

US    1432658
Name: region, dtype: int64

**BASIC DATASET**

In [10]:
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [11]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9806297 entries, 0 to 9806296
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 673.3+ MB


In [12]:
df_basics = df_basics.replace({'\\N':np.nan})

In [13]:
df_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1327682
endYear           9700427
runtimeMinutes    6917265
genres             442042
dtype: int64

In [14]:
# Eliminate movies that are null for runtimeMinutes and genres (Basics)

df_basics = df_basics.dropna(axis = 0, subset =['runtimeMinutes', 'genres'])

In [15]:
df_basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          164736
endYear           2762837
runtimeMinutes          0
genres                  0
dtype: int64

In [16]:
df_basics['titleType'].value_counts()

tvEpisode       1425631
short            599275
movie            381476
video            180144
tvMovie           91428
tvSeries          90217
tvSpecial         18051
tvMiniSeries      17123
tvShort            8790
videoGame           322
Name: titleType, dtype: int64

In [17]:
# keep only titleType==Movie
df_basics.drop(df_basics.loc[df_basics['titleType'] != 'movie'].index, inplace =True)

In [18]:
# Verifying Drop
df_basics['titleType'].value_counts()

movie    381476
Name: titleType, dtype: int64

In [19]:
pd.set_option("display.max_rows", None)
df_basics['startYear'].value_counts()

2017    14366
2018    14321
2019    14054
2016    13949
2015    13475
2014    13100
2022    12730
2013    12380
2021    12327
2012    11625
2020    11561
2011    10773
2010    10200
2009     9351
2008     8147
2007     6962
2006     6512
2005     5828
2004     5201
2003     4587
2023     4318
2002     4131
2001     3861
2000     3638
1999     3324
1998     3189
1997     3087
1989     3049
1988     2984
1990     2949
1987     2927
1996     2850
1986     2847
1995     2777
1991     2771
1985     2753
1983     2733
1984     2728
1994     2713
1992     2669
1981     2626
1982     2608
1971     2543
1979     2537
1980     2533
1993     2533
1976     2521
1973     2517
1972     2516
1974     2506
1970     2485
1978     2479
1975     2472
1977     2448
1969     2347
1968     2342
1967     2088
1965     1899
1966     1893
1964     1856
1962     1798
1961     1724
1960     1715
1963     1700
1959     1652
1957     1617
1958     1610
1956     1490
1955     1394
1953     1370
1954     1361
1937  

In [20]:
df_basics = df_basics.dropna(axis = 0, subset =['startYear'])

In [21]:
df_basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           375057
runtimeMinutes         0
genres                 0
dtype: int64

In [22]:
df_basics['startYear'] = df_basics['startYear'].astype('int64')

In [23]:
# keep startYear 2000-2022
## THIS ISN"T WORKING *****
df_basics.drop(df_basics.loc[df_basics['startYear'] > 2022].index, inplace =True)


In [24]:
df_basics.drop(df_basics.loc[df_basics['startYear'] < 2000].index, inplace =True)

In [25]:
pd.set_option("display.max_rows", None)
df_basics['startYear'].value_counts()

2017    14366
2018    14321
2019    14054
2016    13949
2015    13475
2014    13100
2022    12730
2013    12380
2021    12327
2012    11625
2020    11561
2011    10773
2010    10200
2009     9351
2008     8147
2007     6962
2006     6512
2005     5828
2004     5201
2003     4587
2002     4131
2001     3861
2000     3638
Name: startYear, dtype: int64

In [27]:
# Exclude movies that are included in the documentary category.
is_documentary = df_basics['genres'].str.contains('documentary',case=False)
df_basics = df_basics[~is_documentary]

In [41]:
df_basics['genres'].value_counts()

Drama                            17028
Comedy                            7126
Horror                            4039
Comedy,Drama                      3997
Drama,Romance                     2595
Thriller                          2398
Comedy,Drama,Romance              2154
Comedy,Romance                    1857
Horror,Thriller                   1539
Drama,Thriller                    1524
Action                            1192
Action,Crime,Drama                1050
Crime,Drama,Thriller               938
Crime,Drama                        920
Comedy,Horror                      835
Romance                            720
Horror,Mystery,Thriller            660
Drama,Family                       605
Family                             602
Animation                          577
Drama,Mystery,Thriller             548
Sci-Fi                             546
Action,Drama                       528
Crime,Drama,Mystery                527
Drama,Horror,Mystery               526
Drama,Horror,Thriller    

In [42]:
# Keep only US movies
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = df_basics['tconst'].isin(df_akas['titleId'])
keepers

34803      True
61116      True
67669      True
86801      True
93938      True
98043      True
101042     True
106106     True
110477     True
110540     True
111851     True
113286     True
113725     True
113951     True
114297     True
115023     True
115402     True
115828     True
115888     True
115928     True
115944     True
116080     True
116152     True
116227     True
116441     True
116481     True
116690     True
116978     True
117002     True
117134     True
117138     True
117146     True
117325     True
117361     True
117420     True
117615     True
117723     True
117737     True
117752     True
117754     True
117757     True
117784     True
117789     True
117794     True
117796     True
117811     True
117837     True
117839     True
117854     True
117856     True
117897     True
117899     True
117909     True
117969     True
117973     True
117974     True
117976     True
118179     True
118697     True
118698     True
119054     True
119132     True
119333  

In [46]:
df_basics = df_basics[keepers]
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


**RATINGS DATASET**