In [1]:
import pandas as pd
import numpy as np

### Open dataset as pandas DataFrame

In [2]:
df = pd.read_csv('imdb_2019.tsv', sep = '\t')
df.head()

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,87501.0,tt0089435,short,Kokoa,Kokoa,0.0,2019.0,,13.0,
1,89512.0,tt0091490,short,Martina's Playhouse,Martina's Playhouse,0.0,2019.0,,20.0,
2,114407.0,tt0116991,movie,Mariette in Ecstasy,Mariette in Ecstasy,0.0,2019.0,,,
3,126556.0,tt0129960,tvMovie,Eine geschlossene Gesellschaft,Eine geschlossene Gesellschaft,0.0,2019.0,,,
4,166388.0,tt0172112,short,Ambulans,Ambulans,0.0,2019.0,,11.0,


### Unique types of titles

In [5]:
# using python sets
print(set(df['titleType']))

{'movie', 'short', 'tvShort', 'videoGame', 'tvMovie', 'tvSpecial', 'tvSeries', 'tvMiniSeries', 'tvEpisode', 'video'}


In [6]:
# using unique
print(list(df['titleType'].unique()))

['short', 'movie', 'tvMovie', 'video', 'tvSeries', 'tvEpisode', 'tvMiniSeries', 'tvSpecial', 'videoGame', 'tvShort']


In [7]:
# using value_counts
df['titleType'].value_counts()

titleType
tvEpisode       225972
short            35351
movie            15640
video            10450
tvSeries          7154
tvMovie           2431
tvMiniSeries      2000
tvSpecial         1306
videoGame          884
tvShort            151
Name: count, dtype: int64

### Slice `imdb` dataframe to return only the columns `titleType`, `primaryTitle`, `startYear`, and `runtimeMinutes`

In [8]:
df.head(1)

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,87501.0,tt0089435,short,Kokoa,Kokoa,0.0,2019.0,,13.0,


In [9]:
df[['titleType', 'primaryTitle', 'startYear', 'runtimeMinutes']]

Unnamed: 0,titleType,primaryTitle,startYear,runtimeMinutes
0,short,Kokoa,2019.0,13.0
1,short,Martina's Playhouse,2019.0,20.0
2,movie,Mariette in Ecstasy,2019.0,
3,tvMovie,Eine geschlossene Gesellschaft,2019.0,
4,short,Ambulans,2019.0,11.0
...,...,...,...,...
301334,tvEpisode,Talent Coaching with IMOR's Bianca Desmore Mit...,2019.0,
301335,tvEpisode,Escape,2019.0,
301336,tvEpisode,Tinne Oltmans,2019.0,
301337,tvEpisode,Luc Janssens,2019.0,


### Create a subset of `imdb` named `tvEpisodes_2019` that only includes the type `tvEpisodes``

In [6]:
tvEpisodes_2019 = df[df['titleType'] == 'tvEpisode']
tvEpisodes_2019

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
21,758955.0,tt0782666,tvEpisode,Save the Duckling!,Save the Duckling!,0.0,2019.0,,,
30,969426.0,tt10001058,tvEpisode,Le grand saut,Le grand saut,0.0,2019.0,,51.0,
32,969458.0,tt10001110,tvEpisode,The Interviews,The Interviews,0.0,2019.0,,,
34,969464.0,tt10001120,tvEpisode,The New Guy,The New Guy,0.0,2019.0,,,
35,969465.0,tt10001122,tvEpisode,The New Girl,The New Girl,0.0,2019.0,,,
...,...,...,...,...,...,...,...,...,...,...
301334,6591633.0,tt9916776,tvEpisode,Talent Coaching with IMOR's Bianca Desmore Mit...,Talent Coaching with IMOR's Bianca Desmore Mit...,0.0,2019.0,,,
301335,6591634.0,tt9916778,tvEpisode,Escape,Escape,0.0,2019.0,,,
301336,6591640.0,tt9916790,tvEpisode,Tinne Oltmans,Tinne Oltmans,0.0,2019.0,,,
301337,6591646.0,tt9916802,tvEpisode,Luc Janssens,Luc Janssens,0.0,2019.0,,,


### Adult films percentage over total releases in 2019

In [8]:
df['isAdult'].mean() * 100

2.9684176293144926

### Create a column containing number of words in the title

In [14]:
df['title_length'] = df['primaryTitle'].map(lambda title: len(str(title)))
df.head()

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,title_length
0,87501.0,tt0089435,short,Kokoa,Kokoa,0.0,2019.0,,13.0,,5
1,89512.0,tt0091490,short,Martina's Playhouse,Martina's Playhouse,0.0,2019.0,,20.0,,19
2,114407.0,tt0116991,movie,Mariette in Ecstasy,Mariette in Ecstasy,0.0,2019.0,,,,19
3,126556.0,tt0129960,tvMovie,Eine geschlossene Gesellschaft,Eine geschlossene Gesellschaft,0.0,2019.0,,,,30
4,166388.0,tt0172112,short,Ambulans,Ambulans,0.0,2019.0,,11.0,,8


### Average `runtimeMinutes` for `short` type

In [12]:
df[df['titleType'] == 'short']['runtimeMinutes'].mean()

12.536104279390065

### Filter `imdb` to return `tvMovie` type with 3 or more words in the title, and less than 75 minutes of `runTimeMinutes`

In [16]:
df[(df['titleType'] == 'tvMovie') & (df['title_length'] >= 3) & (df['runtimeMinutes'] < 75)]
df.head()

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,title_length
0,87501.0,tt0089435,short,Kokoa,Kokoa,0.0,2019.0,,13.0,,5
1,89512.0,tt0091490,short,Martina's Playhouse,Martina's Playhouse,0.0,2019.0,,20.0,,19
2,114407.0,tt0116991,movie,Mariette in Ecstasy,Mariette in Ecstasy,0.0,2019.0,,,,19
3,126556.0,tt0129960,tvMovie,Eine geschlossene Gesellschaft,Eine geschlossene Gesellschaft,0.0,2019.0,,,,30
4,166388.0,tt0172112,short,Ambulans,Ambulans,0.0,2019.0,,11.0,,8
