## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw Amazon Prime Video catalogue. A forward-slash relative path is
# used because it resolves on Windows, macOS and Linux alike, whereas a
# backslash separator is only understood on Windows.
prime_video = pd.read_csv('data/amazon_prime_titles.csv')

In [3]:
# Dimensions of the freshly loaded table as a (rows, columns) tuple.
prime_video.shape

(9668, 12)

In [4]:
# Preview the first five records to get a feel for the columns.
prime_video.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


## Data Cleaning

In [5]:
# Per-column non-null counts and dtypes, used to spot sparsely populated columns.
prime_video.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7586 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


Check for null values

In [6]:
# Count the missing entries in every column of the raw table.
prime_video.isnull().sum()

show_id            0
type               0
title              0
director        2082
cast            1233
country         8996
date_added      9513
release_year       0
rating           337
duration           0
listed_in          0
description        0
dtype: int64

Dropping unnecessary columns

In [7]:
# Remove the columns considered unnecessary for the rest of the notebook.
prime_video = prime_video.drop(columns=['director', 'cast', 'description'])

In [8]:
# Confirm which columns remain after the drop.
prime_video.columns

Index(['show_id', 'type', 'title', 'country', 'date_added', 'release_year',
       'rating', 'duration', 'listed_in'],
      dtype='object')

In [9]:
# Re-count the missing entries for the columns that were kept.
prime_video.isnull().sum()

show_id            0
type               0
title              0
country         8996
date_added      9513
release_year       0
rating           337
duration           0
listed_in          0
dtype: int64

Fix data types

In [10]:
# Column dtypes as loaded, before the conversions in the following cells.
prime_video.dtypes

show_id         object
type            object
title           object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
dtype: object

In [11]:
# Store the title type as a categorical column.
prime_video['type'] = prime_video['type'].astype('category')

In [13]:
# Parse the textual dates (e.g. "March 30, 2021") with pandas' dedicated
# parser rather than relying on astype() to interpret free-form strings;
# missing entries become NaT.
prime_video['date_added'] = pd.to_datetime(prime_video['date_added'])

In [14]:
# Store the maturity rating as a categorical column.
prime_video['rating'] = prime_video['rating'].astype('category')

In [15]:
# Re-inspect the column dtypes at this point of the clean-up.
prime_video.dtypes

show_id                 object
type                  category
title                   object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                category
duration                object
listed_in               object
dtype: object

In [16]:
# List the distinct rating labels held by the categorical 'rating' column.
prime_video['rating'].cat.categories

Index(['13+', '16', '16+', '18+', '7+', 'AGES_16_', 'AGES_18_', 'ALL',
       'ALL_AGES', 'G', 'NC-17', 'NOT_RATE', 'NR', 'PG', 'PG-13', 'R', 'TV-14',
       'TV-G', 'TV-MA', 'TV-NR', 'TV-PG', 'TV-Y', 'TV-Y7', 'UNRATED'],
      dtype='object')

Idea: dividi rating a seconda dell'età consentita in ITA, ad esempio vedi sotto

ITALIA

I film sono classificati in quattro categorie:

T: Film per tutti.

6+: Non adatto ai minori di 6 anni.

14+: Vietato ai minori di 14 anni; spettatori che hanno compiuto i 12 anni sono ammessi alla proiezione se accompagnati da un genitore o da un tutore.

18+: Vietato ai minori di 18 anni; spettatori che hanno compiuto i 16 anni sono ammessi alla proiezione se accompagnati da un genitore o da un tutore.

source: https://it.wikipedia.org/wiki/Sistemi_di_classificazione_dei_film

TV-Y: This program is designed to be appropriate for all children.

TV-Y7: This program is designed for children age 7 and above.

TV-G: This program is suitable for all ages.

TV-PG: This program contains material that parents may find unsuitable for younger children (hence, Parental Guidance)

TV-14: This program contains some material that many parents would find unsuitable for children under 14 years of age.

TV-MA: This program is specifically designed to be viewed by adults and therefore may be unsuitable for children under 17.

G: This program is suitable for all ages.

NC-17: adults only; no one 17 or under is admitted.

NR: not rated.

PG: parental guidance suggested; some material may be unsuitable for younger children. I would say it is the same as TV-PG.

PG-13: parents strongly cautioned; some material may be inappropriate for children under 13.

R: restricted; viewers under 17 must be accompanied by a parent or adult guardian.

TV-Y7-FV: Fantasy violence (exclusive to the TV-Y7 rating)

UR: unrated (no rating assigned).

source: https://en.wikipedia.org/wiki/Television_content_rating_system

In [None]:
"""
ratings_ages = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}
"""

"\nratings_ages = {\n    'TV-PG': 'Older Kids',\n    'TV-MA': 'Adults',\n    'TV-Y7-FV': 'Older Kids',\n    'TV-Y7': 'Older Kids',\n    'TV-14': 'Teens',\n    'R': 'Adults',\n    'TV-Y': 'Kids',\n    'NR': 'Adults',\n    'PG-13': 'Teens',\n    'TV-G': 'Kids',\n    'PG': 'Older Kids',\n    'G': 'Kids',\n    'UR': 'Adults',\n    'NC-17': 'Adults'\n}\n"

Risolvo altre criticità null values

In [18]:
# Show the titles that have no rating.
prime_video.loc[prime_video['rating'].isnull()]

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,The Grand Seduction,Canada,2021-03-30,2014,,113 min,"Comedy, Drama"
2,s3,Movie,Secrets of Deception,United States,2021-03-30,2017,,74 min,"Action, Drama, Suspense"
3,s4,Movie,Pink: Staying True,United States,2021-03-30,2014,,69 min,Documentary
4,s5,Movie,Monster Maker,United Kingdom,2021-03-30,1989,,45 min,"Drama, Fantasy"
5,s6,Movie,Living With Dinosaurs,United Kingdom,2021-03-30,1989,,52 min,"Fantasy, Kids"
...,...,...,...,...,...,...,...,...,...
6010,s6011,TV Show,Hook City,,NaT,2021,,1 Season,"Action, Drama"
6152,s6153,Movie,Porky's,,NaT,1982,,98 min,"Comedy, Young Adult Audience"
6295,s6296,Movie,The Healing Garden,,NaT,2021,,96 min,"Drama, Special Interest"
6352,s6353,Movie,Manipulated,,NaT,2021,,92 min,"Drama, Suspense"


In [19]:
# Show the titles that have no date_added value.
prime_video.loc[prime_video['date_added'].isnull()]

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration,listed_in
16,s17,Movie,Zoombies,,NaT,2016,13+,87 min,"Horror, Science Fiction"
17,s18,TV Show,Zoo Babies,,NaT,2008,ALL,1 Season,"Kids, Special Interest"
18,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,NaT,2020,18+,1 Season,"Comedy, Talk Show and Variety"
19,s20,Movie,Zoe,,NaT,2018,R,104 min,Science Fiction
20,s21,TV Show,Zoboomafoo,,NaT,2001,TV-Y,1 Season,Kids
...,...,...,...,...,...,...,...,...,...
9663,s9664,Movie,Pride Of The Bowery,,NaT,1940,7+,60 min,Comedy
9664,s9665,TV Show,Planet Patrol,,NaT,2018,13+,4 Seasons,TV Shows
9665,s9666,Movie,Outpost,,NaT,2008,R,90 min,Action
9666,s9667,TV Show,Maradona: Blessed Dream,,NaT,2021,TV-MA,1 Season,"Drama, Sports"


Divisione tra film e serie tv

In [20]:
# Movies only. .copy() makes the subset an independent frame, so assigning to
# its columns later neither triggers SettingWithCopyWarning nor depends on
# whether pandas returned a view of `prime_video`.
prime_video_f = prime_video[prime_video['type'] == "Movie"].copy()

In [21]:
# (rows, columns) of the movie subset.
prime_video_f.shape

(7814, 9)

In [23]:
# TV shows only. .copy() makes the subset an independent frame, so assigning
# to its columns later neither triggers SettingWithCopyWarning nor depends on
# whether pandas returned a view of `prime_video`.
prime_video_s = prime_video[prime_video['type'] == "TV Show"].copy()

In [24]:
# (rows, columns) of the TV-show subset.
prime_video_s.shape

(1854, 9)