In [97]:
# Importare le librerie necessarie
from pathlib import Path

import numpy as np
import pandas as pd

In [98]:
# Load the raw Netflix titles dataset into the working DataFrame
percorso_raw = '../dataset/raw/netflix_titles.csv'
df = pd.read_csv(percorso_raw)

In [99]:
# 1. Missing-value handling
# Gaps in the columns listed below are labelled with the placeholder
# 'Non disponibile' instead of being left as NaN.
colonne_con_mancanti = ['director', 'cast', 'country']
for colonna in colonne_con_mancanti:
    df[colonna] = df[colonna].fillna('Non disponibile')

# Preview the first rows to check the replacement
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [100]:
# 2. Data formatting and cleaning
# Convert 'date_added' to datetime; anything that cannot be parsed becomes NaT
# (errors='coerce').
#
# String values are stripped of surrounding whitespace first. Recent pandas
# versions infer one format for the whole column, so a padded value such as
# ' August 4, 2017' would not match it and would be silently coerced to NaT.
# Non-string values (NaN, or timestamps if this cell is re-run) pass through
# unchanged, which keeps the cell safe to execute more than once.
date_grezze = df['date_added'].map(
    lambda valore: valore.strip() if isinstance(valore, str) else valore
)
df['date_added'] = pd.to_datetime(date_grezze, errors='coerce')

# Check the result
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [101]:
# Split 'duration' into two columns: 'duration_number' (the leading digits)
# and 'duration_unit' (the word that follows, e.g. 'min' or 'Seasons').
# The pattern is a raw string so that \d, \s and \w reach the regex engine
# untouched; in a plain string literal they are invalid escape sequences,
# which Python reports with a warning.
df[['duration_number', 'duration_unit']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')

# The extracted digits are still text: convert them to numbers (NaN when no match)
df['duration_number'] = pd.to_numeric(df['duration_number'], errors='coerce')

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_number,duration_unit
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90.0,min
1,s2,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2.0,Seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1.0,Season
3,s4,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1.0,Season
4,s5,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2.0,Seasons


In [102]:
# Detect and fix misclassified ratings.
# Values containing 'min' are not ratings; presumably they are durations that
# ended up in the wrong column (to be confirmed against the raw data).
valori_erronei = df['rating'].str.contains('min', na=False)

# Before discarding those values, recover them for rows that have no duration
# of their own, so the information is moved rather than lost. The same
# number/unit split used for 'duration' is applied to the recovered text.
da_recuperare = valori_erronei & df['duration'].isna()
if da_recuperare.any():
    durata_spostata = df.loc[da_recuperare, 'rating']
    parti = durata_spostata.str.extract(r'(\d+)\s*(\w+)')
    df.loc[da_recuperare, 'duration'] = durata_spostata
    df.loc[da_recuperare, 'duration_number'] = pd.to_numeric(parti[0], errors='coerce')
    df.loc[da_recuperare, 'duration_unit'] = parti[1]

# Replace the erroneous ratings with NaN so they are handled as missing values
df.loc[valori_erronei, 'rating'] = np.nan

# Map every rating code to a descriptive category
rating_map = {
    'TV-MA': 'Mature - Suitable for adults only',
    'TV-14': 'Teens - Suitable for over 14 years',
    'TV-PG': 'Parental Guidance - Parents urged to give parental guidance under 12 years',
    'PG-13': 'Teens - Suitable for over 13 years',
    'PG': 'Parental Guidance Suggested - Suitable for children with parental guidance',
    'TV-Y': 'Kids - Suitable for all children',
    'TV-Y7': 'Older Kids - Suitable for children over 7 years',
    'TV-G': 'General - Suitable for all audiences',
    'G': 'General - Suitable for all audiences',
    'R': 'Restricted - Suitable for adults (contains adult material)',
    'NR': 'Not Rated - No specific classification',
    'TV-Y7-FV': 'Older Kids - Suitable for children over 7 years with fantasy violence elements',
    'UR': 'Unrated - Uncensored or extended version not submitted for classification',
    'NC-17': 'Adults Only - Not suitable for children under 17 years'
}

# Codes absent from the map (including NaN) fall back to 'Non Specificato'
df['rating_category'] = df['rating'].map(rating_map).fillna('Non Specificato')

# Check the result
df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_number,duration_unit,rating_category
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90.0,min,Teens - Suitable for over 13 years
1,s2,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2.0,Seasons,Mature - Suitable for adults only
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1.0,Season,Mature - Suitable for adults only
3,s4,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1.0,Season,Mature - Suitable for adults only
4,s5,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2.0,Seasons,Mature - Suitable for adults only


In [103]:
# Row count before de-duplication
num_righe_prima = len(df)

# Two rows are duplicates when they share both 'title' and 'release_year';
# the first occurrence is the one that is kept.
df = df.drop_duplicates(subset=['title', 'release_year'], keep='first')

# Row count after de-duplication, and how many rows were dropped
num_righe_dopo = len(df)
num_duplicati_rimossi = num_righe_prima - num_righe_dopo

print(f"Numero di duplicati rimossi: {num_duplicati_rimossi}")


Numero di duplicati rimossi: 0


In [104]:
# Turn the comma-separated genres in 'listed_in' into a list of genres.
# A missing value (NaN is a float and has no .split) yields an empty list
# instead of raising, so membership tests on 'genres_list' keep working.
df['genres_list'] = df['listed_in'].apply(
    lambda generi: generi.split(', ') if isinstance(generi, str) else []
)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_number,duration_unit,rating_category,genres_list
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90.0,min,Teens - Suitable for over 13 years,[Documentaries]
1,s2,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2.0,Seasons,Mature - Suitable for adults only,"[International TV Shows, TV Dramas, TV Mysteries]"
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1.0,Season,Mature - Suitable for adults only,"[Crime TV Shows, International TV Shows, TV Ac..."
3,s4,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1.0,Season,Mature - Suitable for adults only,"[Docuseries, Reality TV]"
4,s5,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2.0,Seasons,Mature - Suitable for adults only,"[International TV Shows, Romantic TV Shows, TV..."


In [105]:
# Drop the columns that are no longer needed
colonne_superflue = ['rating', 'duration', 'show_id', 'listed_in']
df = df.drop(colonne_superflue, axis=1)

# Show the columns that remain
print(df.columns)


Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'description', 'duration_number', 'duration_unit',
       'rating_category', 'genres_list'],
      dtype='object')


In [106]:
# Example lookup: select every title whose genre list contains the requested genre
genere_richiesto = 'Docuseries'
contiene_genere = df['genres_list'].apply(lambda generi: genere_richiesto in generi)
titoli_trovati = df.loc[contiene_genere]

# Show the matching titles next to their genres
print(titoli_trovati[['title', 'genres_list']])


                                                title  \
3                               Jailbirds New Orleans   
10                Vendetta: Truth, Lies and The Mafia   
14                    Crime Stories: India Detectives   
20    Monsters Inside: The 24 Faces of Billy Milligan   
25                               Love on the Spectrum   
...                                               ...   
8712                       Weird Wonders of the World   
8740                                      Wild Alaska   
8741                                      Wild Arabia   
8755                                Women Behind Bars   
8758                           World's Busiest Cities   

                                            genres_list  
3                              [Docuseries, Reality TV]  
10    [Crime TV Shows, Docuseries, International TV ...  
14       [British TV Shows, Crime TV Shows, Docuseries]  
20    [Crime TV Shows, Docuseries, International TV ...  
25     [Docuseries, Inter

In [107]:
# Inspect the cleaned frame: the first 5000 rows (the notebook truncates the rendering)
df.iloc[:5000]

Unnamed: 0,type,title,director,cast,country,date_added,release_year,description,duration_number,duration_unit,rating_category,genres_list
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,Non disponibile,United States,2021-09-25,2020,"As her father nears the end of his life, filmm...",90.0,min,Teens - Suitable for over 13 years,[Documentaries]
1,TV Show,Blood & Water,Non disponibile,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,"After crossing paths at a party, a Cape Town t...",2.0,Seasons,Mature - Suitable for adults only,"[International TV Shows, TV Dramas, TV Mysteries]"
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Non disponibile,2021-09-24,2021,To protect his family from a powerful drug lor...,1.0,Season,Mature - Suitable for adults only,"[Crime TV Shows, International TV Shows, TV Ac..."
3,TV Show,Jailbirds New Orleans,Non disponibile,Non disponibile,Non disponibile,2021-09-24,2021,"Feuds, flirtations and toilet talk go down amo...",1.0,Season,Mature - Suitable for adults only,"[Docuseries, Reality TV]"
4,TV Show,Kota Factory,Non disponibile,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,In a city of coaching centers known to train I...,2.0,Seasons,Mature - Suitable for adults only,"[International TV Shows, Romantic TV Shows, TV..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Movie,Ladies First,Uraaz Bahl,Non disponibile,India,2018-03-08,2018,Born amid poverty and limited women's rights i...,40.0,min,General - Suitable for all audiences,"[Documentaries, International Movies, Sports M..."
4996,Movie,Bullet Head,"Paul Solet, Rick Benattar","Adrien Brody, John Malkovich, Rory Culkin, Ant...","Bulgaria, United States",2018-03-07,2017,"After a daring heist, three fugitives lock the...",94.0,min,Restricted - Suitable for adults (contains adu...,"[International Movies, Thrillers]"
4997,TV Show,Borderliner,Non disponibile,"Tobias Santelmann, Ellen Dorrit Petersen, Benj...","Norway, Germany, Sweden",2018-03-06,2017,"To protect his family, a police detective cove...",1.0,Season,Mature - Suitable for adults only,"[Crime TV Shows, International TV Shows, TV Dr..."
4998,Movie,Gad Elmaleh: American Dream,Michael Simon,Gad Elmaleh,United States,2018-03-06,2018,"In his first English-language special, comedia...",58.0,min,Teens - Suitable for over 14 years,[Stand-Up Comedy]


In [108]:
# Save the cleaned dataset.
# The target directory is created first if it is missing, because to_csv
# does not create parent directories and would fail otherwise.
# NOTE: 'genres_list' holds Python lists, which are written to CSV as their
# string representation; they need to be parsed again after reloading.
percorso_output = Path('../dataset/cleaned/netflix_titles_cleaned.csv')
percorso_output.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(percorso_output, index=False)
