# Progetto di IUM
### Mangini Dario, 2024-2025

### Importazione delle librerie e creazione dei dataframe

In [1]:
import pandas as pd
import os
import numpy as np

# Read every CSV under data/main_data into its own DataFrame. The paths are
# relative, so they resolve against the kernel's current working directory.
df_movies = pd.read_csv('data/main_data/movies.csv')
df_actors = pd.read_csv('data/main_data/actors.csv')
df_countries = pd.read_csv('data/main_data/countries.csv')
df_crew = pd.read_csv('data/main_data/crew.csv')
df_genres = pd.read_csv('data/main_data/genres.csv')
df_languages = pd.read_csv('data/main_data/languages.csv')
df_posters = pd.read_csv('data/main_data/posters.csv')
df_releases = pd.read_csv('data/main_data/releases.csv')
df_studios = pd.read_csv('data/main_data/studios.csv')
df_themes = pd.read_csv('data/main_data/themes.csv')

# Read the two CSVs kept separately under data/additional_data.
df_rotten = pd.read_csv('data/additional_data/rotten_tomatoes_reviews.csv')
df_oscar = pd.read_csv('data/additional_data/the_oscar_awards.csv')

# Primo controllo e pulizia dei dati per ogni dataset

## 1. Movies.csv

### Prima visualizzazione dei dati di movies

In [2]:
df_movies

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [3]:
df_movies.shape

(941597, 7)

In [4]:
df_movies.dtypes

id               int64
name            object
date           float64
tagline         object
description     object
minute         float64
rating         float64
dtype: object

Faccio una copia del dataframe, perché lavorare direttamente sull'originale potrebbe causare problemi con altre operazioni in seguito.

In [16]:
# Clean on an independent (deep) copy so the raw frame read from disk stays untouched
df_movies_clean = df_movies.copy(deep=True)

### Visualizzazione e gestione dei dati nulli per movies

In [17]:
# Number of missing values in each column
print(df_movies_clean.isna().sum())

id                  0
name               10
date            91913
tagline        802210
description    160812
minute         181570
rating         850598
dtype: int64


In [18]:
# Show every row that has at least one missing field
df_movies_clean.loc[df_movies_clean.isna().any(axis='columns')]

Unnamed: 0,id,name,date,tagline,description,minute,rating
34,1000035,Black Swan,2010.0,,A journey through the psyche of a young baller...,108.0,4.15
68,1000069,Past Lives,2023.0,,"Nora and Hae Sung, two childhood friends, are ...",106.0,4.18
133,1000134,Toy Story,1995.0,,"Led by Woody, Andy's toys live happily in his ...",81.0,4.12
146,1000147,Requiem for a Dream,2000.0,,The hopes and dreams of four ambitious people ...,102.0,4.10
162,1000163,Asteroid City,2023.0,,Set in a fictional American desert town circa ...,105.0,3.52
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [19]:
# Rows whose 'name' is missing
righe_con_na = df_movies_clean.loc[df_movies_clean['name'].isnull()]
righe_con_na

Unnamed: 0,id,name,date,tagline,description,minute,rating
287514,1287515,,2015.0,,NONE is a short film that explores the balance...,4.0,
617642,1617643,,,,,,
646520,1646521,,2008.0,,,,
648185,1648186,,,,,,
720294,1720295,,,,"In this directorial debut of Eden Ewardson, he...",8.0,
725369,1725370,,,,,,
741481,1741482,,,,,90.0,
840337,1840338,,,,,,
883228,1883229,,,,,,
894771,1894772,,,,,,


Rimuovo le righe dove name è null, perché non mi è utile tenere film di cui non conosco neanche il nome.

In [20]:
# Discard the rows (axis='index') that have no value in 'name'
df_movies_clean = df_movies_clean.dropna(axis='index', subset=['name'])

In [21]:
# Rows whose 'date' is missing
righe_con_na = df_movies_clean.loc[df_movies_clean['date'].isnull()]
righe_con_na

Unnamed: 0,id,name,date,tagline,description,minute,rating
3529,1003530,Spider-Man: Beyond the Spider-Verse,,,Taking place right after the events of Spider-...,,
5493,1005494,Wake Up Dead Man: A Knives Out Mystery,,,Benoit Blanc returns in his most dangerous cas...,,
7119,1007120,My Year of Rest and Relaxation,,,"Bored with her seemingly meaningless life, a y...",,
7914,1007915,Frankenstein,,,"Dr. Victor Frankenstein, a brilliant but egoti...",,
8110,1008111,Mad Max: The Wasteland,,,Mad Max: The Wasteland is an upcoming film in ...,,
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


Da quanto mostrato sopra possiamo dedurre che i film senza data sono film che devono ancora uscire, per cui possiamo sostituire il valore mancante con un valore simbolico come zero, così da ricordarci di questo fatto in seguito.

In [22]:
# Use 0 as the placeholder for a missing 'date'; all other columns are left as they are
df_movies_clean = df_movies_clean.fillna({'date': 0})

Per le righe dove tagline e description sono nulli non ho interesse a modificare il valore perché dubito che possano essere utilizzati durante l'analisi successivamente.

In [23]:
# Rows whose 'minute' (running time) is missing
righe_con_na = df_movies_clean.loc[df_movies_clean['minute'].isnull()]
righe_con_na

Unnamed: 0,id,name,date,tagline,description,minute,rating
2572,1002573,Nosferatu,2024.0,,A gothic tale of obsession between a haunted y...,,
3529,1003530,Spider-Man: Beyond the Spider-Verse,0.0,,Taking place right after the events of Spider-...,,
4679,1004680,The Batman - Part II,2026.0,,Sequel to the 2022 film The Batman.,,
5493,1005494,Wake Up Dead Man: A Knives Out Mystery,0.0,,Benoit Blanc returns in his most dangerous cas...,,
5554,1005555,Wicked,2024.0,Everyone deserves a chance to fly.,"Elphaba, an ostracized but defiant girl born w...",,
...,...,...,...,...,...,...,...
941589,1941590,機動戦士ガンダムUC: ユニコーンの日,0.0,,,,
941590,1941591,火线干探之革命,0.0,,,,
941591,1941592,画江湖之不良人,0.0,,,,
941592,1941593,神笛,0.0,,,,


Nelle righe dove minute (la durata) ha valore nullo sostituisco come valore simbolico -1.

In [24]:
# Use -1 as the placeholder for a missing 'minute'; all other columns are left as they are
df_movies_clean = df_movies_clean.fillna({'minute': -1})

In [25]:
# Rows whose 'rating' is missing
righe_con_na = df_movies_clean.loc[df_movies_clean['rating'].isnull()]
righe_con_na

Unnamed: 0,id,name,date,tagline,description,minute,rating
2532,1002533,MaXXXine,2024.0,Hollywood is a killer.,"In 1980s Hollywood, adult film star and aspiri...",104.0,
2572,1002573,Nosferatu,2024.0,,A gothic tale of obsession between a haunted y...,-1.0,
3257,1003258,Deadpool & Wolverine,2024.0,Come together.,A listless Wade Wilson toils away in civilian ...,127.0,
3348,1003349,Joker: Folie à Deux,2024.0,The world is a stage.,A sequel to the 2019 film Joker.,120.0,
3529,1003530,Spider-Man: Beyond the Spider-Verse,0.0,,Taking place right after the events of Spider-...,-1.0,
...,...,...,...,...,...,...,...
941592,1941593,神笛,0.0,,,-1.0,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,0.0,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,0.0,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,0.0,,"In a world where order has broken down, darkne...",-1.0,


Nelle righe dove rating (la valutazione) è nulla sostituisco come valore simbolico -1


In [26]:
# Use -1 as the placeholder for a missing 'rating'; all other columns are left as they are
df_movies_clean = df_movies_clean.fillna({'rating': -1})

### Controllo e gestione dei valori duplicati per movies


In [27]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged)
duplicates = df_movies_clean.loc[df_movies_clean.duplicated(keep='first')]

duplicates

Unnamed: 0,id,name,date,tagline,description,minute,rating


In [28]:
# Rows whose 'name' already appeared in an earlier row
duplicates = df_movies_clean.loc[df_movies_clean['name'].duplicated()]

duplicates

Unnamed: 0,id,name,date,tagline,description,minute,rating
389,1000390,Scream,2022.0,It's always someone you know.,Twenty-five years after a streak of brutal mur...,114.0,3.21
453,1000454,Suspiria,2018.0,Give your soul to the dance.,A darkness swirls at the center of a world-ren...,152.0,3.74
473,1000474,Mean Girls,2024.0,Plastic is forever.,New student Cady Heron is welcomed into the to...,113.0,2.69
668,1000669,The Little Mermaid,1989.0,Somewhere under the sea and beyond your imagin...,This colorful adventure tells the story of an ...,83.0,3.67
704,1000705,Beauty and the Beast,2017.0,Be our guest.,A live-action adaptation of Disney's version o...,129.0,3.07
...,...,...,...,...,...,...,...
941512,1941513,Untouchable,0.0,,Zun Fei (Shen Teng) with a group of brothers t...,-1.0,-1.00
941514,1941515,Vagabond,0.0,,"... shot at the ""Love-In"" held in Elysian Park...",6.0,-1.00
941524,1941525,Waste,0.0,,When Orel learns that God hates waste he decid...,11.0,-1.00
941532,1941533,Without,0.0,,,6.0,-1.00


In [29]:
# Every film titled exactly 'Scream', as a concrete example of a repeated name
scream_rows = df_movies_clean[df_movies_clean['name'].eq('Scream')]

# Print a caption, then let the notebook render the matching rows
print("Righe con il nome 'Scream':")
scream_rows

Righe con il nome 'Scream':


Unnamed: 0,id,name,date,tagline,description,minute,rating
122,1000123,Scream,1996.0,Someone has taken their love of scary movies o...,A killer known as Ghostface begins killing off...,112.0,4.02
389,1000390,Scream,2022.0,It's always someone you know.,Twenty-five years after a streak of brutal mur...,114.0,3.21
28275,1028276,Scream,1981.0,It was the perfect weekend vacation until THE ...,A group of people on a rafting excursion happe...,82.0,2.18
143379,1143380,Scream,1978.0,,"Film consists of shots of Belgrade Zoo, animal...",9.0,-1.0
149643,1149644,Scream,1949.0,,"Scream, called the first Turkish horror/thrill...",85.0,-1.0
251118,1251119,Scream,2007.0,,A group of new students register to be members...,92.0,-1.0
359087,1359088,Scream,2023.0,,"Mels, who was born without legs, lives in a sm...",108.0,-1.0
414221,1414222,Scream,1964.0,,,98.0,-1.0
431991,1431992,Scream,2021.0,,"An experimental essay film about terrorism, me...",47.0,-1.0
504961,1504962,Scream,2018.0,,Mongolian horror anthology,80.0,-1.0


Dall'esempio di sopra possiamo capire che solo il nome non basta per capire se un duplicato si può eliminare, dato che ci possono essere film con lo stesso nome ma fatti in anni diversi.

In [30]:
# keep=False flags every member of a group sharing the same ('name', 'date') pair,
# including the first occurrence
duplicati_name_date = df_movies_clean.loc[
    df_movies_clean.duplicated(subset=['name', 'date'], keep=False)
]

print("Tutte le occorrenze duplicate nella colonna 'name' e 'date':")
duplicati_name_date

Tutte le occorrenze duplicate nella colonna 'name' e 'date':


Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
58,1000059,Taxi Driver,1976.0,"On every street in every city, there's a nobod...",A mentally unstable Vietnam War veteran works ...,114.0,4.17
63,1000064,Arrival,2016.0,Why are they here?,Taking place after alien crafts land around th...,116.0,4.12
68,1000069,Past Lives,2023.0,,"Nora and Hae Sung, two childhood friends, are ...",106.0,4.18
85,1000086,Us,2019.0,Watch yourself,Husband and wife Gabe and Adelaide Wilson take...,116.0,3.65
...,...,...,...,...,...,...,...
941505,1941506,Unravel,0.0,"Alexia, a young woman who struggles with anxie...",Along her journey to relieve herself of anxiet...,7.0,-1.00
941510,1941511,Untitled Project,0.0,,A narrator is simply trying to do his job.,-1.0,-1.00
941514,1941515,Vagabond,0.0,,"... shot at the ""Love-In"" held in Elysian Park...",6.0,-1.00
941524,1941525,Waste,0.0,,When Orel learns that God hates waste he decid...,11.0,-1.00


Elimino i dati dove nome e data sono uguali per l'improbabilità che venga fatto un film con lo stesso nome nello stesso anno.

In [31]:
# Drop rows that repeat an earlier ('name', 'date') pair, keeping the first occurrence.
# Rows with an unknown date are exempt: earlier in the notebook missing dates were
# replaced by the placeholder 0, so two undated films that merely share a title would
# otherwise be treated as "same name, same year" and one of them silently lost.
repeats_earlier_row = df_movies_clean.duplicated(subset=['name', 'date'])
has_known_date = df_movies_clean['date'].notna() & (df_movies_clean['date'] != 0)
df_movies_clean = df_movies_clean[~(repeats_earlier_row & has_known_date)]

### Conversione e ultime operazioni per movies

In [32]:
# Cast the text columns to str while keeping missing values missing.
# A plain astype(str) turns NaN into the literal string 'nan', which would make the
# missing taglines/descriptions indistinguishable from real text; mask() puts NaN
# back wherever the original value was null.
for text_col in ('name', 'tagline', 'description'):
    was_missing = df_movies_clean[text_col].isna()
    df_movies_clean[text_col] = df_movies_clean[text_col].astype(str).mask(was_missing)
# Rename the 'date' column to 'year'
df_movies_clean = df_movies_clean.rename(columns={'date': 'year'})
# Store the year as an integer: converting it to a date type would add a day and a month
df_movies_clean['year'] = df_movies_clean['year'].astype(int)
# Store the running time as an integer
df_movies_clean['minute'] = df_movies_clean['minute'].astype(int)

In [34]:
df_movies_clean.head()

Unnamed: 0,id,name,year,tagline,description,minute,rating
0,1000001,Barbie,2023,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114,3.86
1,1000002,Parasite,2019,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133,4.56
2,1000003,Everything Everywhere All at Once,2022,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140,4.3
3,1000004,Fight Club,1999,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139,4.27
4,1000005,La La Land,2016,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129,4.09


## 2. Actors.csv

### Prima visualizzazione dei dati di Actors

In [35]:
df_actors

Unnamed: 0,id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie
...,...,...,...
5798445,1941596,Marc Ma,Ba Cai/巴莱
5798446,1941596,线雨轩,Tata/塔塔
5798447,1941596,Jiang Yixuan,Zuo Yila（Zoila）/佐伊拉
5798448,1941597,Hiroshi Mikami,


In [36]:
df_actors.dtypes

id       int64
name    object
role    object
dtype: object

Faccio una copia del dataframe, perché lavorare direttamente sull'originale potrebbe causare problemi con altre operazioni in seguito.

In [37]:
# Clean on an independent (deep) copy so the raw frame read from disk stays untouched
df_actors_clean = df_actors.copy(deep=True)

### Controllo e gestione dei valori nulli per Actors

In [38]:
# Number of missing values in each column
print(df_actors_clean.isna().sum())

id            0
name          4
role    1361559
dtype: int64


In [39]:
# Rows whose 'name' is missing
righe_con_na = df_actors_clean.loc[df_actors_clean['name'].isnull()]
righe_con_na

Unnamed: 0,id,name,role
4145738,1443629,,
4281100,1469981,,Self
4306960,1474958,,Cinematography
5430275,1773264,,


Elimino le righe dove il nome è nullo perché non sono utili e sono poche.

In [40]:
# Discard the rows (axis='index') that have no value in 'name'
df_actors_clean = df_actors_clean.dropna(axis='index', subset=['name'])

### Controllo e gestione dei valori duplicati per Actors


In [41]:
# Boolean mask: True for each row that repeats an earlier row in every column
duplicati = df_actors_clean.duplicated(keep='first')
# Select the repeated rows up front; they are rendered at the end of the cell
righe_duplicati = df_actors_clean.loc[duplicati]

# Summing the mask counts its True entries, i.e. the repeated rows
numero_duplicati = duplicati.sum()
print(f"Numero di righe duplicate: {numero_duplicati}")

print("Righe duplicate:")
righe_duplicati

Numero di righe duplicate: 946
Righe duplicate:


Unnamed: 0,id,name,role
3993,1000062,Rosie Jones,Lady of the Boot of Jemiah
44642,1000797,Karel Heřmánek,Czech Injured Man
47807,1000863,Michael Fennimore,Car Salesman
117813,1002509,Isabel Rodriguez,Dancer
125321,1002704,Harry Sabin,Additional Muppet Performer
...,...,...,...
5788300,1935883,Denis Lovrinović,
5791298,1937512,David Livet,
5791299,1937512,David Livet,
5792884,1939290,Gudni Oddgeirsson,Interviewee


In [42]:
# Remove fully repeated rows, retaining the first occurrence of each
df_actors_clean = df_actors_clean.drop_duplicates(keep='first')

### Conversione e ultime operazioni per Actors

In [43]:
# Cast the text columns to str while keeping missing values missing.
# 'role' is null for a large share of the rows; a plain astype(str) would turn each
# of those NaN into the literal string 'nan', i.e. an apparent role called "nan".
# mask() puts NaN back wherever the original value was null.
for text_col in ('name', 'role'):
    was_missing = df_actors_clean[text_col].isna()
    df_actors_clean[text_col] = df_actors_clean[text_col].astype(str).mask(was_missing)

In [44]:
# 'id' holds the film identifier, so give the column the explicit name 'film_id'
df_actors_clean = df_actors_clean.rename({'id': 'film_id'}, axis='columns')

In [45]:
df_actors_clean.head()

Unnamed: 0,film_id,name,role
0,1000001,Margot Robbie,Barbie
1,1000001,Ryan Gosling,Ken
2,1000001,America Ferrera,Gloria
3,1000001,Ariana Greenblatt,Sasha
4,1000001,Issa Rae,Barbie


## 3. Countries.csv

### Prima visualizzazione dei dati di Countries

In [46]:
df_countries

Unnamed: 0,id,country
0,1000001,UK
1,1000001,USA
2,1000002,South Korea
3,1000003,USA
4,1000004,Germany
...,...,...
693471,1941593,China
693472,1941594,USA
693473,1941595,USA
693474,1941596,China


In [47]:
df_countries.dtypes

id          int64
country    object
dtype: object

Faccio una copia del dataframe, dato che potrebbe causare problemi con altre operazioni in seguito.

In [48]:
# Clean on an independent (deep) copy so the raw frame read from disk stays untouched
df_countries_clean = df_countries.copy(deep=True)

### Controllo e gestione dei valori nulli per countries

In [49]:
# Number of missing values in each column
print(df_countries_clean.isna().sum())

id         0
country    0
dtype: int64


### Controllo e gestione dei valori duplicati per countries

In [50]:
    # Verifica le righe duplicate considerando tutte le colonne
duplicati = df_countries_clean.duplicated(keep='first')
# Select the repeated rows up front; they are rendered at the end of the cell
righe_duplicati = df_countries_clean.loc[duplicati]

# Summing the boolean mask counts its True entries, i.e. the repeated rows
numero_duplicati = duplicati.sum()
print(f"Numero di righe duplicate: {numero_duplicati}")

print("Righe duplicate:")
righe_duplicati

Numero di righe duplicate: 0
Righe duplicate:


Unnamed: 0,id,country


### Conversione e ultime operazioni per countries

In [51]:
# Cast 'country' to str and rename the key column 'id' to 'film_id' in a single chain
df_countries_clean = (
    df_countries_clean
    .astype({'country': str})
    .rename(columns={'id': 'film_id'})
)

df_countries_clean.head(5)

Unnamed: 0,film_id,country
0,1000001,UK
1,1000001,USA
2,1000002,South Korea
3,1000003,USA
4,1000004,Germany


Controllo che ci sia solo la dicitura 'UK' per rappresentare il Regno Unito e solo la dicitura 'USA' per gli Stati Uniti.


In [52]:
# Spellings that should all be reported as 'UK'
varianti_uk = [
    'UK', 'GB', 'GBR', 'United Kingdom', 'Great Britain',
    'United Kingdom of Great Britain and Northern Ireland',
    'England', 'Scotland', 'Wales', 'Northern Ireland', 'Britain'
]

# Spellings that should all be reported as 'USA'
varianti_usa = [
    'USA', 'US', 'U.S.', 'U.S.A.', 'United States',
    'United States of America', 'America'
]

# One lookup table from each variant to its canonical label
canonical_label = {variante: 'UK' for variante in varianti_uk}
canonical_label.update({variante: 'USA' for variante in varianti_usa})

# Exact, whole-value substitution (regex=False); values not in the table are unchanged
df_countries_clean['country'] = df_countries_clean['country'].replace(
    canonical_label, regex=False
)

# List the distinct country values left after the substitution
print(df_countries_clean['country'].unique())

['UK' 'USA' 'South Korea' 'Germany' 'Hong Kong' 'Canada' 'Sweden'
 'Ireland' 'Japan' 'China' 'France' 'Brazil' 'Italy' 'Czechia'
 'New Zealand' 'Australia' 'India' 'Spain' 'Austria' 'Greece'
 'Netherlands' 'Poland' 'Taiwan' 'Denmark' 'Norway' 'Mexico' 'Switzerland'
 'Turkey' 'Finland' 'USSR' 'Singapore' 'Chile' 'Belgium'
 'United Arab Emirates' 'Malta' 'Hungary' 'South Africa' 'Bulgaria'
 'Czechoslovakia' 'Argentina' 'Iceland' 'Indonesia' 'Slovenia' 'Iran'
 'Luxembourg' 'Philippines' 'Russian Federation' 'Malaysia' 'Portugal'
 'Serbia' 'Algeria' 'Estonia' 'Romania' 'Colombia' 'Thailand' 'Cyprus'
 'Lebanon' 'Qatar' 'Peru' 'Cambodia' 'Bahamas' 'Israel' 'Puerto Rico'
 'Cuba' 'Senegal' 'Bosnia and Herzegovina' 'Georgia' 'Guadeloupe'
 'Uruguay' 'Pakistan' 'Sri Lanka' 'Latvia' 'Tunisia' 'Dominican Republic'
 'Yugoslavia' 'Bolivarian Republic of Venezuela' 'Montenegro' 'Egypt'
 'Ethiopia' 'Ghana' 'Jordan' 'Namibia' 'Saudi Arabia' 'Angola' 'Mali'
 'Myanmar' 'Morocco' 'Syrian Arab Republic' 'Ug

## 4. Crew

### Prima visualizzazione dei dati di Crew


In [53]:
df_crew

Unnamed: 0,id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
...,...,...,...
4720178,1941596,Casting,线雨轩
4720179,1941596,Editor,Eric Kwong Chi-Leung
4720180,1941596,Cinematography,Kenny Tse
4720181,1941596,Composer,胡小欧


In [54]:
df_crew.dtypes

id       int64
role    object
name    object
dtype: object

Faccio una copia del dataframe, dato che potrebbe causare problemi con altre operazioni in seguito.

In [55]:
# Clean on an independent (deep) copy so the raw frame read from disk stays untouched
df_crew_clean = df_crew.copy(deep=True)

### Controllo e gestione dei nulli per crew

In [56]:
# Number of missing values in each column
print(df_crew_clean.isna().sum())

id      0
role    0
name    1
dtype: int64


In [57]:
# Rows whose 'name' is missing
righe_con_na = df_crew_clean.loc[df_crew_clean['name'].isnull()]
righe_con_na

Unnamed: 0,id,role,name
4562126,1859397,Writer,


In [58]:
# Discard the rows (axis='index') that have no value in 'name'
df_crew_clean = df_crew_clean.dropna(axis='index', subset=['name'])

### Controllo e gestione dei duplicati per crew

In [59]:
# Boolean mask: True for each row that repeats an earlier row in every column
duplicati = df_crew_clean.duplicated(keep='first')
# Select the repeated rows up front; they are rendered at the end of the cell
righe_duplicati = df_crew_clean.loc[duplicati]

# Summing the mask counts its True entries, i.e. the repeated rows
numero_duplicati = duplicati.sum()
print(f"Numero di righe duplicate: {numero_duplicati}")

print("Righe duplicate:")
righe_duplicati

Numero di righe duplicate: 1282
Righe duplicate:


Unnamed: 0,id,role,name
1721,1000018,Stunts,Chris Webb
2691,1000031,Stunts,Sarah Irwin
2692,1000031,Stunts,Sarah Irwin
3920,1000043,Set decoration,Chris Arnold
6655,1000074,Casting,Magui Jimenez
...,...,...,...
4716242,1940117,Sound,Greg Francis
4716243,1940117,Sound,Greg Francis
4718342,1940904,Assistant director,Choe Yeong-sik
4719628,1941357,Executive producer,Josh Earl


In [60]:
# Remove fully repeated rows, retaining the first occurrence of each
df_crew_clean = df_crew_clean.drop_duplicates(keep='first')

### Conversione e ultime operazioni per crew

In [62]:
# Cast both text columns to str and rename the key column 'id' to 'film_id' in a single chain
df_crew_clean = (
    df_crew_clean
    .astype({'role': str, 'name': str})
    .rename(columns={'id': 'film_id'})
)

df_crew_clean.head(5)

Unnamed: 0,film_id,role,name
0,1000001,Director,Greta Gerwig
1,1000001,Producer,Tom Ackerley
2,1000001,Producer,Margot Robbie
3,1000001,Producer,Robbie Brenner
4,1000001,Producer,David Heyman
