In [98]:
import csv
import pandas as pd
import re

In [99]:
def cleaning_cells(file_csv):
    df = pd.read_csv(file_csv)
    # remove rows with empty 
    df = df.dropna() 
    # remove rows with 'outstanding' in cat
    df['category'] = df['category'].str.replace('outstanding', '', case=False)
    # remove ;
    df['staff'] = df['staff'].str.replace(';', '')
    df.to_csv(file_csv, index=False)
    
def print_uniques(df, column):
    uniques = df[column].value_counts().sort_index()
    print(f"unique '{column}' values:")
    for val in uniques.index:
        print(val)
        
def comma_remove(df):
    for column in df.columns:
        df[column] = df[column].apply(lambda x: x.split(',')[0] if isinstance(x, str) else x)
    return df

In [100]:
file_csv = '../dataset/the_emmy_awards.csv'
cleaning_cells(file_csv)
df = pd.read_csv(file_csv)
df = df.drop(columns=['id', 'company', 'producer'])

In [101]:
print_uniques(df, 'category')

unique 'category' values:
 ACHIEVEMENT IN MAIN TITLE THEME MUSIC
 ACHIEVEMENT IN MAKEUP
 ACHIEVEMENT IN MUSIC COMPOSITION for a limited series or a special (dramatic underscore)
 ACHIEVEMENT IN MUSIC DIRECTION
 ACHIEVEMENT IN NON-FICTION PROGRAMMING - PICTURE EDITING
 ACHIEVEMENT IN TECHNICAL DIRECTION AND electronic camerawork
 ANIMATED PROGRAM (FOR PROGRAMMING ONE HOUR OR LESS)
 ANIMATED PROGRAM (FOR PROGRAMMING one hour or less.)
 ART DIRECTION FOR A COMEDY-VARIETY or music series
 ART DIRECTION FOR A SERIES
 ART DIRECTION FOR A VARIETY OR MUSIC PROGRAM
 Actor In A Short Form Comedy Or Drama Series
 Actor in a Short Form Comedy or Drama Series
 Actress In A Short Form Comedy Or Drama Series
 Actress in a Short Form Comedy or Drama Series
 Animated Program
 Animated Program (for programming less than one hour)
 Animated Program (for programming one hour or more)
 Art Direction For A Contemporary Or Fantasy Series (Single-Camera)
 Art Direction For A Contemporary Program (Half-Hour Or

Removal of first useless lines

In [102]:
removes = [
    'costume', 'Costumes', 'creative', 'Hairstyling', 'Host', 'Individual', 'Innovation', 'Informational',
    'LIGHTING', 'MUSIC', 'CAMERA', 'Make up', 'Makeup', 'Make-up', 'Design', 'Editing', 'Reality', 'SOUND',
    'Short Form', 'Short-Format', 'Special Class', 'Stunt', 'Technical', 'animated', 'art direction', 'casting',
    'choreography', 'cinematography', 'commercial', 'commercials', 'composing', 'costume', 'costumes', 'Interactive',
    'Children\'s Program', 'Documentary', 'Competition', 'Visual Effects', 'Drama Series', 'Limited Series',
    'Television Movie', 'Variety Talk Series', 'Variety Sketch Series', 'Variety Special', 'Short Form Comedy or Drama Series'
]

categories_to_remove = ['Comedy Series', 'Drama Series',' Limited Series', ' Made For Television Movie', 'Miniseries', 'Miniseries or Movie', ' Nonfiction Program (Alternative)', ' Nonfiction Series', ' NON-FICTION SERIES - area award', ' Nonfiction Series (Traditional)', ' Nonfiction Series', ' Television Movie', ' Variety Series', ' Variety Sketch Series']
filtered_df = df[~df['category'].isin(categories_to_remove)]

emmy_prizes = df.copy()
for word in removes:
    emmy_prizes = emmy_prizes[~emmy_prizes['category'].str.contains(word, case=False)]

emmy_prizes.to_csv('../dataset/the_emmy_awards.csv', index=False)

In [103]:
print_uniques(df, 'category')

unique 'category' values:
 ACHIEVEMENT IN MAIN TITLE THEME MUSIC
 ACHIEVEMENT IN MAKEUP
 ACHIEVEMENT IN MUSIC COMPOSITION for a limited series or a special (dramatic underscore)
 ACHIEVEMENT IN MUSIC DIRECTION
 ACHIEVEMENT IN NON-FICTION PROGRAMMING - PICTURE EDITING
 ACHIEVEMENT IN TECHNICAL DIRECTION AND electronic camerawork
 ANIMATED PROGRAM (FOR PROGRAMMING ONE HOUR OR LESS)
 ANIMATED PROGRAM (FOR PROGRAMMING one hour or less.)
 ART DIRECTION FOR A COMEDY-VARIETY or music series
 ART DIRECTION FOR A SERIES
 ART DIRECTION FOR A VARIETY OR MUSIC PROGRAM
 Actor In A Short Form Comedy Or Drama Series
 Actor in a Short Form Comedy or Drama Series
 Actress In A Short Form Comedy Or Drama Series
 Actress in a Short Form Comedy or Drama Series
 Animated Program
 Animated Program (for programming less than one hour)
 Animated Program (for programming one hour or more)
 Art Direction For A Contemporary Or Fantasy Series (Single-Camera)
 Art Direction For A Contemporary Program (Half-Hour Or

In [104]:
colonne = ['staff']
emmy_prizes = comma_remove(emmy_prizes)
emmy_prizes.to_csv('../dataset/the_emmy_awards.csv', index=False)

In [118]:
import pandas as pd

# Carica il dataset dei film
df_movies = pd.read_csv('../dataset/COPIA_rounded_updated_films.csv')
df_movies['dir_emmy_nom'] = 0
df_movies['dir_emmy_won'] = 0
df_movies['writer_emmy_won'] = 0
df_movies['writer_emmy_nom'] = 0
df_movies['act_emmy_nom'] = 0
df_movies['act_emmy_won'] = 0

# Carica il dataset dei premi Emmy
df_emmies = pd.read_csv('../dataset/the_emmy_awards.csv')

# Funzione per normalizzare e dividere i nomi
def normalize_split_names(names):
    return [name.strip() for name in names.split(',')]

# Iterare su ogni riga del dataset degli Emmy
for index, row in df_emmies.iterrows():
    staff_members = normalize_split_names(row['staff'])
    emmy_year = row['year']
    is_winner = row['win']

    # Controllare corrispondenze nel dataset movies
    for staff in staff_members:
        for column in ['actors', 'writer', 'director']:
            matches = df_movies[column].apply(lambda x: staff in str(x).split(', '))
            valid_years = df_movies['year'] >= (emmy_year - 1)
            matches = matches & valid_years

            # Aggiorna i contatori per direttori, scrittori e attori
            if column == 'director':
                df_movies.loc[matches, 'dir_emmy_nom'] += 1
                if is_winner:
                    df_movies.loc[matches, 'dir_emmy_won'] += 1
            elif column == 'writer':
                df_movies.loc[matches, 'writer_emmy_nom'] += 1
                if is_winner:
                    df_movies.loc[matches, 'writer_emmy_won'] += 1
            elif column == 'actors':
                df_movies.loc[matches, 'act_emmy_nom'] += 1
                if is_winner:
                    df_movies.loc[matches, 'act_emmy_won'] += 1

# Salvare il dataset aggiornato
df_movies.to_csv('movies_with_emmies.csv', index=False)
