# KNN - CLASSIFICATION

In [133]:
import ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [134]:
# Read the train/test CSV files and keep the loaded frames untouched:
# every later transformation is applied to the working copies.
train_df, test_df = (pd.read_csv(path) for path in ("pp_train.csv", "pp_test.csv"))
copy_train, copy_test = train_df.copy(), test_df.copy()

In [135]:
# Parse the stringified country lists into real Python lists.
# Values that are not strings (e.g. already parsed by a previous run of this
# cell) are left untouched, so re-running the cell does not raise inside
# ast.literal_eval.
copy_train['countryOfOrigin'] = copy_train['countryOfOrigin'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
copy_train.head()

Unnamed: 0,originalTitle,rating,startYear,endYear,runtimeMinutes,awardWins,numVotes,totalImages,totalVideos,totalCredits,...,is_Fantasy,is_Family,is_Mystery,is_Talk-Show,is_Drama,is_Sport,is_War,is_Horror,is_Animation,fill_runtimeMinutes
0,Neogenic Nightmare Chapter 3: Hydro-Man,8,1995,,,0.0,779,1,0,21,...,0,0,0,0,0,0,0,0,1,40.0
1,Looping,6,1982,,,0.0,11,1,0,1,...,0,0,0,0,0,0,0,0,0,28.0
2,Idealnaya para,6,1992,,,0.0,38,1,0,24,...,0,0,0,0,0,0,0,0,0,90.0
3,MasterChef Celebrity México,6,2021,,,0.0,15,22,0,56,...,0,0,0,0,0,0,0,0,0,31.0
4,Seutateueob,8,2020,2020.0,80.0,1.0,15718,347,2,47,...,0,0,0,0,1,0,0,0,0,80.0


In [136]:
# Parse the stringified country lists of the test set into real Python lists.
# Values that are not strings (e.g. already parsed by a previous run of this
# cell) are left untouched, so re-running the cell does not raise inside
# ast.literal_eval.
copy_test['countryOfOrigin'] = copy_test['countryOfOrigin'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
copy_test.head()

Unnamed: 0,originalTitle,rating,startYear,endYear,runtimeMinutes,awardWins,numVotes,totalImages,totalVideos,totalCredits,...,is_Short,is_Action,is_Adult,is_History,is_Musical,is_Crime,is_Film-Noir,is_War,is_Comedy,fill_runtimeMinutes
0,Geologist of Animal Collective Performs Live a...,10,2015,,21.0,0.0,15,7,2,11,...,1,0,0,0,0,0,0,0,0,21.0
1,Shan gou 1999,5,1999,,,0.0,304,10,0,40,...,0,0,0,0,0,0,0,0,0,91.0
2,The Craftsman: Preserving the Last Higgins Boat,9,2023,,,0.0,10,2,0,6,...,0,0,0,0,0,0,0,0,0,73.0
3,Week 3: Tuesday,7,2020,,,0.0,11,0,0,52,...,0,0,0,0,0,0,0,0,0,33.0
4,Traumhaus,6,1999,,90.0,0.0,145,2,0,44,...,0,0,0,0,0,1,0,0,0,90.0


In [137]:
# Spot-check a single parsed cell of the test set and show its Python type.
val = copy_test["countryOfOrigin"].loc[23]
type(val)

list

### Preprocessing delle variabili

##### Eliminazione variabili inutilizzabili

In [138]:
# List the training-set columns. The genre flags (is_*) appear in a different
# order than in the test set, which matters for any positional use of the frames.
copy_train.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits',
       'criticReviewsTotal', 'titleType', 'awardNominationsExcludeWins',
       'canHaveEpisodes', 'isAdult', 'numRegions', 'userReviewsTotal',
       'countryOfOrigin', 'is_Documentary', 'is_History', 'is_Adventure',
       'is_Thriller', 'is_Game-Show', 'is_Comedy', 'is_Sci-Fi', 'is_Romance',
       'is_Biography', 'is_Musical', 'is_Western', 'is_Music', 'is_Film-Noir',
       'is_Adult', 'is_Reality-TV', 'is_News', 'is_Action', 'is_Crime',
       'is_Short', 'is_Fantasy', 'is_Family', 'is_Mystery', 'is_Talk-Show',
       'is_Drama', 'is_Sport', 'is_War', 'is_Horror', 'is_Animation',
       'fill_runtimeMinutes'],
      dtype='object')

In [139]:
# List the test-set columns. The genre flags (is_*) appear in a different
# order than in the training set, which matters for any positional use of the frames.
copy_test.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'totalImages', 'totalVideos', 'totalCredits',
       'criticReviewsTotal', 'titleType', 'awardNominationsExcludeWins',
       'canHaveEpisodes', 'isAdult', 'numRegions', 'userReviewsTotal',
       'countryOfOrigin', 'is_Western', 'is_Thriller', 'is_Horror',
       'is_Animation', 'is_Family', 'is_News', 'is_Sport', 'is_Reality-TV',
       'is_Biography', 'is_Sci-Fi', 'is_Fantasy', 'is_Documentary',
       'is_Talk-Show', 'is_Drama', 'is_Music', 'is_Game-Show', 'is_Adventure',
       'is_Romance', 'is_Mystery', 'is_Short', 'is_Action', 'is_Adult',
       'is_History', 'is_Musical', 'is_Crime', 'is_Film-Noir', 'is_War',
       'is_Comedy', 'fill_runtimeMinutes'],
      dtype='object')

In [140]:
# Remove the free-text title and the raw runtime from both frames
# (a separate `fill_runtimeMinutes` column is present in the data).
copy_train, copy_test = (
    frame.drop(columns=["originalTitle", "runtimeMinutes"])
    for frame in (copy_train, copy_test)
)

##### Mapping variabile target

In [141]:
# Original (string) target labels of the training set
y = np.array(copy_train['titleType'])

# Map every category to an integer id, numbered in order of first appearance
# in the training set
y_map = {
    title: code
    for code, title in enumerate(copy_train['titleType'].unique())
}

# Append the encoded target as a new column to both frames
# (a test category missing from y_map is mapped to NaN by Series.map)
copy_train = copy_train.assign(titleType_encoded=copy_train['titleType'].map(y_map))
copy_test = copy_test.assign(titleType_encoded=copy_test['titleType'].map(y_map))

print(y_map)  # Show the category -> id mapping

{'tvEpisode': 0, 'videoGame': 1, 'movie': 2, 'tvSeries': 3, 'video': 4, 'tvMiniSeries': 5, 'short': 6, 'tvMovie': 7, 'tvSpecial': 8, 'tvShort': 9}


##### Gestione country of origin
**TODO:** fare la stessa cosa anche sul test set e salvare.

In [142]:
# Manual lookup table for historic / non-standard country codes.
# Maps code -> {'name': display name, 'continent': continent name}.
#
# Continent names use the same spelling as pycountry_convert
# ('North America', 'South America', with a space) so that codes resolved
# through this table and codes resolved through pycountry end up in the same
# continent category.
#
# Each code appears exactly once. The table previously defined 13 codes more
# than once (XPL, XFR, XMO, XSM, XLI, XNA, XSI, XCY, XMA, XBE, XEN, XPI, XPN);
# in a dict literal only the last definition takes effect, so the shadowed
# earlier definitions were removed and the effective one is kept.
#
# NOTE(review): some keys coincide with current ISO codes (e.g. 'VEN' is the
# ISO alpha-3 code of Venezuela but is listed here as "Republic of Venice").
# This table is consulted before pycountry, so such codes resolve to the entry
# below - confirm this is intended.
HISTORIC_CODES = {
    # Europe
    'XYU': {'name': 'Yugoslavia', 'continent': 'Europe'},
    'YUCS': {'name': 'Yugoslavia (Socialist Federal Republic)', 'continent': 'Europe'},
    'YUG': {'name': 'Yugoslavia', 'continent': 'Europe'},
    'SUHH': {'name': 'Soviet Union', 'continent': 'Europe'},
    'SUN': {'name': 'Soviet Union', 'continent': 'Europe'},
    'SU': {'name': 'Soviet Union', 'continent': 'Europe'},
    'USSR': {'name': 'Union of Soviet Socialist Republics', 'continent': 'Europe'},
    'DDDE': {'name': 'East Germany', 'continent': 'Europe'},
    'DDR': {'name': 'German Democratic Republic', 'continent': 'Europe'},
    'XWG': {'name': 'East Germany', 'continent': 'Europe'},
    'BRD': {'name': 'Federal Republic of Germany (West Germany)', 'continent': 'Europe'},
    'FRGG': {'name': 'West Germany', 'continent': 'Europe'},
    'CSXX': {'name': 'Czechoslovakia', 'continent': 'Europe'},
    'CSHH': {'name': 'Czechoslovakia', 'continent': 'Europe'},
    'CSK': {'name': 'Czechoslovakia', 'continent': 'Europe'},
    'TCH': {'name': 'Czechoslovakia', 'continent': 'Europe'},
    'XCZ': {'name': 'Czechoslovakia (1945-1992)', 'continent': 'Europe'},
    'SCG': {'name': 'Serbia and Montenegro', 'continent': 'Europe'},
    'SCGN': {'name': 'Serbia and Montenegro', 'continent': 'Europe'},
    'XKV': {'name': 'Kosovo', 'continent': 'Europe'},
    'XHU': {'name': 'Hungary (1945-1989)', 'continent': 'Europe'},
    'XEU': {'name': 'European Union', 'continent': 'Europe'},
    'EUE': {'name': 'European Union', 'continent': 'Europe'},
    'XCI': {'name': 'Channel Islands', 'continent': 'Europe'},
    'XES': {'name': 'Estonia (pre-1991)', 'continent': 'Europe'},
    'XLV': {'name': 'Latvia (pre-1991)', 'continent': 'Europe'},
    'XLT': {'name': 'Lithuania (pre-1991)', 'continent': 'Europe'},
    'XAZ': {'name': 'Azores', 'continent': 'Europe'},
    'FXX': {'name': 'Metropolitan France', 'continent': 'Europe'},
    'AHU': {'name': 'Austria-Hungary', 'continent': 'Europe'},
    'PRU': {'name': 'Prussia', 'continent': 'Europe'},
    'BAV': {'name': 'Bavaria', 'continent': 'Europe'},
    'KPS': {'name': 'Kingdom of the Two Sicilies', 'continent': 'Europe'},
    'VEN': {'name': 'Republic of Venice', 'continent': 'Europe'},
    'XOH': {'name': 'Ottoman Hungary', 'continent': 'Europe'},
    'XCR': {'name': 'Crimean Khanate', 'continent': 'Europe'},
    'MON': {'name': 'Montenegro (pre-independence)', 'continent': 'Europe'},
    'VA': {'name': 'Vatican City State', 'continent': 'Europe'},
    'SRB': {'name': 'Kingdom of Serbia', 'continent': 'Europe'},
    'PDM': {'name': 'Piedmont-Sardinia', 'continent': 'Europe'},
    'XPP': {'name': 'Papal States', 'continent': 'Europe'},

    # Asia
    'BUR': {'name': 'Burma (now Myanmar)', 'continent': 'Asia'},
    'TPE': {'name': 'Chinese Taipei (Taiwan)', 'continent': 'Asia'},
    'XIR': {'name': 'Persia (now Iran)', 'continent': 'Asia'},
    'TMP': {'name': 'East Timor (pre-independence)', 'continent': 'Asia'},
    'TLS': {'name': 'East Timor', 'continent': 'Asia'},
    'XTI': {'name': 'East Timor (Portuguese Timor)', 'continent': 'Asia'},
    'YMD': {'name': 'South Yemen (People\'s Democratic Republic)', 'continent': 'Asia'},
    'YMN': {'name': 'North Yemen (Yemen Arab Republic)', 'continent': 'Asia'},
    'XHK': {'name': 'Hong Kong (pre-1997)', 'continent': 'Asia'},
    'HKJ': {'name': 'Hashemite Kingdom of Jordan', 'continent': 'Asia'},
    'XEA': {'name': 'East Asia (historical region)', 'continent': 'Asia'},
    'XSE': {'name': 'Southeast Asia (historical region)', 'continent': 'Asia'},
    'XME': {'name': 'Middle East (region)', 'continent': 'Asia'},
    'XIO': {'name': 'British Indian Ocean Territory', 'continent': 'Asia'},
    #'XEG': {'name': 'United Arab Republic (Egypt & Syria)', 'continent': 'Africa/Asia'},
    'XMB': {'name': 'Manchukuo (Japanese puppet state in Manchuria)', 'continent': 'Asia'},
    'KOR': {'name': 'Korea (pre-division)', 'continent': 'Asia'},
    'XSM': {'name': 'Siam (now Thailand)', 'continent': 'Asia'},
    'SAA': {'name': 'Sarawak (pre-Malaysia)', 'continent': 'Asia'},
    'SBH': {'name': 'North Borneo (now Sabah)', 'continent': 'Asia'},
    'XPH': {'name': 'Philippines (American period)', 'continent': 'Asia'},
    'XIN': {'name': 'British India', 'continent': 'Asia'},
    'GBG': {'name': 'Great Burhan Regime (Afghanistan)', 'continent': 'Asia'},
    'XPL': {'name': 'Palestine (British Mandate)', 'continent': 'Asia'},
    'XFI': {'name': 'French Indochina', 'continent': 'Asia'},
    'XDP': {'name': 'Portuguese India', 'continent': 'Asia'},
    'XQG': {'name': 'Qing Dynasty China', 'continent': 'Asia'},

    # Africa
    'RHO': {'name': 'Rhodesia (now Zimbabwe)', 'continent': 'Africa'},
    'XRH': {'name': 'Southern Rhodesia', 'continent': 'Africa'},
    'ZAR': {'name': 'Zaire (now Democratic Republic of the Congo)', 'continent': 'Africa'},
    'XAF': {'name': 'Afars and Issas (now Djibouti)', 'continent': 'Africa'},
    'XDY': {'name': 'Dahomey (now Benin)', 'continent': 'Africa'},
    'XUG': {'name': 'Uganda (pre-1962)', 'continent': 'Africa'},
    'XET': {'name': 'Ethiopia (pre-1993)', 'continent': 'Africa'},
    'XSU': {'name': 'Sudan (pre-2011)', 'continent': 'Africa'},
    'USR': {'name': 'Upper Volta (now Burkina Faso)', 'continent': 'Africa'},
    'XVO': {'name': 'Upper Volta', 'continent': 'Africa'},
    'XBF': {'name': 'Burkina Faso (1984-present)', 'continent': 'Africa'},
    'XLI': {'name': 'Libya (pre-2011)', 'continent': 'Africa'},
    'XTC': {'name': 'Tanganyika (now part of Tanzania)', 'continent': 'Africa'},
    'XZR': {'name': 'Zanzibar (now part of Tanzania)', 'continent': 'Africa'},
    'XTZ': {'name': 'Tanzania (1964-present)', 'continent': 'Africa'},
    'XGH': {'name': 'Gold Coast (now Ghana)', 'continent': 'Africa'},
    'XSL': {'name': 'Sierra Leone (pre-1961)', 'continent': 'Africa'},
    'XTA': {'name': 'Tangier International Zone', 'continent': 'Africa'},
    'XCV': {'name': 'Cape Verde (before 2013)', 'continent': 'Africa'},
    'XBI': {'name': 'British Somaliland', 'continent': 'Africa'},
    'XIF': {'name': 'Italian Somaliland', 'continent': 'Africa'},
    'XAL': {'name': 'Algérie française (French Algeria)', 'continent': 'Africa'},
    'XNY': {'name': 'Nyasaland (now Malawi)', 'continent': 'Africa'},
    'XBA': {'name': 'Basutoland (now Lesotho)', 'continent': 'Africa'},
    'XSW': {'name': 'South West Africa (now Namibia)', 'continent': 'Africa'},
    'XZB': {'name': 'Zambezi (colonial term)', 'continent': 'Africa'},
    'XAO': {'name': 'Angola (Portuguese colony)', 'continent': 'Africa'},
    'EH': {'name': 'Western Sahara (Sahara Occidental)', 'continent': 'Africa'},
    'XTR': {'name': 'Tripolitania (now part of Libya)', 'continent': 'Africa'},
    'XCY': {'name': 'Cyrenaica (now part of Libya)', 'continent': 'Africa'},
    'XAB': {'name': 'Abyssinia (historical Ethiopia)', 'continent': 'Africa'},
    'XBB': {'name': 'Bechuanaland (now Botswana)', 'continent': 'Africa'},
    'XMA': {'name': 'Madagascar (French colony)', 'continent': 'Africa'},
    'XMO': {'name': 'Morocco (French/Spanish protectorate)', 'continent': 'Africa'},
    'XTU': {'name': 'Tunisia (French protectorate)', 'continent': 'Africa'},

    # Americas
    'ANT': {'name': 'Netherlands Antilles', 'continent': 'North America'},
    'ANHH': {'name': 'Netherlands Antilles', 'continent': 'North America'},
    'XAN': {'name': 'Netherlands Antilles', 'continent': 'North America'},
    'XUS': {'name': 'United States (Historical)', 'continent': 'North America'},
    'XCA': {'name': 'Canada (Historical)', 'continent': 'North America'},
    'XPU': {'name': 'Panama Canal Zone', 'continent': 'North America'},
    'XMX': {'name': 'Mexico (Historical)', 'continent': 'North America'},
    'XBR': {'name': 'Brazil (Historical)', 'continent': 'South America'},
    'XAR': {'name': 'Argentina (Historical)', 'continent': 'South America'},
    'XCB': {'name': 'Caribbean Islands (collective)', 'continent': 'North America'},
    'XCL': {'name': 'Chile (pre-1990)', 'continent': 'South America'},
    'XCO': {'name': 'Colombia (pre-1991)', 'continent': 'South America'},
    'XCU': {'name': 'Cuba (pre-1959)', 'continent': 'North America'},
    'XEN': {'name': 'English Caribbean Islands', 'continent': 'North America'},
    'XFR': {'name': 'French Caribbean Islands', 'continent': 'North America'},
    'XGL': {'name': 'Greenland (pre-1979)', 'continent': 'North America'},
    'XGY': {'name': 'Guyana (British Guiana)', 'continent': 'South America'},
    'XSR': {'name': 'Suriname (Dutch Guiana)', 'continent': 'South America'},
    'XGF': {'name': 'French Guiana (Historical)', 'continent': 'South America'},
    'USC': {'name': 'Confederate States of America', 'continent': 'North America'},
    'XTX': {'name': 'Republic of Texas', 'continent': 'North America'},
    'XHT': {'name': 'Saint-Domingue (now Haiti)', 'continent': 'North America'},
    'XBH': {'name': 'British Honduras (now Belize)', 'continent': 'North America'},
    'XBW': {'name': 'British West Indies', 'continent': 'North America'},
    'XUN': {'name': 'United Provinces of Central America', 'continent': 'North America'},
    'XGP': {'name': 'Gran Colombia', 'continent': 'South America'},
    'XPR': {'name': 'Peru-Bolivian Confederation', 'continent': 'South America'},
    'XRI': {'name': 'Río de la Plata (Viceroyalty)', 'continent': 'South America'},
    'XLN': {'name': 'New Granada (Viceroyalty)', 'continent': 'South America'},
    'XNS': {'name': 'New Spain (Viceroyalty)', 'continent': 'North America'},
    'XBE': {'name': 'British Empire in America', 'continent': 'North America'},
    'XDW': {'name': 'Danish West Indies (now US Virgin Islands)', 'continent': 'North America'},

    # Oceania
    'XAU': {'name': 'Australia (pre-Federation)', 'continent': 'Oceania'},
    'XNZ': {'name': 'New Zealand (pre-1907)', 'continent': 'Oceania'},
    'PCI': {'name': 'Pacific Islands Trust Territory', 'continent': 'Oceania'},
    'XPI': {'name': 'Pacific Islands (US Trust Territory)', 'continent': 'Oceania'},
    'XFJ': {'name': 'Fiji (pre-1970)', 'continent': 'Oceania'},
    'XPN': {'name': 'Papua New Guinea (pre-1975)', 'continent': 'Oceania'},
    'XPW': {'name': 'Palau (pre-1994)', 'continent': 'Oceania'},
    'XMH': {'name': 'Marshall Islands (pre-1986)', 'continent': 'Oceania'},
    'XFM': {'name': 'Federated States of Micronesia (pre-1986)', 'continent': 'Oceania'},
    'XGE': {'name': 'Gilbert Islands (now part of Kiribati)', 'continent': 'Oceania'},
    'XEL': {'name': 'Ellice Islands (now Tuvalu)', 'continent': 'Oceania'},
    'XNH': {'name': 'New Hebrides (now Vanuatu)', 'continent': 'Oceania'},
    'PCT': {'name': 'Pacific Islands Trust Territory', 'continent': 'Oceania'},
    'XWS': {'name': 'Western Samoa (now Samoa)', 'continent': 'Oceania'},
    'XNA': {'name': 'Netherlands New Guinea', 'continent': 'Oceania'},
    'XHS': {'name': 'Hawaii (pre-US statehood)', 'continent': 'Oceania'},
    'XBP': {'name': 'British Polynesia', 'continent': 'Oceania'},
    'XFP': {'name': 'French Polynesia (historical)', 'continent': 'Oceania'},
    'XNC': {'name': 'New Caledonia (historical)', 'continent': 'Oceania'},
    'XSI': {'name': 'Solomon Islands (British protectorate)', 'continent': 'Oceania'},
}

In [143]:
import pycountry
import pycountry_convert as pc


def get_continent(country_code):
    """Return the continent name for a single country code, or None.

    Lookup order:
      1. ``HISTORIC_CODES`` (manual table of historic / non-standard codes);
      2. two-letter (alpha-2) and three-letter (alpha-3) codes via
         ``pycountry`` / ``pycountry_convert``;
      3. ``pycountry.historic_countries``, matched on the alpha-3 or the
         four-letter alpha-4 code, then mapped by name with
         ``map_historic_to_continent``.

    :param country_code: country code string (e.g. 'IT', 'ITA', 'SUHH')
    :return: continent name (e.g. 'Europe', 'North America'), or None when
             the value is not a string or the code cannot be resolved
    """
    # Missing values (NaN/None) or nested containers cannot be codes
    if not isinstance(country_code, str):
        return None

    # Check the manual historic table first
    if country_code in HISTORIC_CODES:
        # Use the same spelling as pycountry_convert ('North America', not
        # 'North_America') so both lookup paths yield one category
        return HISTORIC_CODES[country_code]['continent'].replace('_', ' ')

    try:
        # Two-letter codes (alpha-2)
        if len(country_code) == 2:
            continent_code = pc.country_alpha2_to_continent_code(country_code)
            return pc.convert_continent_code_to_continent_name(continent_code)

        # Three-letter codes (alpha-3): convert to alpha-2 first
        if len(country_code) == 3:
            country = pycountry.countries.get(alpha_3=country_code)
            if country:
                continent_code = pc.country_alpha2_to_continent_code(country.alpha_2)
                return pc.convert_continent_code_to_continent_name(continent_code)

        # Withdrawn countries known to pycountry: match either the alpha-3 or
        # the four-letter alpha-4 code (getattr: not every record is assumed
        # to expose both fields)
        historic = next(
            (
                record for record in pycountry.historic_countries
                if country_code in (
                    getattr(record, 'alpha_3', None),
                    getattr(record, 'alpha_4', None),
                )
            ),
            None,
        )
        if historic:
            return map_historic_to_continent(historic.name)

        return None
    except (KeyError, ValueError, AttributeError):
        # Unknown code for the conversion libraries: treat as unresolved
        return None

def get_continents_for_list(country_codes):
    """Resolve several country codes at once.

    :param country_codes: iterable of country codes, or a single code passed
                          as a plain string
    :return: dict mapping each code to the value returned by
             ``get_continent`` for that code
    """
    # A bare string is one code, not a sequence of characters
    codes = [country_codes] if isinstance(country_codes, str) else country_codes
    return {code: get_continent(code) for code in codes}

def map_historic_to_continent(country_name):
    """Map the name of a historic country to its continent by keyword.

    The match is a case-insensitive substring test against a small manual
    table; the first keyword found in the name wins.

    :param country_name: name of the historic country
    :return: continent name, or None when no keyword matches
    """
    keyword_to_continent = {
        'USSR': 'Europe',
        'YUGOSLAVIA': 'Europe',
        'CZECHOSLOVAKIA': 'Europe',
        # Add further historic countries here as needed
    }

    normalized = country_name.upper()
    return next(
        (
            continent
            for keyword, continent in keyword_to_continent.items()
            if keyword in normalized
        ),
        None,
    )

# Collect every distinct country code found in the training set (cells that
# are not lists are skipped) and resolve each one to a continent.
# Note: the mapping is built from the training set only.
all_countries = {
    code
    for codes in copy_train['countryOfOrigin']
    if isinstance(codes, list)
    for code in codes
}
continent_mapping = get_continents_for_list(all_countries)
continent_mapping

{'KE': 'Africa',
 'IN': 'Asia',
 'ET': 'Africa',
 'CH': 'Europe',
 'IS': 'Europe',
 'NP': 'Asia',
 'CZ': 'Europe',
 'HK': 'Asia',
 'SN': 'Africa',
 'US': 'North America',
 'CN': 'Asia',
 'GN': 'Africa',
 'FR': 'Europe',
 'NG': 'Africa',
 'IL': 'Asia',
 'GW': 'Africa',
 'BD': 'Asia',
 'BA': 'Europe',
 'AE': 'Asia',
 'ME': 'Europe',
 'MK': 'Europe',
 'VC': 'North America',
 'PK': 'Asia',
 'DE': 'Europe',
 'ZA': 'Africa',
 'SY': 'Asia',
 'TW': 'Asia',
 'JP': 'Asia',
 'EG': 'Africa',
 'AR': 'South America',
 'PE': 'South America',
 'CR': 'North America',
 'KR': 'Asia',
 'CO': 'South America',
 'PR': 'North America',
 'MO': 'Asia',
 'TO': 'Oceania',
 'EH': 'Africa',
 'LV': 'Europe',
 'SUHH': 'Europe',
 'LI': 'Europe',
 'LC': 'North America',
 'SM': 'Europe',
 'CD': 'Africa',
 'NE': 'Africa',
 'GP': 'North America',
 'GL': 'North America',
 'RW': 'Africa',
 'EC': 'South America',
 'XYU': 'Europe',
 'SV': 'North America',
 'HU': 'Europe',
 'IT': 'Europe',
 'BR': 'South America',
 'CY': 'Asia'

In [144]:
def replace_country_with_continent(df, column_name, continent_mapping):
    """
    Replace the country codes in a DataFrame column with their continents.

    Each cell holding a list of country codes is replaced by the sorted list
    of distinct continent names. A code that is missing from
    ``continent_mapping`` or is mapped to ``None`` (unresolved) becomes
    ``'Unknown'``. A cell that is not a list/tuple/set (e.g. NaN) becomes an
    empty list.

    The column is overwritten in place on ``df`` and the same object is returned.

    :param df: DataFrame containing the column to modify
    :param column_name: name of the column holding the lists of country codes
    :param continent_mapping: dict {country_code: continent or None}
    :return: the same DataFrame, with the column modified
    """
    def to_continents(country_list):
        if not isinstance(country_list, (list, tuple, set)):
            return []
        # `or 'Unknown'` also covers codes stored with a None continent;
        # sorting makes the result independent of set iteration order
        return sorted(
            {continent_mapping.get(country) or 'Unknown' for country in country_list}
        )

    df[column_name] = df[column_name].apply(to_continents)
    return df


# Apply the conversion to the training frame, reusing the mapping already
# computed in `continent_mapping` instead of resolving every code a second time.
# NOTE: this cell is not idempotent - running it twice would look the
# continent names up as if they were country codes.
df = replace_country_with_continent(copy_train, 'countryOfOrigin', continent_mapping)

# Show the result (copy_train itself is modified in place by the call above)
print(copy_train["countryOfOrigin"])


0        [North America]
1               [Europe]
2               [Europe]
3        [North America]
4                 [Asia]
              ...       
16426           [Europe]
16427             [Asia]
16428             [Asia]
16429           [Europe]
16430             [Asia]
Name: countryOfOrigin, Length: 16431, dtype: object


In [145]:
# Flatten all lists and collect the distinct continent labels
all_continents = set(cont for sublist in copy_train['countryOfOrigin'] if isinstance(sublist, list) for cont in sublist)
print(all_continents)

# Create one 0/1 indicator column per continent. The labels are iterated in
# sorted order so the resulting column order is the same on every kernel
# (the iteration order of a set of strings is not stable between runs).
for continent in sorted(all_continents, key=str):
    copy_train[f"is_from_{continent}"] = copy_train['countryOfOrigin'].apply(
        lambda continents, continent=continent: 1 if continent in continents else 0
    )

# The list column is no longer needed once the indicators exist
column2drop = ['countryOfOrigin']
copy_train.drop(column2drop, axis=1, inplace=True)

{'South America', 'North America', 'Africa', 'Asia', 'Europe', 'Oceania'}


In [146]:
# Preview the training frame after adding the is_from_* indicator columns
copy_train.head(15)

Unnamed: 0,rating,startYear,endYear,awardWins,numVotes,totalImages,totalVideos,totalCredits,criticReviewsTotal,titleType,...,is_Horror,is_Animation,fill_runtimeMinutes,titleType_encoded,is_from_South America,is_from_North America,is_from_Africa,is_from_Asia,is_from_Europe,is_from_Oceania
0,8,1995,,0.0,779,1,0,21,1,tvEpisode,...,0,1,40.0,0,0,1,0,0,0,0
1,6,1982,,0.0,11,1,0,1,0,videoGame,...,0,0,28.0,1,0,0,0,0,1,0
2,6,1992,,0.0,38,1,0,24,0,movie,...,0,0,90.0,2,0,0,0,0,1,0
3,6,2021,,0.0,15,22,0,56,0,tvSeries,...,0,0,31.0,3,0,1,0,0,0,0
4,8,2020,2020.0,1.0,15718,347,2,47,4,tvSeries,...,0,0,80.0,3,0,0,0,1,0,0
5,9,1995,,0.0,14,1,0,0,0,video,...,0,0,34.0,4,0,1,0,0,0,0
6,8,1995,,0.0,96,5,0,110,0,tvEpisode,...,0,0,45.0,0,0,1,0,0,0,0
7,8,2022,,0.0,70,7,0,69,0,tvMiniSeries,...,0,0,60.0,5,0,0,0,1,0,0
8,8,1996,,2.0,12,1,0,2,0,short,...,0,1,12.0,6,1,0,0,0,0,0
9,9,2022,,0.0,27,9,0,15,0,tvMiniSeries,...,0,0,60.0,5,0,0,0,1,0,0


In [147]:
# Number of titles flagged for each continent
continent_distribution = copy_train.loc[
    :,
    [
        'is_from_South America',
        'is_from_Oceania',
        'is_from_Asia',
        'is_from_Africa',
        'is_from_Europe',
        'is_from_North America',
    ],
].sum()

print(continent_distribution)

# The total exceeds the number of records because some titles have several values

is_from_South America     405
is_from_Oceania           345
is_from_Asia             2536
is_from_Africa            154
is_from_Europe           5718
is_from_North America    7856
dtype: int64


##### Creazione array di valori

In [148]:
# Build the feature matrices.
# - The target (`titleType`) and its integer encoding are excluded, so the
#   label is not part of the features the model is trained on.
# - The test frame is re-ordered to the training column order: the two frames
#   list the genre flags in different orders, so a plain `.values` would put
#   different features in the same array position.
# NOTE(review): a column that exists only in copy_train (e.g. the is_from_*
# flags, until the continent encoding is also applied to copy_test) comes out
# as an all-NaN column in X_test.
feature_columns = [
    col for col in copy_train.columns
    if col not in ('titleType', 'titleType_encoded')
]
X_train = copy_train[feature_columns].values
X_test = copy_test.reindex(columns=feature_columns).values

##### Splitting in validation set

In [149]:
# Hold out part of the training data as a validation set (fixed seed):
#   X_train2 -> training set, 80% of X_train
#   X_val    -> validation set, 20% of X_train
#   y_train2 / y_val -> the matching target labels for training and validation
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y, random_state=32, test_size=0.2)

##### Normalizzazione

In [150]:
# standardsc = StandardScaler()

# # Fit the scaler on the training split and transform it in one step
# X_train_standardsc = standardsc.fit_transform(X_train2)
# # NOTE(review): the draft used fit_transform(X_val), which refits the scaler on the validation split; transform() reuses the training statistics
# X_val_standardsc = standardsc.transform(X_val)

# # Transform the test set (without refitting!)
# X_test_standardsc = standardsc.transform(X_test)

In [151]:
# minmax = MinMaxScaler()

# # Fit the scaler on the training split and transform it in one step
# X_train_minmax = minmax.fit_transform(X_train2)
# # NOTE(review): the draft used fit_transform(X_val), which refits the scaler on the validation split; transform() reuses the training statistics
# X_val_minmax = minmax.transform(X_val)


# # Transform the test set (without refitting!)
# X_test_minmax = minmax.transform(X_test)

##### Variabili da modificare in base a cosa uso come metodo per normalizzare

In [152]:
# X_train_tr = X_train_minmax 
# X_val_tr = X_val_minmax