### Étude particulière du dataset "anime-dataset-2023.csv" : 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors
import re
from datetime import datetime

In [2]:
df_0 = pd.read_csv("./anime-dataset-2023.csv")

In [3]:
df = df_0.copy()

In [4]:
df.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

In [5]:
cols = ["Genres", "Producers", "Studios", "Type", "Episodes", "Duration", "Premiered", "Aired", "Rating"]

In [6]:
df = df[cols]

In [7]:
df

Unnamed: 0,Genres,Producers,Studios,Type,Episodes,Duration,Premiered,Aired,Rating
0,"Action, Award Winning, Sci-Fi",Bandai Visual,Sunrise,TV,26.0,24 min per ep,spring 1998,"Apr 3, 1998 to Apr 24, 1999",R - 17+ (violence & profanity)
1,"Action, Sci-Fi","Sunrise, Bandai Visual",Bones,Movie,1.0,1 hr 55 min,UNKNOWN,"Sep 1, 2001",R - 17+ (violence & profanity)
2,"Action, Adventure, Sci-Fi",Victor Entertainment,Madhouse,TV,26.0,24 min per ep,spring 1998,"Apr 1, 1998 to Sep 30, 1998",PG-13 - Teens 13 or older
3,"Action, Drama, Mystery, Supernatural","Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,TV,26.0,25 min per ep,summer 2002,"Jul 3, 2002 to Dec 25, 2002",PG-13 - Teens 13 or older
4,"Adventure, Fantasy, Supernatural","TV Tokyo, Dentsu",Toei Animation,TV,52.0,23 min per ep,fall 2004,"Sep 30, 2004 to Sep 29, 2005",PG - Children
...,...,...,...,...,...,...,...,...,...
24900,"Comedy, Fantasy, Slice of Life",UNKNOWN,UNKNOWN,ONA,15.0,Unknown,UNKNOWN,"Jul 4, 2023 to ?",PG-13 - Teens 13 or older
24901,"Action, Adventure, Fantasy",UNKNOWN,UNKNOWN,ONA,18.0,Unknown,UNKNOWN,"Jul 27, 2023 to ?",PG-13 - Teens 13 or older
24902,"Action, Adventure, Fantasy, Sci-Fi",UNKNOWN,UNKNOWN,ONA,16.0,Unknown,UNKNOWN,"Jul 19, 2023 to ?",PG-13 - Teens 13 or older
24903,UNKNOWN,UNKNOWN,UNKNOWN,Music,1.0,3 min,UNKNOWN,"Apr 23, 2022",PG-13 - Teens 13 or older


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Genres     24905 non-null  object
 1   Producers  24905 non-null  object
 2   Studios    24905 non-null  object
 3   Type       24905 non-null  object
 4   Episodes   24905 non-null  object
 5   Duration   24905 non-null  object
 6   Premiered  24905 non-null  object
 7   Aired      24905 non-null  object
 8   Rating     24905 non-null  object
dtypes: object(9)
memory usage: 1.7+ MB


### Prétraitement des variables numériques

In [9]:
for i in list(df["Duration"]):
  if not isinstance(i, int):
    print(i)

24 min per ep
1 hr 55 min
24 min per ep
25 min per ep
23 min per ep
23 min per ep
23 min per ep
23 min per ep
27 min per ep
24 min per ep
23 min per ep
24 min
22 min per ep
25 min per ep
23 min per ep
24 min per ep
23 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
1 hr 44 min
1 hr 26 min
25 min per ep
1 hr 22 min
29 min per ep
25 min per ep
1 hr 31 min
2 hr 4 min
24 min per ep
30 min per ep
24 min per ep
29 min per ep
24 min per ep
23 min per ep
1 hr 45 min
22 min per ep
24 min per ep
25 min per ep
23 min per ep
24 min per ep
23 min per ep
23 min per ep
24 min per ep
23 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
24 min per ep
25 min per ep
24 min per ep
25 min per ep
25 min per ep
24 min per ep
25 min per ep
25 min per ep
27 min per ep
52 min
28 min per ep
24 min per ep
24 min per ep
1 hr 59 min
2 hr
24 min per ep
24 min per ep
30 min per ep
24 min per ep
24 min per ep
24 min per ep
24 m

In [10]:
def convert_to_minutes(duration):
    # Si c'est déjà un nombre (par exemple, après nettoyage préalable), on le retourne
    if isinstance(duration, (int, float)):
        return duration

    # Gérer les valeurs "Unknown"
    if "Unknown" in duration or pd.isna(duration):
        return 0

    # Cas "hr min" -> conversion en minutes
    if "hr" in duration and "min" in duration:
        hours = int(re.search(r"(\d+)\s*hr", duration).group(1))
        minutes = int(re.search(r"(\d+)\s*min", duration).group(1))
        return hours * 60 + minutes

    # Cas "hr" uniquement (pas de minutes)
    if "hr" in duration:
        hours = int(re.search(r"(\d+)\s*hr", duration).group(1))
        return hours * 60

    # Cas "min per ep" -> on prend la valeur en minutes (pour 1 épisode)
    if "min per ep" in duration:
        minutes = re.search(r"(\d+)\s*min", duration)
        return int(minutes.group(1)) if minutes else 0

    # Cas avec seulement des minutes
    if "min" in duration:
        minutes = re.search(r"(\d+)\s*min", duration)
        return int(minutes.group(1)) if minutes else 0

    # Cas avec des secondes (on ignore)
    if "sec" in duration:
        return 0  # Ignorer les durées en secondes

    # Si aucune correspondance trouvée
    return 0

In [11]:
df['Duration'] = df['Duration'].apply(convert_to_minutes)

In [12]:
missing_values = df['Duration'].isna().sum()

In [13]:
# Afficher les résultats
print(f"Nombre de valeurs NaN après conversion: {missing_values}")

Nombre de valeurs NaN après conversion: 0


In [14]:
# Afficher les premières lignes pour vérifier le résultat
df.head()

Unnamed: 0,Genres,Producers,Studios,Type,Episodes,Duration,Premiered,Aired,Rating
0,"Action, Award Winning, Sci-Fi",Bandai Visual,Sunrise,TV,26.0,24,spring 1998,"Apr 3, 1998 to Apr 24, 1999",R - 17+ (violence & profanity)
1,"Action, Sci-Fi","Sunrise, Bandai Visual",Bones,Movie,1.0,115,UNKNOWN,"Sep 1, 2001",R - 17+ (violence & profanity)
2,"Action, Adventure, Sci-Fi",Victor Entertainment,Madhouse,TV,26.0,24,spring 1998,"Apr 1, 1998 to Sep 30, 1998",PG-13 - Teens 13 or older
3,"Action, Drama, Mystery, Supernatural","Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,TV,26.0,25,summer 2002,"Jul 3, 2002 to Dec 25, 2002",PG-13 - Teens 13 or older
4,"Adventure, Fantasy, Supernatural","TV Tokyo, Dentsu",Toei Animation,TV,52.0,23,fall 2004,"Sep 30, 2004 to Sep 29, 2005",PG - Children


In [15]:
# Fonction pour normaliser la colonne "Aired" et retourner des chaînes de caractères
def normalize_aired(aired):
    default_date = "1900-01-01"  # Date par défaut si NaN
    if pd.isna(aired):
        return default_date, default_date
    
    # Cas "date to date" (ex: 'Apr 3, 1998 to Apr 24, 1999')
    if "to" in aired:
        dates = aired.split("to")
        start_date_str = dates[0].strip()
        end_date_str = dates[1].strip()
        
        # Convertir la date de début
        try:
            start_date = datetime.strptime(start_date_str, '%b %d, %Y').strftime('%Y-%m-%d')
        except ValueError:
            start_date = default_date

        # Convertir la date de fin (si elle n'est pas '?')
        if end_date_str != "?":
            try:
                end_date = datetime.strptime(end_date_str, '%b %d, %Y').strftime('%Y-%m-%d')
            except ValueError:
                end_date = default_date
        else:
            end_date = default_date

        return start_date, end_date

    # Cas "une seule date" (ex: 'Sep 1, 2001')
    else:
        try:
            start_date = datetime.strptime(aired.strip(), '%b %d, %Y').strftime('%Y-%m-%d')
        except ValueError:
            start_date = default_date
        return start_date, default_date

In [16]:
# Appliquer la fonction pour normaliser la colonne "Aired"
df[['Aired_start', 'Aired_end']] = df['Aired'].apply(lambda x: pd.Series(normalize_aired(x)))

In [17]:
# Vérifier les résultats
print(df[['Aired', 'Aired_start', 'Aired_end']].head(20))

                           Aired Aired_start   Aired_end
0    Apr 3, 1998 to Apr 24, 1999  1998-04-03  1999-04-24
1                    Sep 1, 2001  2001-09-01  1900-01-01
2    Apr 1, 1998 to Sep 30, 1998  1998-04-01  1998-09-30
3    Jul 3, 2002 to Dec 25, 2002  2002-07-03  2002-12-25
4   Sep 30, 2004 to Sep 29, 2005  2004-09-30  2005-09-29
5    Apr 6, 2005 to Mar 19, 2008  2005-04-06  2008-03-19
6   Apr 15, 2005 to Sep 27, 2005  2005-04-15  2005-09-27
7   Sep 11, 2002 to Sep 10, 2003  2002-09-11  2003-09-10
8   Apr 17, 2004 to Feb 18, 2006  2004-04-17  2006-02-18
9    Apr 7, 2004 to Sep 28, 2005  2004-04-07  2005-09-28
10    Oct 3, 2002 to Feb 8, 2007  2002-10-03  2007-02-08
11             Oct 20, 1999 to ?  1999-10-20  1900-01-01
12  Oct 10, 2001 to Mar 23, 2005  2001-10-10  2005-03-23
13   Oct 6, 2004 to Dec 15, 2004  2004-10-06  2004-12-15
14   Oct 5, 2004 to Mar 29, 2005  2004-10-05  2005-03-29
15   Oct 6, 2004 to Mar 30, 2005  2004-10-06  2005-03-30
16  Apr 17, 2003 to Sep 25, 200

In [18]:
df['Aired_start'] = pd.to_datetime(df['Aired_start'], errors='coerce').dt.year
df['Aired_end'] = pd.to_datetime(df['Aired_end'], errors='coerce').dt.year
df.drop(["Aired"], axis = 1, inplace=True)

In [19]:
print(df[['Aired_start', 'Aired_end']].head(10))

   Aired_start  Aired_end
0         1998       1999
1         2001       1900
2         1998       1998
3         2002       2002
4         2004       2005
5         2005       2008
6         2005       2005
7         2002       2003
8         2004       2006
9         2004       2005


In [20]:
# Fonction pour normaliser la colonne 'Premiered'
def normalize_premiered(premiered):
    if premiered == "UNKNOWN":
        return np.nan  # Remplacer 'UNKNOWN' par NaN

    # Dictionnaire pour mapper les saisons à des mois
    season_to_month = {
        "winter": 1,   # Winter -> Janvier
        "spring": 4,   # Spring -> Avril
        "summer": 7,   # Summer -> Juillet
        "fall": 10     # Fall -> Octobre
    }

    # Extraire la saison et l'année
    match = re.match(r"(winter|spring|summer|fall)\s+(\d{4})", premiered)
    if match:
        season = match.group(1)
        year = int(match.group(2))

        # Créer une date fictive (par exemple, le premier jour du mois de la saison)
        month = season_to_month[season]
        return f"{year}-{month:02d}-01"  # Format YYYY-MM-DD

    return np.nan

In [21]:
# Appliquer la fonction pour normaliser la colonne 'Premiered'
df['Premiered'] = df['Premiered'].apply(normalize_premiered)

In [22]:
# Convertir en format datetime si vous souhaitez manipuler ces dates par la suite
df['Premiered'] = pd.to_datetime(df['Premiered'], errors='coerce')

# Remplacer les valeurs NaN par une date par défaut si nécessaire
# Exemple : remplacer les NaN par "1900-01-01"
df['Premiered'] = df['Premiered'].fillna(pd.to_datetime("1900-01-01"))

# Extraire l'année et convertir en entier
df['Premiered'] = df['Premiered'].dt.year

# Si vous souhaitez remplacer les valeurs NaN par une valeur par défaut (par exemple 1900)
df['Premiered'] = df['Premiered'].fillna(1900).astype(int)

In [23]:
df

Unnamed: 0,Genres,Producers,Studios,Type,Episodes,Duration,Premiered,Rating,Aired_start,Aired_end
0,"Action, Award Winning, Sci-Fi",Bandai Visual,Sunrise,TV,26.0,24,1998,R - 17+ (violence & profanity),1998,1999
1,"Action, Sci-Fi","Sunrise, Bandai Visual",Bones,Movie,1.0,115,1900,R - 17+ (violence & profanity),2001,1900
2,"Action, Adventure, Sci-Fi",Victor Entertainment,Madhouse,TV,26.0,24,1998,PG-13 - Teens 13 or older,1998,1998
3,"Action, Drama, Mystery, Supernatural","Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,TV,26.0,25,2002,PG-13 - Teens 13 or older,2002,2002
4,"Adventure, Fantasy, Supernatural","TV Tokyo, Dentsu",Toei Animation,TV,52.0,23,2004,PG - Children,2004,2005
...,...,...,...,...,...,...,...,...,...,...
24900,"Comedy, Fantasy, Slice of Life",UNKNOWN,UNKNOWN,ONA,15.0,0,1900,PG-13 - Teens 13 or older,2023,1900
24901,"Action, Adventure, Fantasy",UNKNOWN,UNKNOWN,ONA,18.0,0,1900,PG-13 - Teens 13 or older,2023,1900
24902,"Action, Adventure, Fantasy, Sci-Fi",UNKNOWN,UNKNOWN,ONA,16.0,0,1900,PG-13 - Teens 13 or older,2023,1900
24903,UNKNOWN,UNKNOWN,UNKNOWN,Music,1.0,3,1900,PG-13 - Teens 13 or older,2022,1900


In [24]:
# Fonction pour normaliser la colonne "Rating"
def normalize_rating(rating):
    if pd.isna(rating):
        return 'Unknown'  # Gérer les valeurs manquantes

    # Normaliser les valeurs selon les catégories principales
    rating = rating.strip().lower()  # Nettoyer l'entrée
    if "g" in rating:
        return "G"  # Public général
    elif "pg-13" in rating:
        return "PG-13"  # Adolescents de 13 ans et plus
    elif "pg" in rating:
        return "PG"  # Supervision parentale
    elif "r - 17+" in rating:
        return "R"  # Restreint, 17+ (violence, obscénités)
    elif "r+" in rating:
        return "R+"  # Restreint, avec nudité modérée
    elif "nc-17" in rating:
        return "NC-17"  # Restreint, 17+ avec contenu explicite
    else:
        return "Unknown"  # Si on ne trouve aucune correspondance


In [25]:
# Appliquer la fonction sur la colonne 'Rating'
df['Rating'] = df['Rating'].apply(normalize_rating)

In [26]:
# Si vous souhaitez ajouter une colonne numérique basée sur les ratings
rating_to_numeric = {
    "G": 1,
    "PG": 2,
    "PG-13": 3,
    "R": 4,
    "R+": 5,
    "NC-17": 6,
    "Unknown": 0
}

In [27]:
# Créer une colonne numérique pour la notation
df['Rating'] = df['Rating'].map(rating_to_numeric)

In [28]:
df.head(20)

Unnamed: 0,Genres,Producers,Studios,Type,Episodes,Duration,Premiered,Rating,Aired_start,Aired_end
0,"Action, Award Winning, Sci-Fi",Bandai Visual,Sunrise,TV,26.0,24,1998,4,1998,1999
1,"Action, Sci-Fi","Sunrise, Bandai Visual",Bones,Movie,1.0,115,1900,4,2001,1900
2,"Action, Adventure, Sci-Fi",Victor Entertainment,Madhouse,TV,26.0,24,1998,1,1998,1998
3,"Action, Drama, Mystery, Supernatural","Bandai Visual, Dentsu, Victor Entertainment, T...",Sunrise,TV,26.0,25,2002,1,2002,2002
4,"Adventure, Fantasy, Supernatural","TV Tokyo, Dentsu",Toei Animation,TV,52.0,23,2004,1,2004,2005
5,Sports,"TV Tokyo, Nihon Ad Systems, TV Tokyo Music, Sh...",Gallop,TV,145.0,23,2005,1,2005,2008
6,"Comedy, Drama, Romance","Dentsu, Genco, Fuji TV, Asmik Ace, Shueisha",J.C.Staff,TV,24.0,23,2005,1,2005,2005
7,"Comedy, Slice of Life, Sports",UNKNOWN,Nippon Animation,TV,52.0,23,2002,1,2002,2003
8,"Action, Drama","OB Planning, Studio Jack",A.C.G.T.,TV,24.0,27,2004,1,2004,2006
9,"Drama, Mystery, Suspense","VAP, Shogakukan-Shueisha Productions, Nippon T...",Madhouse,TV,74.0,24,2004,5,2004,2005


### Noramlisation des variables catégorielles

##### Normalisation de la variable "Genres"

In [29]:
df["Liste Genre"] = df["Genres"].str.split(", ")

In [30]:
mlb = MultiLabelBinarizer()

In [31]:
df_genres = mlb.fit_transform(df["Liste Genre"])

In [32]:
df = pd.concat([df, pd.DataFrame(df_genres, columns=mlb.classes_)], axis=1)

In [33]:
df = df.drop(["Genres", "Liste Genre"], axis=1)

In [34]:
df.columns

Index(['Producers', 'Studios', 'Type', 'Episodes', 'Duration', 'Premiered',
       'Rating', 'Aired_start', 'Aired_end', 'Action', 'Adventure',
       'Avant Garde', 'Award Winning', 'Boys Love', 'Comedy', 'Drama', 'Ecchi',
       'Erotica', 'Fantasy', 'Girls Love', 'Gourmet', 'Hentai', 'Horror',
       'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',
       'Supernatural', 'Suspense', 'UNKNOWN'],
      dtype='object')

##### Normalisation de la variable "Producers"

In [35]:
# Regrouper les producteurs peu fréquents
threshold = 10  # Seuil d'occurrences
value_counts = df["Producers"].str.split(", ").explode().value_counts()

# Créer une liste de producteurs peu fréquents
rare_producers = value_counts[value_counts < threshold].index

# Remplacer les producteurs peu fréquents par "Other"
df["Producers"] = df["Producers"].str.split(", ").apply(lambda x: [prod if prod not in rare_producers else 'Other' for prod in x])

# Appliquer à nouveau le MultiLabelBinarizer
df["Liste Producteurs"] = df["Producers"].str.join(", ").str.split(", ")
mlb_producers = MultiLabelBinarizer()
df_producers = mlb_producers.fit_transform(df["Liste Producteurs"])
df = pd.concat([df, pd.DataFrame(df_producers, columns=mlb_producers.classes_)], axis=1)
df.drop(["Producers", "Liste Producteurs"], axis=1, inplace=True)

In [36]:
df.columns

Index(['Studios', 'Type', 'Episodes', 'Duration', 'Premiered', 'Rating',
       'Aired_start', 'Aired_end', 'Action', 'Adventure',
       ...
       'animate Film', 'bilibili', 'chara-ani.com', 'comico', 'd-rights',
       'dugout', 'flying DOG', 'i0+', 'iQIYI', 'm.o.e.'],
      dtype='object', length=379)

In [37]:
df

Unnamed: 0,Studios,Type,Episodes,Duration,Premiered,Rating,Aired_start,Aired_end,Action,Adventure,...,animate Film,bilibili,chara-ani.com,comico,d-rights,dugout,flying DOG,i0+,iQIYI,m.o.e.
0,Sunrise,TV,26.0,24,1998,4,1998,1999,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Bones,Movie,1.0,115,1900,4,2001,1900,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Madhouse,TV,26.0,24,1998,1,1998,1998,1,1,...,0,0,0,0,0,0,0,0,0,0
3,Sunrise,TV,26.0,25,2002,1,2002,2002,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Toei Animation,TV,52.0,23,2004,1,2004,2005,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24900,UNKNOWN,ONA,15.0,0,1900,1,2023,1900,0,0,...,0,0,0,0,0,0,0,0,0,0
24901,UNKNOWN,ONA,18.0,0,1900,1,2023,1900,1,1,...,0,0,0,0,0,0,0,0,0,0
24902,UNKNOWN,ONA,16.0,0,1900,1,2023,1900,1,1,...,0,0,0,0,0,0,0,0,0,0
24903,UNKNOWN,Music,1.0,3,1900,1,2022,1900,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
data = df["Studios"]

# Étape 1: Définir un seuil d'occurrences
threshold_studios = 10  

# Étape 2: Compter les occurrences de chaque studio
value_counts_studios = df["Studios"].str.split(", ").explode().value_counts()

# Étape 3: Créer une liste de studios peu fréquents
rare_studios = value_counts_studios[value_counts_studios < threshold_studios].index

# Étape 4: Remplacer les studios peu fréquents par "Other"
# Gérer les valeurs NaN en remplaçant par une liste vide
df["Studios"] = df["Studios"].str.split(", ").apply(
    lambda x: [studio if studio not in rare_studios else 'Other' for studio in x] if isinstance(x, list) else []
)

# Étape 5: Initialiser le MultiLabelBinarizer
mlb_studios = MultiLabelBinarizer()

# Étape 6: Appliquer le MultiLabelBinarizer
df["Liste Studios"] = df["Studios"].str.join(", ").str.split(", ")
df_studios = mlb_studios.fit_transform(df["Liste Studios"])

# Étape 7: Concaténer les nouvelles colonnes avec le DataFrame original
df = pd.concat([df, pd.DataFrame(df_studios, columns=mlb_studios.classes_)], axis=1)

# Étape 8: Supprimer les colonnes d'origine
df.drop(["Studios", "Liste Studios"], axis=1, inplace=True)

In [39]:
df.columns

Index(['Type', 'Episodes', 'Duration', 'Premiered', 'Rating', 'Aired_start',
       'Aired_end', 'Action', 'Adventure', 'Avant Garde',
       ...
       'Yokohama Animation Lab', 'Yostar Pictures', 'Yumeta Company', 'Zero-G',
       'Zexcs', 'animate Film', 'asread.', 'dwarf', 'feel.', 'ufotable'],
      dtype='object', length=630)

In [40]:
df

Unnamed: 0,Type,Episodes,Duration,Premiered,Rating,Aired_start,Aired_end,Action,Adventure,Avant Garde,...,Yokohama Animation Lab,Yostar Pictures,Yumeta Company,Zero-G,Zexcs,animate Film,asread.,dwarf,feel.,ufotable
0,TV,26.0,24,1998,4,1998,1999,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Movie,1.0,115,1900,4,2001,1900,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TV,26.0,24,1998,1,1998,1998,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,TV,26.0,25,2002,1,2002,2002,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TV,52.0,23,2004,1,2004,2005,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24900,ONA,15.0,0,1900,1,2023,1900,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24901,ONA,18.0,0,1900,1,2023,1900,1,1,0,...,0,0,0,0,0,0,0,0,0,0
24902,ONA,16.0,0,1900,1,2023,1900,1,1,0,...,0,0,0,0,0,0,0,0,0,0
24903,Music,1.0,3,1900,1,2022,1900,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Transformer les colonnes
col_cat = ["Type"]

In [42]:
ct = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False), col_cat)
    ],
    remainder="passthrough",
    verbose_feature_names_out=True  # Utiliser des noms de colonnes explicites
)

In [43]:
ct.set_output(transform="pandas")

In [44]:
# Fit et transform sur le DataFrame
df_final = ct.fit_transform(df)

In [45]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Columns: 635 entries, cat__Type_Music to remainder__ufotable
dtypes: float64(6), int32(626), int64(2), object(1)
memory usage: 61.2+ MB


In [46]:
# Affichage des colonnes produites :
for i in df_final.columns:
  print(i)

cat__Type_Music
cat__Type_ONA
cat__Type_OVA
cat__Type_Special
cat__Type_TV
cat__Type_UNKNOWN
remainder__Episodes
remainder__Duration
remainder__Premiered
remainder__Rating
remainder__Aired_start
remainder__Aired_end
remainder__Action
remainder__Adventure
remainder__Avant Garde
remainder__Award Winning
remainder__Boys Love
remainder__Comedy
remainder__Drama
remainder__Ecchi
remainder__Erotica
remainder__Fantasy
remainder__Girls Love
remainder__Gourmet
remainder__Hentai
remainder__Horror
remainder__Mystery
remainder__Romance
remainder__Sci-Fi
remainder__Slice of Life
remainder__Sports
remainder__Supernatural
remainder__Suspense
remainder__UNKNOWN
remainder__81 Produce
remainder__A-Sketch
remainder__ABC Animation
remainder__ADK Emotions
remainder__ADK Marketing Solutions
remainder__AIC
remainder__ASCII Media Works
remainder__AT-X
remainder__Ai Addiction
remainder__Akita Shoten
remainder__Alpha Group Co. Ltd.
remainder__Amuse
remainder__AniMan
remainder__Animatic
remainder__Animation Do
re

### Imputation
### On impute ici avec un `KNNImputer`.

In [47]:
# Étape 1: Remplacer les valeurs 'UNKNOWN' par NaN
df_final.replace('UNKNOWN', 0, inplace=True)

In [48]:
imputer = KNNImputer(n_neighbors=10)

In [49]:
imputer.set_output(transform="pandas")

In [50]:
df_final = imputer.fit_transform(df_final)

In [51]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24905 entries, 0 to 24904
Columns: 635 entries, cat__Type_Music to remainder__ufotable
dtypes: float64(635)
memory usage: 120.7 MB


### Normalisation
### Pour attribuer autant de poids à chaque variable lors de la recherche des plus proches voisins, il faut normaliser tout le jeu de données.

In [52]:
scaler = MinMaxScaler()
scaler.set_output(transform="pandas")

In [53]:
df_final = scaler.fit_transform(df_final)

### Recommandation Content based avec les plus proches voisins

In [54]:
model = NearestNeighbors(n_neighbors=10)

In [55]:
model.fit(df_final)

In [84]:
# Essai avec le premier anime du jeu de données :
indice_anime_1 = 200
anime_1 = df_final.iloc[indice_anime_1]

In [85]:
distance, neig = model.kneighbors(pd.concat([pd.DataFrame(), anime_1.to_frame().T]), n_neighbors=10)

In [86]:
print(df_0.iloc[indice_anime_1])

for i in range(1, len(distance[0])):
  titre = df_0.iloc[neig[0][i]]["Name"]
  print(f"{titre} recommandé à {100/(1+distance[0][i]):.2f}%") # On calcule la similarité à partir de la distance

anime_id                                                      223
Name                                                  Dragon Ball
English name                                          Dragon Ball
Other name                                                ドラゴンボール
Score                                                        7.96
Genres                         Action, Adventure, Comedy, Fantasy
Synopsis        Gokuu Son is a young boy who lives in the wood...
Type                                                           TV
Episodes                                                    153.0
Aired                                Feb 26, 1986 to Apr 12, 1989
Premiered                                             winter 1986
Status                                            Finished Airing
Producers                                                 Fuji TV
Licensors                                Funimation, Harmony Gold
Studios                                            Toei Animation
Source    

In [87]:
df_0[df_0["Name"]=="Dragon Ball"]

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
200,223,Dragon Ball,Dragon Ball,ドラゴンボール,7.96,"Action, Adventure, Comedy, Fantasy",Gokuu Son is a young boy who lives in the wood...,TV,153.0,"Feb 26, 1986 to Apr 12, 1989",...,Toei Animation,Manga,24 min per ep,PG-13 - Teens 13 or older,640.0,149,14998,640286.0,989241,https://cdn.myanimelist.net/images/anime/1887/...


In [88]:
df_0[df_0["Name"]=="Naruto"]

Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
10,20,Naruto,Naruto,ナルト,7.99,"Action, Adventure, Fantasy","Moments prior to Naruto Uzumaki's birth, a hug...",TV,220.0,"Oct 3, 2002 to Feb 8, 2007",...,Pierrot,Manga,23 min per ep,PG-13 - Teens 13 or older,599.0,8,76343,1883772.0,2717330,https://cdn.myanimelist.net/images/anime/13/17...


In [90]:
indice_anime_1 = 10
anime_1 = df_final.iloc[indice_anime_1]

distance, neig = model.kneighbors(pd.concat([pd.DataFrame(), anime_1.to_frame().T]), n_neighbors=10)

print(df_0.iloc[indice_anime_1])

for i in range(1, len(distance[0])):
  titre = df_0.iloc[neig[0][i]]["Name"]
  print(f"{titre} recommandé à {100/(1+distance[0][i]):.2f}%") # On calcule la similarité à partir de la distance

anime_id                                                       20
Name                                                       Naruto
English name                                               Naruto
Other name                                                    ナルト
Score                                                        7.99
Genres                                 Action, Adventure, Fantasy
Synopsis        Moments prior to Naruto Uzumaki's birth, a hug...
Type                                                           TV
Episodes                                                    220.0
Aired                                  Oct 3, 2002 to Feb 8, 2007
Premiered                                               fall 2002
Status                                            Finished Airing
Producers                             TV Tokyo, Aniplex, Shueisha
Licensors                                               VIZ Media
Studios                                                   Pierrot
Source    