In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("/home/antoine/mlops_reco_movies/airflow/data/raw/bronze/genome-scores.csv")
df.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


In [4]:
# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
movieId      0
tagId        0
relevance    0
dtype: int64


In [5]:
# Optimisation mémoire
df = df.astype({
    "movieId": "int32",
    "tagId": "int32",
    "relevance": "float32"
}).rename(columns={"movieId": "movie_id", "tagId": "tag_id"})

In [6]:
df.head()

Unnamed: 0,movie_id,tag_id,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [7]:
df.describe()

Unnamed: 0,movie_id,tag_id,relevance
count,11709770.0,11709770.0,11709770.0
mean,25842.97,564.5,0.1164834
std,34676.15,325.6254,0.1542463
min,1.0,1.0,0.00025
25%,2926.0,282.75,0.02425
50%,6017.0,564.5,0.0565
75%,46062.0,846.25,0.1415
max,131170.0,1128.0,1.0


In [8]:
# Selection des relevances > 0.2
df = df[df["relevance"] >= 0.2]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2012678 entries, 5 to 11709763
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movie_id   int32  
 1   tag_id     int32  
 2   relevance  float32
dtypes: float32(1), int32(2)
memory usage: 38.4 MB


In [9]:
df["relevance"] = np.log1p(df["relevance"])
df.head()

Unnamed: 0,movie_id,tag_id,relevance
5,1,6,0.196389
7,1,8,0.233292
8,1,9,0.232698
10,1,11,0.455524
15,1,16,0.248226


In [10]:
tags_df = pd.read_csv("/home/antoine/mlops_reco_movies/airflow/data/raw/bronze/genome-tags.csv")
tags_df.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [14]:

import pandas as pd
import logging
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
import spacy
import re
from typing import Optional

logger = logging.getLogger(__name__)

nlp = spacy.load("en_core_web_sm")



def clean_tag(tag: str) -> str:
    doc = nlp(tag)
    return " ".join(
        token.text.lower() for token in doc if not token.is_punct and not token.is_stop
    )

def preprocess_genome_tags(df: pd.DataFrame, min_term_freq: Optional[int] = 2) -> pd.DataFrame:
    """
    Prétraite le dataset genome-tags avec gestion MLOps :
    - Nettoyage sémantique des tags
    - Dédoublonnage par similarité textuelle
    - Optimisation pour l'embedding

    Args:
        min_term_freq: Seuil d'occurrence minimal pour conserver un terme (TF-IDF)
    """
    # Validation initiale
    assert {"tagId", "tag"}.issubset(df.columns), "Colonnes manquantes"
    df = df.astype({"tagId": "int32"})
    df = df.rename(columns={"tagId": "tag_id"})

    try:
        # Nettoyage des tags
        df["clean_tag"] = df["tag"].apply(clean_tag)

        # Gestion des doublons sémantiques
        if min_term_freq:
            vectorizer = TfidfVectorizer(min_df=min_term_freq)
            tfidf_matrix = vectorizer.fit_transform(df["clean_tag"])
            df = df.iloc[list(vectorizer.vocabulary_.values())].reset_index(drop=True)

        # Catégorisation automatique
        df["category"] = df["clean_tag"].apply(
            lambda x: (
                "decade"
                if (
                    re.search(r"\d{4}s?$", x)
                    or re.search(r"\d{1,2}(st|nd|rd|th) century", x, re.IGNORECASE)
                )
                else "concept"
            )
        )

        # Encodage ordinal des catégories
        encoder = OrdinalEncoder(categories=[df["category"].unique().tolist()])
        df["category_encoded"] = encoder.fit_transform(df[["category"]])

    except Exception as e:
        logger.error(f"Erreur de prétraitement : {str(e)}")
        raise

    # Validation finale
    assert df["tag_id"].is_unique, "IDs de tags dupliqués"
    assert df["clean_tag"].str.len().gt(0).all(), "Tags vides après nettoyage"
    logger.info(f"Prétraitement genome-tags terminé. Tags traités : {len(df)}")

    return df

In [15]:
valid_tags = preprocess_genome_tags(tags_df, min_term_freq=2)

In [17]:
valid_tags.head()

Unnamed: 0,tag_id,tag,clean_tag,category,category_encoded
0,1,007,007,concept,0.0
1,186,cartoon,cartoon,concept,0.0
2,46,allegory,allegory,concept,0.0
3,3,18th century,18th century,decade,1.0
4,7,1960s,1960s,decade,1.0


In [18]:
valid_tags_1 = valid_tags["tag_id"].unique()
df = df[df["tag_id"].isin(valid_tags_1)]

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 348551 entries, 5 to 11708860
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movie_id   348551 non-null  int32  
 1   tag_id     348551 non-null  int32  
 2   relevance  348551 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 6.6 MB


In [20]:
df.head(20  )

Unnamed: 0,movie_id,tag_id,relevance
5,1,6,0.196389
7,1,8,0.233292
8,1,9,0.232698
10,1,11,0.455524
15,1,16,0.248226
18,1,19,0.513123
20,1,21,0.285367
21,1,22,0.248811
28,1,29,0.637634
29,1,30,0.516559


In [21]:
# Gestion des doublons complets
df = df.drop_duplicates(["movie_id", "tag_id"])
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 348551 entries, 5 to 11708860
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movie_id   348551 non-null  int32  
 1   tag_id     348551 non-null  int32  
 2   relevance  348551 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 6.6 MB


In [22]:
from pandas.api.types import is_float_dtype

In [23]:
for col in df.columns:
                if is_float_dtype(df[col]):
                    df[col] = pd.to_numeric(df[col], downcast="float")
                else:
                    df[col] = pd.to_numeric(df[col], downcast="integer")

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 348551 entries, 5 to 11708860
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movie_id   348551 non-null  int32  
 1   tag_id     348551 non-null  int16  
 2   relevance  348551 non-null  float32
dtypes: float32(1), int16(1), int32(1)
memory usage: 6.0 MB
