In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [10]:
# Settings
CSV_PATH = "data.csv"

# Pick a target column that exists in the CSV.
TARGET = "Industry"

# Train / execution / test split
TEST_SIZE = 0.20
EXEC_SIZE = 0.20  # fraction of the "remainder" (what is left once the test set is removed)
RANDOM_STATE = 42

In [11]:
# Load the raw CSV and print a quick health check.

df_raw = pd.read_csv(CSV_PATH)
print("Colonnes:", list(df_raw.columns))
print("Shape brut:", df_raw.shape)

# Number of fully identical rows
print("Doublons:", int(df_raw.duplicated().sum()))

# Missing values per column, largest count first; only columns that
# actually have gaps are shown (at most 20 of them).
na_counts = df_raw.isna().sum().sort_values(ascending=False)
print("\nTop NA:", na_counts[na_counts > 0].head(20), sep="\n")

# Preview of the first rows (rendered as the cell output)
df_raw.head()

Colonnes: ['Index', 'Organization Id', 'Name', 'Website', 'Country', 'Description', 'Founded', 'Industry', 'Number of employees']
Shape brut: (1000, 9)
Doublons: 0

Top NA:
Organization Id        3
Number of employees    2
Country                2
Industry               2
Description            2
Website                1
Name                   1
Founded                1
dtype: int64


Unnamed: 0,Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,1,E84A904909dF528,Liu-Hoover,http://www.day-hartman.org/,Western Sahara,Ergonomic zero administration knowledge user,1980.0,Online Publishing,6852.0
1,2,AAC4f9aBF86EAeF,Orr-Armstrong,https://www.chapman.net/,Algeria,Ergonomic radical budgetary management,1970.0,Import / Export,
2,3,ad2eb3C8C24DB87,Gill-Lamb,http://lin.com/,,Programmable intermediate conglomeration,2005.0,Apparel / Fashion,5105.0
3,4,D76BB12E5eE165B,Bauer-Weiss,https://gillespie-stout.com/,United States of America,Synergistic maximized definition,2015.0,Dairy,
4,5,2F31EddF2Db9aAE,Love-Palmer,https://kramer.com/,Denmark,Optimized optimizing moderator,2010.0,Management Consulting,6991.0


In [12]:
def basic_clean(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Cells holding an empty or whitespace-only string become ``pd.NA``.
      2. Exact duplicate rows are removed.
      3. The columns "Unnamed: 0" and "Index" are dropped when they look like
         a surrogate row index (numeric dtype and all values unique).
      4. Duplicate rows are removed a second time: rows that differed only by
         the dropped index column(s) are identical once those columns are gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.

    Returns
    -------
    pandas.DataFrame
        New frame without blank strings, index-like columns or duplicate rows.
        Row labels of the surviving rows are kept as-is (no index reset).
    """
    # Turn "empty" cells ("", "   ") into NA; replace() returns a new frame,
    # so the caller's DataFrame is never modified.
    cleaned = df.replace(r"^\s*$", pd.NA, regex=True)

    # First pass on exact duplicates. It has to run before the uniqueness test
    # below: a fully duplicated row would make the index column non-unique and
    # prevent it from being recognised as an index.
    cleaned = cleaned.drop_duplicates()

    # Typical index column names, dropped only when they behave like an index.
    index_like = [
        col
        for col in ("Unnamed: 0", "Index")
        if col in cleaned.columns
        and cleaned[col].is_unique
        and pd.api.types.is_numeric_dtype(cleaned[col])
    ]
    if index_like:
        # Second pass: without the surrogate index, rows carrying the same
        # data under different index values are now exact duplicates.
        cleaned = cleaned.drop(columns=index_like).drop_duplicates()

    return cleaned


# Clean the raw frame, then report the resulting shape and remaining duplicates.
df = basic_clean(df_raw)
print("Shape après clean:", df.shape)
print("Doublons après clean:", int(df.duplicated().sum()))

# Preview of the cleaned frame (rendered as the cell output)
df.head()

Shape après clean: (1000, 8)
Doublons après clean: 0


Unnamed: 0,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees
0,E84A904909dF528,Liu-Hoover,http://www.day-hartman.org/,Western Sahara,Ergonomic zero administration knowledge user,1980.0,Online Publishing,6852.0
1,AAC4f9aBF86EAeF,Orr-Armstrong,https://www.chapman.net/,Algeria,Ergonomic radical budgetary management,1970.0,Import / Export,
2,ad2eb3C8C24DB87,Gill-Lamb,http://lin.com/,,Programmable intermediate conglomeration,2005.0,Apparel / Fashion,5105.0
3,D76BB12E5eE165B,Bauer-Weiss,https://gillespie-stout.com/,United States of America,Synergistic maximized definition,2015.0,Dairy,
4,2F31EddF2Db9aAE,Love-Palmer,https://kramer.com/,Denmark,Optimized optimizing moderator,2010.0,Management Consulting,6991.0


In [13]:
def split_features_target(df, target):
    """Split ``df`` into a feature frame and the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the features and the target.
    target : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(features, target_values)``: ``df`` without the ``target`` column,
        and the result of ``df[target]``.

    Raises
    ------
    ValueError
        If ``target`` is not one of the columns of ``df``.
    """
    if target in df.columns:
        features = df.drop(columns=[target])
        target_values = df[target]
        return features, target_values

    raise ValueError(f"TARGET='{target}' introuvable. Colonnes: {list(df.columns)}")


def infer_column_types(X):
    """Partition the columns of ``X`` into numeric-like columns and the rest.

    Columns selected by ``select_dtypes(include=["number", "bool"])`` are
    reported as numeric; every remaining column is reported as categorical.

    Returns
    -------
    tuple[list, list]
        ``(numeric_cols, categorical_cols)``.
    """
    numeric_like = list(X.select_dtypes(include=["number", "bool"]).columns)

    # Everything not picked up as numeric/bool is treated as categorical.
    others = []
    for name in X.columns:
        if name not in numeric_like:
            others.append(name)

    return numeric_like, others


# Separate the features from the target, then classify the feature columns.
# NOTE(review): the loading diagnostic reports missing values in "Industry"
# (the current TARGET) — confirm that rows with a missing target are handled
# before any model is fitted.
X, y = split_features_target(df, TARGET)
y = y.to_frame()  # keep the target as a one-column DataFrame (2-D)
num_cols, cat_cols = infer_column_types(X)

print("X:", X.shape)
print("y:", y.shape)
print("Num cols:", num_cols)
print("Cat cols:", cat_cols)

X: (1000, 7)
y: (1000, 1)
Num cols: ['Founded', 'Number of employees']
Cat cols: ['Organization Id', 'Name', 'Website', 'Country', 'Description']


## Split: train / execution / test

On découpe en 3 ensembles:
- **train**: pour *fit* (apprendre) l’imputation, le scaling, et le one-hot
- **execution** (validation): pour tester/ajuster sans toucher au test final
- **test**: uniquement pour la mesure finale (pas de fit dessus)

In [14]:
# Three-way split: train / execution / test

# Both ratios must be strict fractions, otherwise the split cannot be made.
if not (0 < TEST_SIZE < 1):
    raise ValueError("TEST_SIZE doit être entre 0 et 1")
if not (0 < EXEC_SIZE < 1):
    raise ValueError("EXEC_SIZE doit être entre 0 et 1")

# Step 1: set the final test set aside.
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Step 2: split what remains into train + execution.
# EXEC_SIZE is a fraction of that remainder, not of the full dataset.
X_train, X_exec, y_train, y_exec = train_test_split(
    X_tmp, y_tmp, test_size=EXEC_SIZE, random_state=RANDOM_STATE
)

print("train:", X_train.shape, y_train.shape)
print("execution:", X_exec.shape, y_exec.shape)
print("test:", X_test.shape, y_test.shape)

train: (640, 7) (640, 1)
execution: (160, 7) (160, 1)
test: (200, 7) (200, 1)
