# Stats Course 2023

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

### Load the dataset

In [3]:
# Read the combined train/test CSV; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='data/train_test_2023.csv',
    encoding='ISO-8859-1',
)

### Preprocess the dataset

In [5]:
def preprocess_data(df):
    """
    Preprocess the dataset for XGBoost model training.

    The target column is taken to be the column named '' (empty string),
    which is renamed to 'target', or a column already named 'target'.
    Object-dtype feature columns are one-hot encoded; all other feature
    columns are passed through unchanged. The encoded features and the
    target are then split 80/20 into training and testing sets with a
    fixed random seed (42).

    :param df: Pandas DataFrame containing the dataset. It is not modified.
    :return: Tuple of (X_train, X_test, y_train, y_test)
    :raises KeyError: If the DataFrame has neither a '' nor a 'target' column.
    """
    # Rename on a copy so the caller's DataFrame is left untouched.
    df = df.rename(columns={'': 'target'})

    # Fail early with a clear message instead of an opaque Index.drop error.
    if 'target' not in df.columns:
        raise KeyError(
            "Target column not found: expected a column named '' or 'target'."
        )

    # Splitting the dataset into features and target
    X = df.drop('target', axis=1)
    y = df['target']

    # Categorical columns are looked up on the features only, so this works
    # whether the target itself is object-dtype or numeric.
    categorical_cols = X.select_dtypes(include=['object']).columns

    # One-hot encode categorical features; leave the remaining columns as-is.
    transformer = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        remainder='passthrough'
    )

    # NOTE(review): the encoder is fitted on all rows before the split, so
    # its category vocabulary also covers rows that end up in the test set.
    # Kept as-is to preserve the existing output columns.
    X_transformed = transformer.fit_transform(X)

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test


In [6]:
# Run the preprocessing on the loaded DataFrame and unpack the four splits.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = preprocess_data(df)