In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Read the training CSV and the holdout CSV into separate DataFrames.
train_data = pd.read_csv("../data/cell2celltrain.csv")
holdout_data = pd.read_csv("../data/cell2cellholdout.csv")

# Discard the CustomerID column from each frame.
train_data = train_data.drop(columns=['CustomerID'])
holdout_data = holdout_data.drop(columns=['CustomerID'])

# Preprocessing: impute, label-encode and standardise the feature columns.
#
# The target is handled separately from the features: standardising it would
# turn the class labels into continuous floats, and fitting its encoder on the
# holdout frame would let filled-in holdout values add an extra class.
target_col = 'Churn'
feature_cols = [col for col in train_data.columns if col != target_col]

# Handle missing values BEFORE Label Encoding
for col in feature_cols:
    if train_data[col].dtype == 'object':
        fill_value = 'Unknown'
    else:
        # Impute both frames with the training mean so that train and holdout
        # go through exactly the same transformation.
        fill_value = train_data[col].mean()
    train_data[col] = train_data[col].fillna(fill_value)
    holdout_data[col] = holdout_data[col].fillna(fill_value)

# Convert categorical features to numerical features
for col in feature_cols:
    if train_data[col].dtype == 'object':
        le = LabelEncoder()
        # Fit on the combined values so that categories occurring only in the
        # holdout frame can still be transformed.
        le.fit(pd.concat([train_data[col], holdout_data[col]], axis=0))
        train_data[col] = le.transform(train_data[col])
        holdout_data[col] = le.transform(holdout_data[col])

# Encode the target on the training frame only. A target column in the
# holdout frame (if there is one) is left untouched.
if train_data[target_col].dtype == 'object':
    # NOTE(review): as before, a missing training label becomes its own
    # 'Unknown' class; consider dropping such rows instead.
    target_encoder = LabelEncoder()
    train_data[target_col] = target_encoder.fit_transform(
        train_data[target_col].fillna('Unknown')
    )

# Scale numerical features (the target column is deliberately excluded).
# The scaler is fit on the training frame and re-used for the holdout frame.
numerical_cols = train_data[feature_cols].select_dtypes(include=['number']).columns
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
holdout_data[numerical_cols] = scaler.transform(holdout_data[numerical_cols])

# Report the dimensions of both frames.
for label, frame in (
    ("Train data shape:", train_data),
    ("Holdout data shape:", holdout_data),
):
    print(label, frame.shape)

# Separate the 'Churn' column from the remaining columns, then set aside 20%
# of the rows for validation; the fixed random_state makes the split repeatable.
y = train_data['Churn']
X = train_data.drop(columns=['Churn'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each resulting piece.
for label, part in (
    ("X_train shape:", X_train),
    ("X_val shape:", X_val),
    ("y_train shape:", y_train),
    ("y_val shape:", y_val),
):
    print(label, part.shape)