In [None]:
# Define the feature columns and target column
target = 'churn'
features = telecom_data_cleaned_no_duplicates.columns.drop(target)

# Step 1: Split the data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(
    telecom_data_cleaned_no_duplicates[features],
    telecom_data_cleaned_no_duplicates[target],
    test_size=0.4,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Define categorical and numerical features (decided from the training split only)
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Step 2: Define ONE preprocessor. Its scaling statistics and category lists are
# learned from the training split only; fitting a separate transformer on the
# validation/test splits would scale them differently and could produce one-hot
# columns that do not line up with the training matrix.
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category absent from the training split is
        # encoded as all zeros instead of raising during transform.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
    ]
)

# Kept as aliases so later cells that reference these names keep working;
# all three names refer to the same training-fitted transformer.
val_preprocessor = train_preprocessor
test_preprocessor = train_preprocessor

# Step 3: Fit on the training set only, then apply the fitted transformer to the others
X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = train_preprocessor.transform(X_val)
X_test_preprocessed = train_preprocessor.transform(X_test)

# Display shapes of the preprocessed sets to confirm successful transformation
print("Training set shape:", X_train_preprocessed.shape)
print("Validation set shape:", X_val_preprocessed.shape)
print("Test set shape:", X_test_preprocessed.shape)

16. Let's also encode and scale the data so that it is ready to be run through different models.
17. Now let's continue preparing the data for machine learning, beginning with one-hot encoding. It works better than manually mapping the categories to values like 0, 1, 2, which would introduce a FALSE sense of order where there is none (e.g. "None" < "DSL" < "Fiber optic"). These are separate categories that do not follow any natural order, and the model could otherwise assume false relationships between them.
18. Now let's continue preparing the data for machine learning, beginning with one-hot encoding. It works better than manually mapping the categories to values like 0, 1, 2, which would introduce a FALSE sense of order where there is none (e.g. "None" < "DSL" < "Fiber optic"). These are separate categories that do not follow any natural order, and the model could otherwise assume false relationships between them.
19. For that we will have to convert the categorical columns into dummy variables, which I assume will add new features, since the encoding works with True (1) and False (0) and some columns have 3 values.
20. Let's first split the data into training, validation and test sets, so that we don't encode and scale the whole dataset at once, which would cause data leakage. We will then scale and encode each set separately so that no knowledge leaks into the test set.

In [None]:
def split_and_preprocess_data(data_no_monthly_tenure, target_column, test_size=0.4, val_size=0.5, random_state=42):
    """
    Split a dataset into training, validation and test sets and preprocess them.

    Numerical columns are standardised and categorical (object-dtype) columns are
    one-hot encoded. A single preprocessor is fitted on the training set only and
    then applied to the validation and test sets, so every set is scaled with the
    same statistics and ends up with the same output columns.

    Parameters:
    - data_no_monthly_tenure (pd.DataFrame): The dataset to split and preprocess.
    - target_column (str): The name of the target column.
    - test_size (float): The proportion of data to set aside for validation + testing.
    - val_size (float): The proportion of that held-out data to use for validation;
      the rest of it becomes the test set.
    - random_state (int): The random seed for reproducibility.

    Returns:
    - X_train_preprocessed, X_val_preprocessed, X_test_preprocessed: Preprocessed training, validation, and test sets.
    - y_train, y_val, y_test: Target values for training, validation, and test sets.
    - train_preprocessor: The ColumnTransformer fitted on the training set (for feature names,
      or for transforming new data in the same way).
    """

    # Split the data: first carve off the held-out part, then divide it into validation/test
    features = data_no_monthly_tenure.columns.drop(target_column)
    X_train, X_temp, y_train, y_temp = train_test_split(
        data_no_monthly_tenure[features],
        data_no_monthly_tenure[target_column],
        test_size=test_size,
        random_state=random_state,
    )
    # X_val is the "train" side of this second split, so val_size is passed as
    # train_size to make it really control the validation share.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        train_size=val_size,
        random_state=random_state,
    )

    # Define categorical and numerical features (decided from the training split only)
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()

    # One preprocessor for all sets; its parameters are learned from training data only
    train_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            # handle_unknown='ignore': a category absent from the training split is
            # encoded as all zeros instead of raising during transform.
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features),
        ]
    )

    # Fit on the training set, then reuse the fitted transformer for validation and test
    X_train_preprocessed = train_preprocessor.fit_transform(X_train)
    X_val_preprocessed = train_preprocessor.transform(X_val)
    X_test_preprocessed = train_preprocessor.transform(X_test)

    # Display shapes of the preprocessed sets
    print("Training set shape:", X_train_preprocessed.shape)
    print("Validation set shape:", X_val_preprocessed.shape)
    print("Test set shape:", X_test_preprocessed.shape)

    return X_train_preprocessed, X_val_preprocessed, X_test_preprocessed, y_train, y_val, y_test, train_preprocessor

# Example run: `data_no_monthly_tenure` is the input DataFrame and 'churn' is its target column.
(
    X_train_preprocessed,
    X_val_preprocessed,
    X_test_preprocessed,
    y_train,
    y_val,
    y_test,
    train_preprocessor,
) = split_and_preprocess_data(data_no_monthly_tenure, target_column='churn')

# The returned training preprocessor exposes the names of the generated output columns.
feature_names = train_preprocessor.get_feature_names_out()