In [None]:
from random import random
from sklearn.datasets import load_breast_cancer
import pandas as pd


# Get Data

In [None]:
def get_noise_date(seed=None):
    """Build a frame holding one pure-noise feature per breast-cancer sample.

    The result has one row per sample of ``load_breast_cancer()`` and three
    columns: ``noise`` (uniform random floats in [0, 1)), ``target`` (the
    dataset labels) and ``id`` (the positional row number 0..n-1), so it can be
    joined with the frame produced by ``get_data`` on ``['id', 'target']``.

    Parameters
    ----------
    seed : int or None, optional
        When given, the noise is drawn from a private ``random.Random(seed)``
        generator, making the column reproducible. When ``None`` (default) the
        module-level ``random()`` is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``noise``, ``target``, ``id`` in that order.
    """
    # Local import: the notebook's import cell only brings in the bare `random()` function.
    from random import Random

    data = load_breast_cancer()
    n_rows = len(data.target)

    # A dedicated generator keeps seeding local instead of touching global random state.
    draw = random if seed is None else Random(seed).random

    df_fake = pd.DataFrame([draw() for _ in range(n_rows)], columns=["noise"])
    df_fake["target"] = data.target
    # The default RangeIndex is the same positional id that get_data() assigns.
    df_fake["id"] = df_fake.index.values

    return df_fake

def get_data():
    """Load the breast-cancer dataset as a single DataFrame.

    Returns
    -------
    pandas.DataFrame
        One column per dataset feature, followed by ``target`` (the labels)
        and ``id`` (the positional row number taken from the default index).
    """
    bunch = load_breast_cancer()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # assign() appends the new columns in keyword order: target first, then id.
    return features.assign(target=bunch.target, id=features.index.values)

In [None]:
# Both frames come from the same dataset load, so they share the 'id' and 'target' columns.
df_fake = get_noise_date()
df = get_data()

# Join Data

In [None]:
def join_data(df1, df2):
    """
    Inner-join two DataFrames on their shared 'id' and 'target' columns.

    Joining on both keys keeps a single 'target' column in the result instead
    of suffixed duplicates. The 'id' column is only needed for matching and is
    dropped before returning.
    """
    key_columns = ['id', 'target']
    joined = df1.merge(df2, how='inner', on=key_columns)
    return joined.drop(columns='id')

In [None]:
# Attach the noise column to the real features; join_data drops 'id' after matching.
merged_df = join_data(df, df_fake)

merged_df.head()

# Split Data Into Train, Val and Test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def split_data(df):
    """Split a labelled frame into train, validation and test frames.

    30% of the rows are held out first and then halved, giving a
    70% / 15% / 15% split. Both splits use ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'target' column; every other column is a feature.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with the feature columns
        followed by 'target'.
    """
    target = df['target']
    features = df.drop(columns=['target'])

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Put each feature matrix back together with its labels.
    parts = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    return tuple(pd.concat([X_part, y_part], axis=1) for X_part, y_part in parts)

In [None]:
# 70% train / 15% validation / 15% test, as defined in split_data.
df_train, df_val, df_test = split_data(merged_df)

In [None]:
# Sanity check: (rows, columns) of the full merged dataset before splitting.
merged_df.shape

In [None]:
# Sanity check: (rows, columns) of the training split.
df_train.shape


# Preprocess Data

In [None]:
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:

def preprocess_data(df_train):
    """Standardise the training features and return them with the fitted scaler.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing a 'target' column; all other columns are
        treated as features and scaled.

    Returns
    -------
    df_scaled : pandas.DataFrame
        Scaled feature columns followed by the unchanged 'target' column,
        carrying the same row index as ``df_train``.
    scaler : StandardScaler
        The scaler fitted on the training features.
    """
    X = df_train.drop(columns=['target'])
    y = df_train['target']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # fit_transform returns a bare array, so the row labels must be restored.
    # Without index=X.index the frame would get a fresh 0..n-1 index and
    # pd.concat(axis=1) would align it against y's original (shuffled) labels,
    # producing mismatched rows padded with NaN.
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    df_scaled = pd.concat([X_scaled_df, y], axis=1)

    return df_scaled, scaler


# Fit the scaler on the training split only; the fitted scaler is passed to train_model below.
df_scaled, scaler = preprocess_data(df_train)

# Training (Hyperparameter Tuning)

In [None]:
import optuna
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
def _build_model(params, random_state=None):
    """Create the unfitted classifier described by a hyperparameter dict.

    ``params['model']`` selects the family: ``'LogisticRegression'`` builds an
    elastic-net logistic regression from ``C`` and ``l1_ratio``; any other
    value builds a random forest from ``n_estimators``, ``max_depth`` and
    ``min_samples_split``.
    """
    if params['model'] == 'LogisticRegression':
        return LogisticRegression(
            penalty='elasticnet',
            C=params['C'],
            solver='saga',
            l1_ratio=params['l1_ratio'],
            random_state=random_state,
        )

    return RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=random_state,
    )


def train_model(df_train, df_val,  scaler, num_of_trials: int = 100, random_state=None):
    """Tune a classifier with Optuna and return it inside a scaler + model pipeline.

    Each trial picks a model family (logistic regression or random forest) and
    its hyperparameters, fits it on the scaled training features and is scored
    by accuracy on the scaled validation features. The best configuration is
    then rebuilt and fitted inside a ``Pipeline`` on the raw training features.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames; each must contain a 'target' column.
    scaler : transformer
        Must already be fitted, because its ``transform`` is used during the
        search. The same object becomes the first pipeline step, so it is
        fitted again on the training features when the pipeline is fitted.
    num_of_trials : int
        Number of Optuna trials to run.
    random_state : int or None, optional
        Seed forwarded to the Optuna sampler and to the models. ``None``
        (default) leaves everything unseeded, as before.

    Returns
    -------
    final_pipeline : sklearn.pipeline.Pipeline
        Fitted pipeline with steps ``'scaler'`` and ``'model'``.
    val_accuracy : float
        Accuracy of ``final_pipeline`` on the validation frame. The same data
        drove the model selection, so this is an optimistic estimate.
    """
    X_train = df_train.drop(columns=['target'])
    y_train = df_train['target']

    X_val = df_val.drop(columns=['target'])
    y_val = df_val['target']

    # Scale once up front so every trial reuses the same arrays.
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    def objective(trial):
        """Optuna objective: validation accuracy of one sampled configuration."""
        params = {
            'model': trial.suggest_categorical('model', ['LogisticRegression', 'RandomForest'])
        }

        if params['model'] == 'LogisticRegression':
            params['C'] = trial.suggest_float('C', 1e-4, 1e2, log=True)
            params['l1_ratio'] = trial.suggest_float('l1_ratio', 0, 1)
        else:
            params['n_estimators'] = trial.suggest_int('n_estimators', 10, 100)
            params['max_depth'] = trial.suggest_int('max_depth', 2, 8)
            params['min_samples_split'] = trial.suggest_int('min_samples_split', 2, 10)

        model = _build_model(params, random_state)
        model.fit(X_train_scaled, y_train)
        return accuracy_score(y_val, model.predict(X_val_scaled))

    if random_state is None:
        study = optuna.create_study(direction='maximize')
    else:
        # Seeding the sampler makes the sequence of suggested trials repeatable.
        sampler = optuna.samplers.TPESampler(seed=random_state)
        study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=num_of_trials)

    # Rebuild the winner through the same factory the trials used, so the two
    # code paths cannot drift apart.
    best_model = _build_model(study.best_trial.params, random_state)

    final_pipeline = Pipeline([
        ('scaler', scaler),
        ('model', best_model)
    ])

    # Train the final pipeline on the full training data
    final_pipeline.fit(X_train, y_train)

    # Evaluate the model on the validation data
    y_val_pred = final_pipeline.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    return final_pipeline, val_accuracy




In [None]:
# Run a shorter search here: 50 trials instead of the function's default of 100.
final_pipeline, val_accuracy = train_model(df_train, df_val,  scaler, num_of_trials=50)

# Evaluate the model

In [None]:
def evaluatetrain_model(df_train, df_test, model):
    """Report the accuracy of a fitted model on the train and test frames.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing a 'target' column; all other columns are passed to
        ``model.predict`` as features.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``.
    """
    def _accuracy_on(frame):
        # Separate labels from features, then score the model's predictions.
        features = frame.drop(columns=['target'])
        labels = frame['target']
        return accuracy_score(labels, model.predict(features))

    train_accuracy = _accuracy_on(df_train)
    test_accuracy = _accuracy_on(df_test)
    return train_accuracy, test_accuracy




    

In [None]:
# Score the tuned pipeline on the training split and on the test split.
train_accuracy, test_accuracy = evaluatetrain_model(df_train, df_test, final_pipeline)

In [None]:
# First line: accuracy on the training split; second line: accuracy on the test split.
print(train_accuracy)
print(test_accuracy)