# Proyecto de MLOps

In [82]:
import pandas as pd
import numpy as np
import logging

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [118]:
class MLPipeLine:
    """
    """

    def __init__( self, data_path, target_column, test_size = 0.2 ):
        """
        Initialize the MLPipeline.

        Args:
            data_path (str): Path to the CSV data file.
            target_column (str): Name of the target column.
            test_size (float, optional): Proportion of the dataset to include in the test split. Defaults to 0.2.
            random_state (int, optional): Random state for reproducibility. Defaults to 42.
        """
        self.data_path = data_path
        self.target_column = target_column
        
        self.test_size = test_size
        self.random_state = 42
        
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        
        self.pipeline = None
        self.best_model = None
        self.target_encoder = None


    def load_data(self):
        """ """
        df_data = pd.read_csv(self.data_path)

        df_data = df_data.rename(columns = { 'Classes  ': 'Classes'})
        df_data['Classes'] = df_data['Classes'].fillna('fire')
        df_data['Classes'] = df_data['Classes'].apply(lambda x: x.strip())


        X = df_data.drop(self.target_column, axis = 1)
        y = df_data[self.target_column]

        # Transform categorical target to numeric
        self.target_encoder = LabelEncoder()
        self.target_encoder.fit(y)
        y = self.target_encoder.transform(y)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size = self.test_size, random_state = self.random_state)

        logging.info(f"Data loaded and split. Training set size: {len(self.X_train)}, Test set size: {len(self.X_test)}")

    def data_cleaning(self):
        pass

    def transform_data(self):        
        pass

    def create_pipeline(self):
        """Create the scikit-learn pipeline with preprocessing steps and the classifier."""
        
        numeric_features = self.X_train.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X_train.select_dtypes(include=['object']).columns
        
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        ### Implementar funciones personalizadas
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression())
        ])
        logging.info("Pipeline created successfully")


    def train_model(self, param_grid):
        """
        Train the model using GridSearchCV for hyperparameter tuning.

        Args:
            param_grid: Dictionary with parameters names as keys and lists of parameter settings to try as values.
        """
        grid_search = GridSearchCV(self.pipeline, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)
        self.best_model = grid_search.best_estimator_
        logging.info(f"Model trained. Best parameters: {grid_search.best_params_}")


    def predict(self, new_data):
        predictions = self.best_model.predict(new_data)
        predictions = self.target_encoder.inverse_transform(predictions)
        return predictions


    
        

In [122]:
#if __name__ == "__main__":


ml_pipeline = MLPipeLine('data/Algerian_forest_fires_dataset_UPDATE_RegionAdd.csv', 'Classes')
ml_pipeline.load_data()
ml_pipeline.create_pipeline()

param_grid = {
    'classifier__penalty': ['l1', 'l2', 'elasticnet', None],  # tipo de penalización
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # inverso de la regularización
    'classifier__max_iter': [100, 200, 300],  # número máximo de iteraciones
}

ml_pipeline.train_model(param_grid)

2024-10-01 23:39:27,062 - INFO - Data loaded and split. Training set size: 195, Test set size: 49
2024-10-01 23:39:27,063 - INFO - Pipeline created successfully
150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Omar\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Omar\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Omar\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit


In [123]:
predictions = ml_pipeline.predict(ml_pipeline.X_test)
print("Predictions for new data:", predictions)


Predictions for new data: ['fire' 'fire' 'fire' 'not fire' 'fire' 'not fire' 'fire' 'not fire'
 'not fire' 'not fire' 'not fire' 'fire' 'fire' 'not fire' 'fire' 'fire'
 'fire' 'not fire' 'fire' 'fire' 'fire' 'not fire' 'fire' 'not fire'
 'fire' 'not fire' 'fire' 'fire' 'not fire' 'fire' 'fire' 'not fire'
 'fire' 'fire' 'not fire' 'not fire' 'not fire' 'fire' 'not fire'
 'not fire' 'fire' 'fire' 'not fire' 'not fire' 'fire' 'fire' 'fire'
 'not fire' 'fire']
