# **Setup**

In [1]:
# Importación de las librerías necesarias
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle




In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
%load_ext autoreload
%autoreload 2

import os
import sys

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading and exploring the data

In [51]:
# data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/MNA/MLOps/data/Algerian_forest_fires_dataset_UPDATE_RegionAdd.csv",sep=',', header='infer')

def load_data(filepath):
    """Read the CSV found at ``filepath`` and return it as a DataFrame.

    ``filepath`` is handed to ``pd.read_csv`` unchanged, with all parser
    options left at their defaults.
    """
    return pd.read_csv(filepath)

# Visualizing the data

# Preprocessing and feature engineering

In [48]:
def data_cleanup(input_file, output_file):
    """Clean the raw forest-fires CSV and write the result to ``output_file``.

    Steps performed:

    1. Read ``input_file`` and print the raw frame and its column index.
    2. Strip surrounding whitespace from every column name.
    3. Strip surrounding whitespace from the ``Classes`` labels.
    4. Drop every row that contains at least one null value.
    5. Cast ``Classes``/``Region`` to ``category``, ``day``/``month``/``year``
       to ``int64`` and the ten weather/index columns to ``float64``.
    6. Write the cleaned frame as CSV without the index column.

    Parameters
    ----------
    input_file : str or path-like
        Location of the raw CSV file.
    output_file : str or path-like
        Destination of the cleaned CSV file.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is missing after the headers have
        been stripped.
    """
    data = pd.read_csv(input_file, sep=',', header='infer')

    print(data)
    print(f'Columnas: {data.columns}')

    # The raw headers carry stray blanks (' RH', ' Ws', 'Rain ', 'Classes  ').
    # Stripping every header handles those four and any other padded name,
    # instead of depending on a hand-written rename map that must list each
    # exact misspelling.
    data = data.rename(
        columns=lambda nombre: nombre.strip() if isinstance(nombre, str) else nombre
    )

    # The class labels also come padded with blanks; missing labels stay
    # null here and are removed by the dropna() below.
    data['Classes'] = data['Classes'].str.strip()

    data_limpia = data.dropna()

    # Target dtype for each column group, applied in a single astype() call.
    columnas_categoricas = ['Classes', 'Region']
    columnas_enteras = ['day', 'month', 'year']
    columnas_continuas = ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']

    tipos = {columna: 'category' for columna in columnas_categoricas}
    tipos.update({columna: 'int64' for columna in columnas_enteras})
    tipos.update({columna: 'float64' for columna in columnas_continuas})

    data_transformada = data_limpia.astype(tipos)

    data_transformada.to_csv(output_file, index=False)



**Aplicar técnicas de preprocesamiento como normalización, codificación de variables categóricas y reducción de dimensionalidad.**

In [77]:
def data_pre_proc(input_file, output_dir='../data/processed'):
    """Split the cleaned dataset and write the transformed train/test CSVs.

    The cleaned CSV is loaded with ``load_data``. ``Classes`` is used as the
    target; the remaining columns, minus ``day``/``month``/``year``, are the
    features. After an 80/20 split with a fixed seed, a ``ColumnTransformer``
    is fitted on the training rows only and then applied to both partitions:

    * continuous columns: ``MinMaxScaler`` -> ``StandardScaler`` -> ``PCA``
      configured with ``n_components=0.95``;
    * ``Region``: one-hot encoding.

    Parameters
    ----------
    input_file : str or path-like
        Location of the cleaned CSV file.
    output_dir : str, default '../data/processed'
        Directory that receives ``X_train.csv``, ``X_test.csv``,
        ``y_train.csv`` and ``y_test.csv``. It is created when missing.

    Returns
    -------
    None
    """
    data_transformada = load_data(input_file)

    columnas_continuas = ['Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']
    columnas_categoricas = ['Region']

    y = data_transformada['Classes']

    # Build the feature frame without mutating the loaded data. The date
    # parts are discarded as non-informative. No dtype filter is needed:
    # the ColumnTransformer below picks its input columns by name.
    X = data_transformada.drop(columns=['Classes', 'day', 'month', 'year'])

    # Hold out 20% of the rows; the fixed seed keeps the split reproducible.
    X_train_base, X_test_base, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Continuous variables: scaling followed by PCA.
    # NOTE(review): StandardScaler applied after MinMaxScaler should yield the
    # same values as StandardScaler alone; kept so outputs do not change.
    numericas_pipeline = Pipeline(steps=[
        ('minmax', MinMaxScaler()),
        ('scaler', StandardScaler()),
        ('PCA', PCA(n_components=0.95)),
    ])

    # Categorical variables: one-hot encoding.
    catOHE_pipeline = Pipeline(steps=[
        ('OneHotEncoder', OneHotEncoder()),
    ])

    # sparse_threshold=0 forces a dense result, so wrapping the output in a
    # DataFrame below never receives a scipy sparse matrix, regardless of how
    # many one-hot columns are produced.
    ct = ColumnTransformer(
        transformers=[
            ('numericas_continuas', numericas_pipeline, columnas_continuas),
            ('categoricas', catOHE_pipeline, columnas_categoricas),
        ],
        sparse_threshold=0,
    )

    # Fit on the training rows only, then reuse the fitted transformer on the
    # test rows so no test information leaks into the scaling/PCA.
    X_train = ct.fit_transform(X_train_base)
    X_test = ct.transform(X_test_base)

    os.makedirs(output_dir, exist_ok=True)
    salidas = {
        'X_train.csv': X_train,
        'X_test.csv': X_test,
        'y_train.csv': y_train,
        'y_test.csv': y_test,
    }
    for nombre_archivo, contenido in salidas.items():
        pd.DataFrame(contenido).to_csv(os.path.join(output_dir, nombre_archivo), index=False)



# Training the model

In [86]:
def train_model(data_dir='../data/processed', model_path='../models/modelLR.pkl'):
    """Fit a logistic-regression classifier and pickle it to disk.

    Parameters
    ----------
    data_dir : str, default '../data/processed'
        Directory containing ``X_train.csv`` and ``y_train.csv``.
    model_path : str, default '../models/modelLR.pkl'
        File the fitted estimator is pickled to.

    Returns
    -------
    LogisticRegression
        The fitted estimator (the same object that was pickled).
    """
    X_train = pd.read_csv(os.path.join(data_dir, 'X_train.csv'))
    y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

    modeloRL = LogisticRegression(penalty='l2',
                                  C=100,
                                  solver='liblinear',
                                  max_iter=1000,
                                  random_state=55)

    # The labels are read back as a DataFrame; flatten them to a 1-D array
    # before handing them to fit().
    modeloRL.fit(X_train, np.ravel(y_train))

    with open(model_path, 'wb') as f:
        pickle.dump(modeloRL, f)
    return modeloRL

# Evaluating the model

In [98]:
def evaluate_model(modelLR, data_dir='../data/processed'):
    """Print accuracy, confusion matrix and classification report for a model.

    Parameters
    ----------
    modelLR : fitted estimator
        Object exposing ``predict``; it is called on both the training and
        the test feature matrices.
    data_dir : str, default '../data/processed'
        Directory containing ``X_train.csv``, ``X_test.csv``, ``y_train.csv``
        and ``y_test.csv``.

    Returns
    -------
    None
        All results are printed to standard output.
    """
    X_train = pd.read_csv(os.path.join(data_dir, 'X_train.csv'))
    X_test = pd.read_csv(os.path.join(data_dir, 'X_test.csv'))
    y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
    y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))

    print(">>Exactitud (Accuracy) de los conjuntos de Entrenamiento y Validación con Logistic Regresion:")
    y_pred_trainRL = modelLR.predict(X_train)
    y_pred_testRL = modelLR.predict(X_test)
    # Reporting train and test accuracy side by side makes over-fitting visible.
    print('accuracy-train', metrics.accuracy_score(y_train, y_pred_trainRL))
    print('accuracy-test', metrics.accuracy_score(y_test, y_pred_testRL))

    print("\n>>Matriz de Confusión:")
    print(metrics.confusion_matrix(y_test, y_pred_testRL))

    print("\n>>Reporte varias métricas:")
    print(metrics.classification_report(y_test, y_pred_testRL))


# Cross Validation

In [95]:
def cross_validate_model(model, X, y, num_cross_validation = 5):
    """Cross-validate ``model`` on ``X``/``y`` and print the mean fold score.

    ``num_cross_validation`` is forwarded to ``cross_val_score`` as ``cv``.
    Nothing is returned; the average is only printed.
    """
    fold_scores = cross_val_score(model, X, y, cv=num_cross_validation)
    mean_score = np.mean(fold_scores)
    print("Average Accuracy with CV:", mean_score)

In [99]:
# Main function for running the pipeline
def main(filepath, output_file = '../data/processed/Algerian_forest_fires_dataset_clean.csv'):
    """Run the whole pipeline: cleanup, preprocessing, training, evaluation.

    ``filepath`` is the raw CSV; ``output_file`` is where the cleaned CSV is
    written and from where the preprocessing step reads it back.
    """
    # Stage 1: raw CSV -> cleaned CSV.
    data_cleanup(filepath, output_file)
    # Stage 2: cleaned CSV -> transformed train/test splits on disk.
    data_pre_proc(output_file)
    # Stages 3 and 4: fit the classifier and report its metrics.
    evaluate_model(train_model())

In [100]:
# Location of the raw dataset, given as a relative path.
file_path = '../data/Algerian_forest_fires_dataset_UPDATE_RegionAdd.csv'

# Run cleanup, preprocessing, training and evaluation end to end.
main(filepath=file_path)

             Region  day  month  year  Temperature   RH   Ws  Rain   FFMC  \
0            Bejaia    1      6  2012           29   57   18    0.0  65.7   
1            Bejaia    2      6  2012           29   61   13    1.3  64.4   
2            Bejaia    3      6  2012           26   82   22   13.1  47.1   
3            Bejaia    4      6  2012           25   89   13    2.5  28.6   
4            Bejaia    5      6  2012           27   77   16    0.0  64.8   
..              ...  ...    ...   ...          ...  ...  ...    ...   ...   
239  Sidi-Bel Abbes   26      9  2012           30   65   14    0.0  85.4   
240  Sidi-Bel Abbes   27      9  2012           28   87   15    4.4  41.1   
241  Sidi-Bel Abbes   28      9  2012           27   87   29    0.5  45.9   
242  Sidi-Bel Abbes   29      9  2012           24   54   18    0.1  79.7   
243  Sidi-Bel Abbes   30      9  2012           24   64   15    0.2  67.3   

      DMC    DC  ISI   BUI  FWI     Classes    
0     3.4   7.6  1.3   3.4 