# **Manipulación y preparación de datos**

In [1]:
# Import the required libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.impute import SimpleImputer



%load_ext autoreload
%autoreload 2

import os
import sys

# Creating the classes

### Data Explorer

In [None]:
class DataExplorer:
    """Hold the configuration needed to explore a dataset.

    The class currently only records its constructor arguments; no
    exploration logic is implemented yet.
    """

    def __init__(self, data_path, target_column, test_size=0.2):
        """Store the exploration settings on the instance.

        Args:
            data_path: Location of the dataset to explore.
            target_column: Name of the column holding the target variable.
            test_size: Fraction of the data reserved for testing.
        """
        # Previously the arguments were silently discarded; keep them so that
        # later methods (and callers) can actually use the configuration.
        self.data_path = data_path
        self.target_column = target_column
        self.test_size = test_size


    

### Model

In [None]:
class ForestFireModel:
    """Clean, preprocess, train and evaluate a classifier on a forest-fire CSV.

    Intended flow:
        1. ``data_cleanup(output_file)`` writes a cleaned copy of the CSV.
        2. ``preprocess_data()`` splits and transforms the data read from
           ``path_file`` (which must already use the cleaned column names).
        3. Assign an estimator to ``pipeline``.
        4. ``train_model(param_grid)`` / ``cross_validate_model()``.
        5. ``evaluate_model()`` / ``predict(new_data)``.

    Attributes set by the methods:
        X_train, X_test: Transformed feature matrices.
        y_train, y_test: Target labels for each split.
        preprocessor: The ``ColumnTransformer`` fitted on the training split.
        pipeline: Estimator to tune; must be assigned by the caller.
        best_model: Best estimator found by ``train_model``.
        target_encoder: Optional label encoder used by ``predict`` to map
            predictions back to their original labels.
    """

    # Column groups shared by the clean-up and preprocessing steps. Stored as
    # tuples so the class-level constants cannot be mutated by accident.
    CATEGORICAL_COLUMNS = ('Classes', 'Region')
    INTEGER_COLUMNS = ('day', 'month', 'year')
    CONTINUOUS_COLUMNS = (
        'Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI',
    )

    def __init__(self, path_file):
        """Create a model bound to the CSV file at ``path_file``.

        Args:
            path_file: Path of the CSV file read by ``load_data``.
        """
        # The path must be kept: it was previously reset to None right after
        # being assigned, which made every later ``load_data`` call fail.
        self.path_file = path_file
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

        self.pipeline = None
        self.preprocessor = None
        self.best_model = None
        self.target_encoder = None

    def _require(self, attribute, hint):
        """Return ``self.<attribute>``, raising ValueError while it is None."""
        value = getattr(self, attribute)
        if value is None:
            raise ValueError(f"'{attribute}' is not set; {hint}")
        return value

    def load_data(self):
        """Read the comma-separated file at ``path_file`` into a DataFrame.

        Returns:
            The loaded ``pandas.DataFrame`` (header inferred from the file).
        """
        data = pd.read_csv(self.path_file, sep=',', header='infer')
        return data

    def data_cleanup(self, output_file):
        """Normalise column names and dtypes, then write the result as CSV.

        Rows containing any missing value are dropped.

        Args:
            output_file: Destination path for the cleaned CSV (no index column).
        """
        data = self.load_data()
        # Some headers in the raw file carry stray spaces.
        data = data.rename(
            columns={' RH': 'RH', ' Ws': 'Ws', 'Classes  ': 'Classes', 'Rain ': 'Rain'}
        )
        data['Classes'] = data['Classes'].str.strip()

        # Cast every column group to its proper dtype in a single pass.
        dtypes = {
            **dict.fromkeys(self.CATEGORICAL_COLUMNS, 'category'),
            **dict.fromkeys(self.INTEGER_COLUMNS, 'int64'),
            **dict.fromkeys(self.CONTINUOUS_COLUMNS, 'float64'),
        }
        cleaned = data.dropna().astype(dtypes)
        cleaned.to_csv(output_file, index=False)

    def preprocess_data(self):
        """Split the data and fit the feature transformations.

        Reads the file at ``path_file`` (expected to already have the cleaned
        column names, e.g. the output of ``data_cleanup``), holds out 20% of
        the rows as a test set (``random_state=42``) and stores the
        transformed splits in ``X_train`` / ``X_test`` and the labels in
        ``y_train`` / ``y_test``. The fitted transformer is kept in
        ``preprocessor`` so new data can be transformed the same way.
        """
        data = self.load_data()
        y = data['Classes']
        X = data.drop(columns=['Classes'])
        X = X.select_dtypes(include=['float64', 'int64', 'category'])
        # Drop the date columns, which add no predictive value.
        X = X.drop(columns=list(self.INTEGER_COLUMNS))

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Continuous features: min-max scaling, standardisation, then PCA
        # keeping enough components to explain 95% of the variance.
        numeric_pipeline = Pipeline(steps=[
            ('minmax', MinMaxScaler()),
            ('scaler', StandardScaler()),
            ('PCA', PCA(n_components=0.95)),
        ])
        # Categorical features: one-hot encoding.
        categorical_pipeline = Pipeline(steps=[
            ('OneHotEncoder', OneHotEncoder()),
        ])

        transformer = ColumnTransformer(transformers=[
            ('numericas_continuas', numeric_pipeline, list(self.CONTINUOUS_COLUMNS)),
            ('categoricas', categorical_pipeline, ['Region']),
        ])

        # Fit on the training split only, to avoid leaking test information.
        self.X_train = transformer.fit_transform(self.X_train)
        self.X_test = transformer.transform(self.X_test)
        # Keep the fitted transformer; it used to be discarded, leaving no way
        # to transform unseen data consistently with the training data.
        self.preprocessor = transformer

    def train_model(self, param_grid):
        """
        Train the model using GridSearchCV for hyperparameter tuning.

        Args:
            param_grid: Dictionary with parameters names as keys and lists of parameter settings to try as values.

        Raises:
            ValueError: If ``pipeline`` has not been assigned.
        """
        estimator = self._require('pipeline', 'assign an estimator before training.')
        grid_search = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)
        self.best_model = grid_search.best_estimator_

    def predict(self, new_data):
        """
        Prediction of new data, returns the data in its categorical labels.

        Args:
            new_data: features to predict, in the same (already transformed)
                form the model was trained on; use ``preprocessor.transform``
                on raw feature frames first.

        Returns:
            The predicted labels. They are mapped through
            ``target_encoder.inverse_transform`` only when an encoder is set.

        Raises:
            ValueError: If no model has been trained yet.
        """
        model = self._require('best_model', 'call train_model() first.')
        predictions = model.predict(new_data)
        # The target is never encoded by this class, so the encoder is
        # optional; calling it unconditionally failed on None.
        if self.target_encoder is not None:
            predictions = self.target_encoder.inverse_transform(predictions)
        return predictions

    def evaluate_model(self):
        """
        Model evaluation on test set.

        Shows the confusion matrix as a heatmap and prints the classification
        report and the accuracy.

        Raises:
            ValueError: If no model has been trained yet.
        """
        model = self._require('best_model', 'call train_model() first.')
        y_pred = model.predict(self.X_test)
        cm = metrics.confusion_matrix(self.y_test, y_pred)
        print("Confusion Matrix:")
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.show()

        report = metrics.classification_report(self.y_test, y_pred)
        print("Classification Report:")
        print(report)

        print('accuracy-test', metrics.accuracy_score(self.y_test, y_pred))

    def cross_validate_model(self):
        """Print the mean 5-fold cross-validation score on the training split.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``pipeline`` has not been assigned.
        """
        estimator = self._require('pipeline', 'assign an estimator before cross-validating.')
        scores = cross_val_score(estimator, self.X_train, self.y_train, cv=5)
        print("Average Accuracy with CV:", np.mean(scores))
        return self





### Executing the code

# NOTE(review): ForestFireModel.__init__ requires a ``path_file`` argument, so
# this call raises TypeError as written -- pass the dataset CSV path here.
m_lr = ForestFireModel()

