In [154]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict, List
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

In [155]:
def read_dataset()->pd.DataFrame:
    """Leer el dataset desde el archivo generado por el ETL. 

    Returns
    -------
    pd.DataFrame
        El dataset.
    """
    # TODO
    df = pd.read_csv("datalab2.csv", header=None, na_values=['?'])
    df = df.rename(columns={0: 'SEQUENCE_NAME'})
    df[['SEQUENCE_NAME','MCG', 'GVH', 'LIP', 'CHG', 'AAC', 'ALM1', 'ALM2', 'CLASS']]=df.SEQUENCE_NAME.str.split(pat=";",expand=True)
   
    return df


def preprocessing(df: pd.DataFrame)->pd.DataFrame:
    """Realizar el preprocesamiento: Escalar o normalizar variables. Para normalizar
    se debe confirmar empíricamente mediante inspección visual que una variable
    sigue una distribución normal.

    Parameters
    ----------
    df : pd.DataFrame
        El dataset.

    Returns
    -------
    pd.DataFrame
        El dataset.
    """
    #TODO
    #df["MCG"].hist(bins=100)=>Normalización min max
    columns = [#'SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               #'CLASS'
               ]
    X=df[columns]
    y=df["MCG"]
    min_max_scaler = MinMaxScaler()
    x_scaled= min_max_scaler.fit_transform(X)
    new_df=pd.DataFrame(x_scaled,columns=columns)
    
    return new_df,y


def remove_variables(df: pd.DataFrame)->pd.DataFrame:
    """Quitar variables que sean reduduntantes. Utilice el algoritmo PCA para encontrar
    las variables apropiadas. Usted debe definir cuál es el número de variables
    necesarias.

    Parameters
    ----------
    df : pd.DataFrame
        El dataset.

    Returns
    -------
    pd.DataFrame
        El dataset.
    """
    #TODO
    
    new_df =PCA(n_components=1).fit_transform(df)
   
    return new_df

def add_variables(df: pd.DataFrame)->pd.DataFrame:
    """Agregar variables nuevas al dataset utilizando la expansición polinomial.

    Parameters
    ----------
    df : pd.DataFrame
        El dataset.

    Returns
    -------
    pd.DataFrame
        El dataset.
    """
    #TODO
    new_df=PolynomialFeatures(1).fit_transform(df)
    print(new_df)
    return new_df

In [156]:

DF, y = preprocessing(read_dataset())
print(DF.shape)
remove_DF = remove_variables(DF)
print(remove_DF.shape)
add_DF = add_variables(DF)
print(add_DF.shape)

(336, 7)
(336, 1)
[[1.         0.5505618  0.1547619  ... 0.63636364 0.21649485 0.35353535]
 [1.         0.07865169 0.28571429 ... 0.61363636 0.32989691 0.44444444]
 [1.         0.62921348 0.28571429 ... 0.55681818 0.35051546 0.46464646]
 ...
 [1.         0.68539326 0.52380952 ... 0.5        0.37113402 0.38383838]
 [1.         0.66292135 0.53571429 ... 0.47727273 0.40206186 0.37373737]
 [1.         0.83146067 0.69047619 ... 0.35227273 0.51546392 0.52525253]]
(336, 8)


In [157]:
import pandas as pd
import numpy as np
import requests
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import Dict, List

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import PolynomialFeatures

def classify(X:pd.DataFrame, y:pd.DataFrame, class_names:List[str]):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    grid = GridSearchCV(
        estimator=LogisticRegression(max_iter=5000),
        param_grid={},
        n_jobs=-1,
        verbose=2,
        cv=KFold(
            n_splits=5, shuffle=True, random_state=10
        ),
    )
    grid.fit(X_train, y=y_train)

    y_pred = grid.best_estimator_.predict(X_test)

    print(
        classification_report(
            y_test,
            y_pred,
            target_names=class_names,
            zero_division=0,
        )
    )

In [158]:
classify(DF, y, ['SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS',
               
               
              ])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
               precision    recall  f1-score   support

SEQUENCE_NAME       0.00      0.00      0.00         1
          MCG       0.00      0.00      0.00         1
          GVH       0.00      0.00      0.00         1
          LIP       0.00      0.00      0.00         1
          CHG       0.00      0.00      0.00         1
          AAC       0.00      0.00      0.00         1
         ALM1       0.00      0.00      0.00         1
         ALM2       0.00      0.00      0.00         1
        CLASS       0.00      0.00      0.00         4
SEQUENCE_NAME       0.00      0.00      0.00         5
          MCG       0.00      0.00      0.00         3
          GVH       0.00      0.00      0.00         1
          LIP       0.03      1.00      0.06         1
          CHG       0.00      0.00      0.00         2
          AAC       0.00      0.00      0.00         2
         ALM1       0.00      0.00      0.00         1
     

In [159]:
classify(remove_DF, y, ['SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS',])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
               precision    recall  f1-score   support

SEQUENCE_NAME       0.00      0.00      0.00         1
          MCG       0.00      0.00      0.00         1
          GVH       0.00      0.00      0.00         1
          LIP       0.00      0.00      0.00         1
          CHG       0.00      0.00      0.00         1
          AAC       0.00      0.00      0.00         1
         ALM1       0.00      0.00      0.00         1
         ALM2       0.00      0.00      0.00         1
        CLASS       0.00      0.00      0.00         4
SEQUENCE_NAME       0.00      0.00      0.00         5
          MCG       0.00      0.00      0.00         3
          GVH       0.00      0.00      0.00         1
          LIP       0.03      1.00      0.07         1
          CHG       0.00      0.00      0.00         2
          AAC       0.00      0.00      0.00         2
         ALM1       0.00      0.00      0.00         1
     

In [160]:
classify(add_DF, y, ['SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS','SEQUENCE_NAME',
               'MCG', 
               'GVH', 
               'LIP', 
               'CHG',
               'AAC', 
               'ALM1', 
               'ALM2', 
               'CLASS',])

Fitting 5 folds for each of 1 candidates, totalling 5 fits
               precision    recall  f1-score   support

SEQUENCE_NAME       0.00      0.00      0.00         1
          MCG       0.00      0.00      0.00         1
          GVH       0.00      0.00      0.00         1
          LIP       0.00      0.00      0.00         1
          CHG       0.00      0.00      0.00         1
          AAC       0.00      0.00      0.00         1
         ALM1       0.00      0.00      0.00         1
         ALM2       0.00      0.00      0.00         1
        CLASS       0.00      0.00      0.00         4
SEQUENCE_NAME       0.00      0.00      0.00         5
          MCG       0.00      0.00      0.00         3
          GVH       0.00      0.00      0.00         1
          LIP       0.03      1.00      0.06         1
          CHG       0.00      0.00      0.00         2
          AAC       0.00      0.00      0.00         2
         ALM1       0.00      0.00      0.00         1
     