In [114]:
# Importaciones para manejo de datos y dataframes
import numpy as np
from numpy.random import seed
import pandas as pd

# Importaciones para manejo de archivos y llamadas al OS
import os as os
import warnings

# Importaciones para manejo de gráficos
import pylab as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import optuna


# Do not show warnings (original intent: hide warnings about older versions).
# With no category or module given, this filter silences every warning raised afterwards.
warnings.filterwarnings('ignore')

In [98]:
# Load the training split; '?', empty strings and 'NA' are parsed as missing values
df_train = pd.read_csv(
    "../data_raw/training_data.csv",
    sep=",",
    header=0,
    na_values=['?', '', 'NA'],
)
# Load the test split with the same parsing options
df_test = pd.read_csv(
    "../data_raw/test_data.csv",
    sep=",",
    header=0,
    na_values=['?', '', 'NA'],
)

In [99]:
# Categorical columns handled here: X24, X25, X30 (RATE is the target and is left untouched)
#
# X24 -> ordered factor (VLOW < LOW < MED < HIGH < VHIGH) -> OrdinalEncoder
# X25 -> binary (NO -> 0, YES -> 1)                       -> OrdinalEncoder
# X30 -> ASKVR, CLPXZ, GXZVX, KUHMP, VTKGN, XNHTQ         -> binary flag "is VTKGN"
#
# Original note: none of these columns has missing values, so they are encoded
# first and the imputation of missing values happens afterwards.
# ------------------------------------------------------------------------------------------------------

df_train_num = df_train.copy()
df_test_num = df_test.copy()

# 1. OrdinalEncoder for X24 (fitted on train, reused on test)
orden_x24 = ['VLOW', 'LOW', 'MED', 'HIGH', 'VHIGH']
ordinal_encoder_x24 = OrdinalEncoder(categories=[orden_x24], dtype=int)

df_train_num['X24'] = ordinal_encoder_x24.fit_transform(df_train_num[['X24']])
df_test_num['X24'] = ordinal_encoder_x24.transform(df_test_num[['X24']])

# 2. OrdinalEncoder for X25 (fitted on train, reused on test)
orden_x25 = ['NO', 'YES']
ordinal_encoder_x25 = OrdinalEncoder(categories=[orden_x25], dtype=int)

df_train_num['X25'] = ordinal_encoder_x25.fit_transform(df_train_num[['X25']])
df_test_num['X25'] = ordinal_encoder_x25.transform(df_test_num[['X25']])

# 3. X30 -> 1 if the category is VTKGN, 0 otherwise.
# Original rationale: the category distribution is very imbalanced, so a single
# flag is used instead of one-hot encoding.
df_train_encoded = df_train_num.copy()
df_test_encoded = df_test_num.copy()

df_train_encoded['X30'] = (df_train_num['X30'] == 'VTKGN').astype('int64')
# Fix: the test flag is derived from the TEST frame. The previous code converted
# the train column here, which overwrote test X30 with index-aligned train values.
df_test_encoded['X30'] = (df_test_num['X30'] == 'VTKGN').astype('int64')

In [77]:
def scale_data(method, train, test):
    """Scale train and test with a scaler fitted on the training data only.

    Args:
        method: One of "Minmax", "Robust" or "Standard".
        train: DataFrame used to fit the scaler and then transformed.
        test: DataFrame transformed with the scaler fitted on ``train``.

    Returns:
        Tuple ``(scaled_train, scaled_test)`` of DataFrames that keep the index
        and column labels of the corresponding input.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "Minmax":
        scaler = MinMaxScaler()
    elif method == "Robust":
        scaler = RobustScaler()
    elif method == "Standard":
        scaler = StandardScaler()
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `scaler`; fail early with a clear message instead.
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'Minmax', 'Robust' or 'Standard'"
        )

    df_scaled_train = pd.DataFrame(
        scaler.fit_transform(train.to_numpy()),
        index=train.index,
        columns=train.columns,
    )
    df_scaled_test = pd.DataFrame(
        scaler.transform(test.to_numpy()),
        index=test.index,
        columns=test.columns,
    )

    return df_scaled_train, df_scaled_test
        
        

In [22]:
def value_imputation(method, n_neigh, train, test):
    """Impute missing values in train and test with an imputer fitted on train.

    Args:
        method: "knn" for KNNImputer, or a SimpleImputer strategy
            ("mean", "median", "most_frequent").
        n_neigh: Number of neighbours for the KNN imputer; ignored otherwise.
        train: DataFrame used to fit the imputer and then transformed.
        test: DataFrame transformed with the imputer fitted on ``train``.

    Returns:
        Tuple ``(imp_train, imp_test)`` of DataFrames that keep the index and
        column labels of the corresponding input.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "knn":
        imputer = KNNImputer(n_neighbors=n_neigh)
    elif method in ("mean", "median", "most_frequent"):
        imputer = SimpleImputer(strategy=method)
    else:
        # Previously an unknown name failed later with an UnboundLocalError.
        raise ValueError(
            f"Unknown imputation method {method!r}; expected 'knn', 'mean', "
            "'median' or 'most_frequent'"
        )

    # Fit once, on the training data only. The KNN branch used to fit a second
    # imputer on the test set with a hard-coded n_neighbors=8 (ignoring n_neigh),
    # and the other branch passed the notebook-global train_RATE as an unused `y`.
    imputer.fit(train)

    imp_train = pd.DataFrame(imputer.transform(train), index=train.index, columns=train.columns)
    imp_test = pd.DataFrame(imputer.transform(test), index=test.index, columns=test.columns)

    return imp_train, imp_test


In [80]:
def feature_selection(train, test, target=None):
    """Select features with cross-validated recursive feature elimination.

    An RFECV wrapping a LogisticRegression is fitted on ``train`` and the
    selected columns are kept in both frames.

    Args:
        train: Feature DataFrame used to fit the selector.
        test: Feature DataFrame reduced to the same selected columns.
        target: Labels aligned with ``train``. When omitted, the notebook-level
            ``train_RATE`` is used, which is what this function always did.

    Returns:
        Tuple ``(res_train, res_test)`` restricted to the selected columns.
    """
    if target is None:
        # Backward-compatible fallback to the notebook global; passing the
        # labels explicitly avoids depending on cell execution order.
        target = train_RATE

    selector = RFECV(
        estimator=LogisticRegression(max_iter=1000),
        step=1,
        cv=StratifiedKFold(5),
        scoring="accuracy",
        min_features_to_select=1,  # minimum number of features to consider
        n_jobs=2,
    )
    selector.fit(train, target)

    final_columns = selector.get_feature_names_out()
    res_train = train.loc[:, final_columns]
    res_test = test.loc[:, final_columns]
    return res_train, res_test

In [24]:
def imbalanced_management(method, train_X, train_Y, random_state=None):
    """Resample the training data to mitigate class imbalance.

    Args:
        method: "SMOTE", "Tomeklinks", "SMOTETomek", or None to skip resampling.
        train_X: Training features.
        train_Y: Training labels.
        random_state: Optional seed forwarded to SMOTE / SMOTETomek so the
            resampling can be reproduced. TomekLinks is created without it.

    Returns:
        Tuple ``(x_over, y_over)`` with the resampled features and labels, or
        the inputs unchanged when ``method`` is None.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method is None:
        # The caller treats None as "no resampling"; honour that here too.
        return train_X, train_Y

    if method == "SMOTE":
        sm = SMOTE(random_state=random_state)
    elif method == "Tomeklinks":
        sm = TomekLinks()
    elif method == "SMOTETomek":
        sm = SMOTETomek(random_state=random_state)
    else:
        # Previously an unknown name failed later with an UnboundLocalError on `sm`.
        raise ValueError(
            f"Unknown resampling method {method!r}; expected 'SMOTE', "
            "'Tomeklinks', 'SMOTETomek' or None"
        )

    x_over, y_over = sm.fit_resample(train_X, train_Y)
    return x_over, y_over

In [133]:
def objective(trial):
    """Optuna objective: build one preprocessing + LogisticRegression pipeline and score it.

    The trial picks the scaler, the imputer, whether to run feature selection,
    the resampling method and the LogisticRegression hyperparameters. The model
    is fitted on a resampled training split and evaluated on a held-out split.

    Args:
        trial: The optuna trial used to sample the configuration.

    Returns:
        Weighted F1 score of the fitted model on the validation split.
    """
    # The raw CSVs are re-read on every trial, so trials never share mutated frames.
    df_train_encoded = pd.read_csv("../data_raw/training_data.csv", sep=",", header=0, na_values=['?', '', 'NA'])
    df_test_encoded = pd.read_csv("../data_raw/test_data.csv", sep=",", header=0, na_values=['?', '', 'NA'])

    # X24: ordered factor, encoder fitted on train and reused on test
    orden_x24 = ['VLOW', 'LOW', 'MED', 'HIGH', 'VHIGH']
    ordinal_encoder_x24 = OrdinalEncoder(categories=[orden_x24], dtype=int)
    df_train_encoded['X24'] = ordinal_encoder_x24.fit_transform(df_train_encoded[['X24']])
    df_test_encoded['X24'] = ordinal_encoder_x24.transform(df_test_encoded[['X24']])

    # X25: binary NO/YES -> 0/1
    orden_x25 = ['NO', 'YES']
    ordinal_encoder_x25 = OrdinalEncoder(categories=[orden_x25], dtype=int)
    df_train_encoded['X25'] = ordinal_encoder_x25.fit_transform(df_train_encoded[['X25']])
    df_test_encoded['X25'] = ordinal_encoder_x25.transform(df_test_encoded[['X25']])

    # X30: 1 if VTKGN, 0 otherwise.
    # Fix: the test flag now comes from the test frame; it used to be overwritten
    # with the (index-aligned) train column.
    df_train_encoded['X30'] = (df_train_encoded['X30'] == 'VTKGN').astype('int64')
    df_test_encoded['X30'] = (df_test_encoded['X30'] == 'VTKGN').astype('int64')

    # Separate the target and drop the identifier columns
    train_RATE = df_train_encoded['RATE'].copy()
    train = df_train_encoded.drop(['ID', 'RATE'], axis=1)
    test = df_test_encoded.drop('ID', axis=1)

    # --- Preprocessing choices -------------------------------------------------
    scaling = trial.suggest_categorical("scaling", ['Minmax', 'Robust', 'Standard'])
    train, test = scale_data(method=scaling, train=train, test=test)

    imputation = trial.suggest_categorical("imputation", ["knn", "mean", "median", "most_frequent"])
    if imputation == "knn":
        n_neigh = trial.suggest_int("neighbors", 3, 15, step=1, log=False)
    else:
        n_neigh = None
    train, test = value_imputation(method=imputation, n_neigh=n_neigh, train=train, test=test)

    fselct = trial.suggest_categorical("fselect", [True, False])
    if fselct:
        # NOTE(review): called without labels, feature_selection uses the
        # notebook-level train_RATE rather than the local one defined above, so
        # that global must exist before the study runs.
        train, test = feature_selection(train=train, test=test)

    # Scaling, imputation and feature selection above are fitted on the full
    # training frame; the validation split is carved out only afterwards.
    x_train, x_val, y_train, y_val = train_test_split(train, train_RATE, shuffle=True)

    # Every sampled option is a real resampler, so resampling is always applied
    imbalanced = trial.suggest_categorical("imbalanced", ["SMOTE", "Tomeklinks", "SMOTETomek"])
    x_over, y_over = imbalanced_management(method=imbalanced, train_X=x_train, train_Y=y_train)

    # --- Model hyperparameters -------------------------------------------------
    solver = trial.suggest_categorical("solver", ['lbfgs', 'newton-cg', 'sag', 'saga'])
    # 'liblinear' is not among the sampled solvers, so multi_class is always sampled
    multiclass = trial.suggest_categorical("multi_class", ['ovr', 'multinomial'])

    # Optuna categorical choices use the string "None"; map it to the real None
    multiclass_weights = trial.suggest_categorical("class_weight", ['balanced', "None"])
    if multiclass_weights == "None":
        multiclass_weights = None

    fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

    if solver == "saga":
        # saga is always paired with elastic-net here, so it needs an l1_ratio
        penalty = "elasticnet"
        l1_ratio = trial.suggest_float("l1_ratio", 0.01, 0.99, step=0.01)
    else:
        penalty = trial.suggest_categorical("penalty", ['l2', None])
        l1_ratio = None

    # C is only tuned when some regularisation is applied
    if penalty is not None:
        C_reg = trial.suggest_float("C", 0.01, 2, step=None, log=True)
    else:
        C_reg = 1.0

    lr = LogisticRegression(
        penalty=penalty,
        solver=solver,
        multi_class=multiclass,
        class_weight=multiclass_weights,
        fit_intercept=fit_intercept,
        C=C_reg,
        l1_ratio=l1_ratio,
        max_iter=100000,
    )
    lr.fit(x_over, y_over)
    y_pred = lr.predict(x_val)

    return f1_score(y_pred=y_pred, y_true=y_val, average='weighted')


In [134]:

# Create an in-memory study that maximises the value returned by `objective`
study = optuna.create_study(direction="maximize")
# Evaluate 500 sampled configurations
study.optimize(func=objective, n_trials=500)



[I 2024-01-11 20:52:28,384] A new study created in memory with name: no-name-25d2ac37-8f3e-48d0-a6a9-b3732b650eec
[I 2024-01-11 20:52:31,314] Trial 0 finished with value: 0.5622887592228428 and parameters: {'scaling': 'Minmax', 'imputation': 'mean', 'fselect': False, 'imbalanced': 'SMOTE', 'solver': 'lbfgs', 'multi_class': 'ovr', 'class_weight': 'None', 'fit_intercept': False, 'penalty': None}. Best is trial 0 with value: 0.5622887592228428.
[I 2024-01-11 20:52:36,617] Trial 1 finished with value: 0.6213997472125059 and parameters: {'scaling': 'Minmax', 'imputation': 'mean', 'fselect': False, 'imbalanced': 'SMOTE', 'solver': 'sag', 'multi_class': 'multinomial', 'class_weight': 'None', 'fit_intercept': False, 'penalty': None}. Best is trial 1 with value: 0.6213997472125059.
[I 2024-01-11 20:52:36,695] Trial 2 finished with value: 0.550012504892885 and parameters: {'scaling': 'Standard', 'imputation': 'mean', 'fselect': False, 'imbalanced': 'SMOTE', 'solver': 'newton-cg', 'multi_class': 

In [135]:
# Show the parameters of the best trial found by the study
study.best_params

{'scaling': 'Standard',
 'imputation': 'mean',
 'fselect': True,
 'imbalanced': 'Tomeklinks',
 'solver': 'lbfgs',
 'multi_class': 'multinomial',
 'class_weight': 'None',
 'fit_intercept': True,
 'penalty': None}

In [138]:
# Reload the raw training data ('?', empty strings and 'NA' are parsed as missing)
df_train = pd.read_csv("../data_raw/training_data.csv", sep=",", header=0, na_values=['?', '', 'NA'])
# Reload the raw test data with the same parsing options
df_test = pd.read_csv("../data_raw/test_data.csv", sep=",", header=0, na_values=['?', '', 'NA'])

df_train_num = df_train.copy()
df_test_num = df_test.copy()

# 1. OrdinalEncoder for X24 (fitted on train, reused on test)
orden_x24 = ['VLOW', 'LOW', 'MED', 'HIGH', 'VHIGH']
ordinal_encoder_x24 = OrdinalEncoder(categories=[orden_x24], dtype=int)

df_train_num['X24'] = ordinal_encoder_x24.fit_transform(df_train_num[['X24']])
df_test_num['X24'] = ordinal_encoder_x24.transform(df_test_num[['X24']])

# 2. OrdinalEncoder for X25 (fitted on train, reused on test)
orden_x25 = ['NO', 'YES']
ordinal_encoder_x25 = OrdinalEncoder(categories=[orden_x25], dtype=int)

df_train_num['X25'] = ordinal_encoder_x25.fit_transform(df_train_num[['X25']])
df_test_num['X25'] = ordinal_encoder_x25.transform(df_test_num[['X25']])

# 3. X30 -> 1 if the category is VTKGN, 0 otherwise.
# Original rationale: the category distribution is very imbalanced, so a single
# flag is used instead of one-hot encoding.
df_train_encoded = df_train_num.copy()
df_test_encoded = df_test_num.copy()

df_train_encoded['X30'] = (df_train_num['X30'] == 'VTKGN').astype('int64')
# Fix: the test flag is derived from the TEST frame. The previous code converted
# the train column here, so the submission was predicted from train X30 values.
df_test_encoded['X30'] = (df_test_num['X30'] == 'VTKGN').astype('int64')



In [139]:
    # Keep the identifiers and the target aside before they are removed from the features
    train_ID = df_train_encoded['ID'].copy()
    train_RATE = df_train_encoded['RATE'].copy()
    test_ID = df_test_encoded['ID'].copy()

    # Feature frames: everything except ID (and RATE for train)
    df_train_encoded = df_train_encoded.drop(columns=['ID', 'RATE'])
    df_test_encoded = df_test_encoded.drop(columns=['ID'])

In [140]:
# Re-apply the preprocessing steps with fixed settings:
# standard scaling, mean imputation and feature selection
train, test = scale_data("Standard", df_train_encoded, df_test_encoded)
train, test = value_imputation("mean", None, train, test)
train, test = feature_selection(train, test)

# Hold out a validation split, then clean the training part with Tomek links
x_train, x_val, y_train, y_val = train_test_split(train, train_RATE, shuffle=True)
x_over, y_over = imbalanced_management("Tomeklinks", x_train, y_train)


    

In [141]:
# Final model with fixed hyperparameters (unpenalised multinomial lbfgs).
# max_iter matches the value used inside `objective`; with the default of 100
# iterations the solver may stop before converging, and that warning is silenced
# by the global warnings filter.
lr = LogisticRegression(
    penalty=None,
    solver="lbfgs",
    multi_class="multinomial",
    class_weight=None,
    fit_intercept=True,
    max_iter=100000,
)
lr.fit(x_over, y_over)

In [142]:
# Predict the validation split and print how many samples fall in each predicted class
# (counts follow the sorted order of the distinct predicted labels)
pred=lr.predict(x_val)
unique,count = np.unique(pred,return_counts=True)
print(count)

[ 13 111  56  47]


In [143]:
# Same per-class counts for the true validation labels, to compare with the predicted counts
unique,count = np.unique(y_val,return_counts=True)
print(count)

[ 16 100  58  53]


In [144]:
# Predict the test set and write the submission file (one row per test ID)
pred = lr.predict(test)
results = dict(ID=test_ID, RATE=pred)
df_submission = pd.DataFrame(results)
df_submission.to_csv("SubmissionOPTUNA5.csv", index=False)