In [1]:
#Basandonos en el siguiente post: https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/

# Feature Selection
### A partir de distintos algoritmos buscaremos seleccionar los features que mas informacion aportan a la prediccion del target

## Preprocesamiento

In [2]:
import Utilidades as ut
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import mutual_info_classif


GRAFICAR = False

#Cargamos el dataset
df = pd.read_pickle("Archivos/Feature_Selection_entrenamiento.pkl")
df_test = pd.read_pickle("Archivos/Feature_Selection_validacion.pkl")

if ('Opportunity_ID' in df_test):
    df_test = df_test.drop(columns=['Opportunity_ID'])

#Dividimos el dataframe en features-labels
df_x, df_y = ut.split_labels(df)
df_x_test, df_y_test = ut.split_labels(df_test)

##Procesamos los dataframes
df_x, df_x_test = ut.conversion_fechas(df_x, df_x_test)
df_x, df_x_test = ut.codificar_categoricas(df_x, df_y, df_x_test, modo='catboost')
df_x, df_x_test = ut.normalizacion_numericas(df_x, df_x_test, modo='normalizacion')

##Convertimos el dataframe a un array de numpy
x = ut.df_a_vector(df_x)
y = ut.df_a_vector(df_y).flatten()

  elif pd.api.types.is_categorical(cols):


## Information Gain

In [3]:
importances = mutual_info_classif(x, y)
feat_importances = pd.Series(importances, df_x.columns[0:len(df.columns)])

In [4]:
if GRAFICAR:
    feat_importances.plot(kind='barh', color='teal', figsize=(24, 24))

In [5]:
features = feat_importances.to_frame()
features.columns = ['Score']
features = features.sort_values('Score', ascending=False).reset_index()
features['idx'] = features.index
features['Score'] = (features.shape[0] - features.idx) / 10
features = features.drop(columns=['idx']).set_index('index')
features

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Opportunity_Name,548.3
Opportunity_Name_Planned_Opportunity_Duration_median,548.2
Opportunity_Name_Planned_Opportunity_Duration_min,548.1
Opportunity_Name_Planned_Opportunity_Duration_mean,548.0
Opportunity_Name_Planned_Opportunity_Duration_max,547.9
...,...
Account_Owner_Total_Products_Region_Week_Change_min,0.5
Delivery_Terms_ASP_by_Region_std_max,0.4
Account_Owner_Total_Products_Region_Last_Week_min,0.3
"ASP_Currency_Pricing, Delivery_Terms_Quote_Appr_min",0.2


## Fisher's Score

In [6]:
from skfeature.function.similarity_based import fisher_score

ranks = fisher_score.fisher_score(x, y)
feat_importances = pd.Series(ranks, df_x.columns[0:len(df.columns)])
if GRAFICAR:
    feat_importances.plot(kind='barh', color='teal', figsize=(24, 24))

In [7]:
actual = feat_importances.to_frame()
actual.columns = ['Score']
actual = actual.sort_values('Score', ascending=False).reset_index()
actual['idx'] = actual.index
actual['Score'] = (actual.shape[0] - actual.idx) / 10
actual = actual.drop(columns=['idx']).set_index('index')

features['Score'] = features['Score'] + actual['Score']
features

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Opportunity_Name,1096.6
Opportunity_Name_Planned_Opportunity_Duration_median,1096.2
Opportunity_Name_Planned_Opportunity_Duration_min,1095.9
Opportunity_Name_Planned_Opportunity_Duration_mean,1095.9
Opportunity_Name_Planned_Opportunity_Duration_max,1095.4
...,...
Account_Owner_Total_Products_Region_Week_Change_min,202.8
Delivery_Terms_ASP_by_Region_std_max,55.0
Account_Owner_Total_Products_Region_Last_Week_min,160.7
"ASP_Currency_Pricing, Delivery_Terms_Quote_Appr_min",118.5


## Correlation Coefficient

In [8]:
import seaborn as sns

#Damos un punto extra por cada threshold

thresholds = [0.1, 0.2, 0.3, 0.4]

combined = df_x.copy()
combined['Stage'] = df_y['Stage']

cor = combined.corr()
#plt.figure(figsize = (24, 24))
#sns.heatmap(cor, annot=True)
cor = cor.drop(index=['Stage'])[['Stage']]

for t in thresholds:
    index = cor[np.abs(cor.Stage) > t].index
    features.loc[index, ['Score']] = features.loc[index, ['Score']] + 1

In [9]:
#col = cor.select_dtypes(include=['category'])
cor.sort_values('Stage', ascending=False).head(20)

Unnamed: 0,Stage
Opportunity_Name,0.625846
Product_Name,0.49257
Account_Name,0.491055
Last_Modified_By,0.423758
Opportunity_Owner,0.421317
Bureaucratic_Code,0.420251
Product_Family,0.40496
"Bureaucratic_Code_Pricing, Delivery_Terms_Approved_median",0.402704
Bureaucratic_Code_Bureaucratic_Code_0_Approved_mean,0.401339
"Bureaucratic_Code_Pricing, Delivery_Terms_Approved_mean",0.398731


## Variance Threshold

In [10]:
from sklearn.feature_selection import VarianceThreshold

#Con threshold elegimos el valor de la varianza a partir de la cual queremos marcar a las columnas como "varianza cero"
v_t = VarianceThreshold(threshold=0.1)
#Procesamos con el dataset de train
v_t.fit(x)
#Obtenemos una serie de pandas donde cada fila indica si el feature tiene (True) o no tiene (False) varianza superior al threshold
result = pd.Series(v_t.get_support())
result.index = features['Score'].index
features['Score'] = features['Score'] + result

### Top 60 sin analizar correlacion entre features

['Opportunity_Name',
 'Opportunity_Name_Planned_Opportunity_Duration_median',
 'Opportunity_Name_Planned_Opportunity_Duration_min',
 'Opportunity_Name_Planned_Opportunity_Duration_mean',
 'Planned_Opportunity_Duration',
 'Opportunity_Name_Planned_Opportunity_Duration_max',
 'Account_Name',
 'Opportunity_Name_Planned_Time_Until_Deliver_median',
 'Product_Name_Planned_Opportunity_Duration_mean',
 'Opportunity_Name_Planned_Time_Until_Deliver_mean',
 'Planned_Time_Until_Deliver',
 'Opportunity_Name_Planned_Time_Until_Deliver_min',
 'Product_Name_Planned_Opportunity_Duration_median',
 'Product_Name_Planned_Time_Until_Deliver_mean',
 'Opportunity_Name_Planned_Time_Until_Deliver_max',
 'Product_Name',
 'Product_Name_Planned_Time_Until_Deliver_median',
 'Account_Name_Planned_Opportunity_Duration_median',
 'Account_Name_Planned_Opportunity_Duration_mean',
 'Account_Name_Planned_Time_Until_Deliver_median',
 'Opportunity_Name_Total_Amount(USD)_max',
 'Account_Name_Planned_Time_Until_Deliver_mean',
 'Account_Name_Product_Amount_Deviation_of_Product_Family_rate_mean',
 'Product_Name_Planned_Time_Until_Deliver_std',
 'Product_Name_Opportunity_Duration_by_Account_Type_mean',
 'Bureaucratic_Code',
 'Product_Name_Planned_Opportunity_Duration_std',
 'Last_Modified_By',
 'Opportunity_Name_Total_Amount(USD)_mean',
 'Product_Name_Product_Amount_Deviation_of_Product_Family_rate_mean',
 'Product_Name_Planned_Deliver_Duration_mean',
 'Opportunity_Name_Total_Amount(USD)_median',
 'Opportunity_Owner',
 'Account_Name_Product_Amount_Deviation_of_Product_Family_rate_median',
 'Product_Name_Total_Amount(USD)_mean',
 'Opportunity_Name_Product_Amount_Deviation_of_Product_Family_rate_min',
 'Bureaucratic_Code_Opportunity_ID_std',
 'Product_Name_Bureaucratic_Code_0_Approval_mean',
 'Product_Family',
 'Opportunity_Name_Total_Amount(USD)_min',
 'Account_Name_Planned_Deliver_Duration_mean',
 'Product_Name_Product_Amount_Deviation_of_Product_Family_rate_median',
 'Bureaucratic_Code_Opportunity_TRF_Region_avg_Ratio_std',
 'Total_Amount(USD)',
 'Product_Name_Total_Amount(USD)_median',
 'Bureaucratic_Code_Total_Products_Region_Last_Quarter_std',
 'Bureaucratic_Code_Pricing, Delivery_Terms_Approved_mean',
 'Opportunity_Name_Product_Amount_Deviation_of_Product_Family_rate_mean',
 'Product_Name_Total_Amount(USD)_max',
 'Product_Name_Planned_Deliver_Duration_std',
 'Bureaucratic_Code_TRF_std',
 'Account_Name_Total_Amount(USD)_min',
 'Product_Name_Total_Amount_median',
 'Bureaucratic_Code_ASP_min',
 'Opportunity_Name_Product_Amount_Deviation_of_Product_Family_rate_median',
 'Bureaucratic_Code_Product_Amount_Deviation_of_Product_Family_rate_max',
 'Bureaucratic_Code_Opportunity_Total_Amount_Region_avg_Ratio_max',
 'Account_Name_Total_Amount(USD)_mean',
 'Bureaucratic_Code_Opportunity_Size_max',
 'Bureaucratic_Code_Total_Products_Region_Week_Change_mean']

## Matriz de Correlación

In [11]:
#reducido = df[['Opportunity_Name', 'Planned_Opportunity_Duration', 'Account_Name', 'Product_Name', 'Bureaucratic_Code', 'Last_Modified_By', 'Opportunity_Owner']].copy()
#corr_mat = reducido.corr().abs().replace(to_replace=np.nan, value=0)

corr_mat = df.corr().abs().replace(to_replace=np.nan, value=0)

limite = 0.5

n = corr_mat.shape[0]

for i in range(1, n+1):
    corr_mat.iloc[i:n, 0:i] = 0

col_list = []

for ind in corr_mat:
    col_list.append(corr_mat[ind].sort_values(ascending=False))
    
res = []
    
for l in col_list:
    if l[1] >= limite:
        res.append((l.index[0], l.index[1], l[1]))
    
from operator import itemgetter

res.sort(reverse=True, key=itemgetter(2))

#for r in res:
#    print(r[0], r[1], r[2], sep= ' ; ')

In [12]:
import networkx as nx

grafo = nx.Graph()

for r in res:
    grafo.add_node(r[0])
    grafo.add_node(r[1])
    grafo.add_edge(r[0], r[1], weight=1)

result = nx.coloring.greedy_color(grafo, strategy="largest_first")

contador_colores = dict()
grupos = dict()

for nodo, color in result.items():
    if color in contador_colores:
        contador_colores[color] += 1
    else:
        contador_colores[color] = 1
    if color in grupos:
        grupos[color].append(nodo)
    else:
        grupos[color] = [nodo]
        

k = len(list(contador_colores.keys()))

print(f"Se utlizaron {k} colores")
grupos

Se utlizaron 3 colores


{0: ['ASP_by_Region_mean',
  'Total_Amount_by_Billing_Country_mean',
  'Opportunity_Total_Amount_Region_std',
  'ASP_Currency_Pricing, Delivery_Terms_Approved_median',
  'Product_Family_ASP_median',
  'Region_Opportunity_ID_std',
  'ASP_Currency_TRF_median',
  'Opportunity_Duration_by_Account_Type',
  'Total_Amount_by_Product_Family_std',
  'Opportunity_Duration_by_Product_Family_mean',
  'ASP_Currency_Pricing, Delivery_Terms_Quote_Appr_mean',
  'ASP_Currency_Bureaucratic_Code_0_Approval_mean',
  'ASP_Currency_TRF_std',
  'ASP_Currency_Total_Amount_mean',
  'Account_Type_Bureaucratic_Code_0_Approval_median',
  'Bureaucratic_Code_Opportunity_ID_max',
  'Delivery_Quarter_ASP_median',
  'Last_Modified_By_ASP_(converted)_std',
  'Account_Type_Total_Amount_max',
  'Bureaucratic_Code_Delivery_Year_max',
  'Opportunity_Type_ASP_mean',
  'Opportunity_Type_Bureaucratic_Code_0_Approval_mean',
  'Opportunity_Type_ASP_max',
  'Product_Family_Total_Amount(USD)_max',
  'Billing_Country_Total_Amount_

In [13]:
cont = 0
grupo = grupos[0]
for grupo in grupos.values():
    for r in res:
        a = r[0]
        b = r[1]
        if ((a in grupo) and (b in grupo)): cont += 1

print(f"Este contador deberia dar cero: {cont}")

Este contador deberia dar cero: 0


# Selección de los Mejores

In [14]:
# Ordenamos los features de cada grupo segun su score y elegimos la cantidad de features que deseamos

num_features = 50
grupos_reducido = dict()

for num_grupo, grupo in grupos.items():
    grupos[num_grupo] = sorted(grupo, reverse=True, key=lambda x: features.loc[x].Score)
    if (len(grupos[num_grupo]) < num_features):
        print(f"CUIDADO: El grupo {num_grupo} tiene asignado menos de {num_features} features")
    grupos_reducido[num_grupo] = grupos[num_grupo][:num_features]

In [15]:
mejor_grupo = -1
mejor_score = -1

for num_grupo, grupo in grupos_reducido.items():
    score = 0
    for col in grupo:
        score += features.loc[col].Score
    print(f"Grupo: {num_grupo} -> Score: {score}")
    if score > mejor_score:
        mejor_score = score
        mejor_grupo = num_grupo

print(f"Mejor_grupo: {mejor_grupo}")

Grupo: 0 -> Score: 52847.499999999985
Grupo: 1 -> Score: 53234.49999999999
Grupo: 2 -> Score: 45964.799999999996
Mejor_grupo: 1


In [16]:
features_filtrados = features.loc[[a for a in grupos_reducido[mejor_grupo]]]
suma = features_filtrados.Score.sum()
print(f"La suma total de puntos del dataframe filtrado da {suma}")
features_filtrados.sort_values('Score', ascending=False)

La suma total de puntos del dataframe filtrado da 53234.5


Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Opportunity_Name_Planned_Opportunity_Duration_median,1101.2
Opportunity_Name_Planned_Opportunity_Duration_max,1100.4
Product_Name_Planned_Opportunity_Duration_median,1098.3
Opportunity_Name_Planned_Time_Until_Deliver_median,1098.2
Product_Name_Planned_Time_Until_Deliver_mean,1097.1
Planned_Time_Until_Deliver,1097.1
Opportunity_Name_Planned_Time_Until_Deliver_min,1097.0
Opportunity_Name_Planned_Time_Until_Deliver_max,1096.2
Account_Name_Planned_Opportunity_Duration_mean,1090.2
Account_Name_Planned_Time_Until_Deliver_median,1087.7


In [17]:
print(f"Los {num_features} mejores features del mejor grupo son:\n")

grupos_reducido[mejor_grupo]

Los 50 mejores features del mejor grupo son:



['Opportunity_Name_Planned_Opportunity_Duration_median',
 'Opportunity_Name_Planned_Opportunity_Duration_max',
 'Product_Name_Planned_Opportunity_Duration_median',
 'Opportunity_Name_Planned_Time_Until_Deliver_median',
 'Product_Name_Planned_Time_Until_Deliver_mean',
 'Planned_Time_Until_Deliver',
 'Opportunity_Name_Planned_Time_Until_Deliver_min',
 'Opportunity_Name_Planned_Time_Until_Deliver_max',
 'Account_Name_Planned_Opportunity_Duration_mean',
 'Account_Name_Planned_Time_Until_Deliver_median',
 'Opportunity_Name_Total_Amount(USD)_max',
 'Product_Name_Opportunity_Duration_by_Account_Type_mean',
 'Account_Name_Product_Amount_Deviation_of_Product_Family_rate_mean',
 'Product_Name_Planned_Opportunity_Duration_std',
 'Product_Name_Planned_Deliver_Duration_mean',
 'Opportunity_Name_Total_Amount(USD)_median',
 'Opportunity_Name_Product_Amount_Deviation_of_Product_Family_rate_min',
 'Product_Name_Bureaucratic_Code_0_Approval_mean',
 'Total_Amount(USD)',
 'Opportunity_Name_Total_Amount(US

In [18]:
#df[[col for col in df.columns if 'Delivery_Terms_Quote_Appr_std' in col]].columns
df['Bureaucratic_Code_Pricing, Delivery_Terms_Quote_Appr_std']

0        0.435723
1        0.707107
2        0.000000
3        0.000000
4        0.000000
           ...   
12135    0.435723
12136    0.000000
12137    0.000000
12138    0.435723
12139    0.435723
Name: Bureaucratic_Code_Pricing, Delivery_Terms_Quote_Appr_std, Length: 12140, dtype: float64

In [19]:
#Imprimimos el top n para los k grupos

for num_grupo, grupo in grupos_reducido.items():
    print(f"Mejores {num_features} features del grupo {num_grupo}:\n")
    print(grupo)
    print("\n")

Mejores 50 features del grupo 0:

['Opportunity_Name_Planned_Opportunity_Duration_mean', 'Opportunity_Name_Planned_Opportunity_Duration_min', 'Opportunity_Name_Planned_Time_Until_Deliver_mean', 'Product_Name_Planned_Opportunity_Duration_mean', 'Product_Name_Planned_Time_Until_Deliver_median', 'Account_Name_Planned_Opportunity_Duration_median', 'Account_Name_Planned_Time_Until_Deliver_mean', 'Product_Name_Planned_Time_Until_Deliver_std', 'Opportunity_Name_Total_Amount(USD)_mean', 'Product_Name_Product_Amount_Deviation_of_Product_Family_rate_mean', 'Account_Name_Product_Amount_Deviation_of_Product_Family_rate_median', 'Product_Name_Total_Amount(USD)_mean', 'Opportunity_Name_Product_Amount_Deviation_of_Product_Family_rate_mean', 'Last_Modified_By_Opportunity_Total_Amount_Region_std_Ratio_max', 'Bureaucratic_Code_Buro_Approved_by_Product_Family_mean', 'Bureaucratic_Code_Total_Products_Region_This_Month_std', 'Product_Name_Planned_Deliver_Duration_std', 'Bureaucratic_Code_Total_Taxable_Amou

In [20]:
grupos_reducido[2]

['Planned_Opportunity_Duration',
 'Account_Name_Planned_Deliver_Duration_mean',
 'Bureaucratic_Code_ASP_(converted)_mean',
 'Bureaucratic_Code_Opportunity_ID_std',
 'Last_Modified_By_Total_Taxable_Amount(USD)_max',
 'Product_Name_Opportunity_TRF_median',
 'Bureaucratic_Code_Pricing, Delivery_Terms_Quote_Appr_mean',
 'Account_Name_Total_Products_Region_This_Week_std',
 'Bureaucratic_Code_Opportunity_Taxable_Rate_mean',
 'Account_Name_Planned_Time_Until_Deliver_min',
 'Bureaucratic_Code_Product_Amount_Deviation_of_Product_Family_rate_mean',
 'Last_Modified_By_Planned_Time_Until_Deliver_min',
 'Bureaucratic_Code_Bureaucratic_Code_0_Approval_median',
 'Opportunity_Owner_Planned_Time_Until_Deliver_median',
 'Last_Modified_By_Opportunity_Duration_by_Billing_Country_max',
 'Account_Owner_Planned_Time_Until_Deliver_median',
 'Opportunity_Owner_TRF_std',
 'Product_Family_Total_Taxable_Amount_mean',
 'Last_Modified_By_Total_Products_Region_This_Month_max',
 'Last_Modified_By_Planned_Deliver_Dura

In [21]:
top_n = 20
features.loc[grupos[0]].sort_values('Score', ascending=False)[:top_n]

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Opportunity_Name_Planned_Opportunity_Duration_mean,1100.9
Opportunity_Name_Planned_Opportunity_Duration_min,1100.9
Opportunity_Name_Planned_Time_Until_Deliver_mean,1099.2
Product_Name_Planned_Opportunity_Duration_mean,1099.0
Product_Name_Planned_Time_Until_Deliver_median,1094.5
Account_Name_Planned_Opportunity_Duration_median,1093.1
Account_Name_Planned_Time_Until_Deliver_mean,1082.3
Product_Name_Planned_Time_Until_Deliver_std,1077.7
Opportunity_Name_Total_Amount(USD)_mean,1075.4
Product_Name_Product_Amount_Deviation_of_Product_Family_rate_mean,1074.3


In [22]:
features.loc[grupos[1]].sort_values('Score', ascending=False)[:top_n]

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Opportunity_Name_Planned_Opportunity_Duration_median,1101.2
Opportunity_Name_Planned_Opportunity_Duration_max,1100.4
Product_Name_Planned_Opportunity_Duration_median,1098.3
Opportunity_Name_Planned_Time_Until_Deliver_median,1098.2
Product_Name_Planned_Time_Until_Deliver_mean,1097.1
Planned_Time_Until_Deliver,1097.1
Opportunity_Name_Planned_Time_Until_Deliver_min,1097.0
Opportunity_Name_Planned_Time_Until_Deliver_max,1096.2
Account_Name_Planned_Opportunity_Duration_mean,1090.2
Account_Name_Planned_Time_Until_Deliver_median,1087.7


In [23]:
features.loc[grupos[2]].sort_values('Score', ascending=False)[:top_n]

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
Planned_Opportunity_Duration,1100.5
Account_Name_Planned_Deliver_Duration_mean,1060.7
Bureaucratic_Code_ASP_(converted)_mean,1052.8
Bureaucratic_Code_Opportunity_ID_std,1049.4
Last_Modified_By_Total_Taxable_Amount(USD)_max,1045.2
Product_Name_Opportunity_TRF_median,1038.3
"Bureaucratic_Code_Pricing, Delivery_Terms_Quote_Appr_mean",1037.9
Account_Name_Total_Products_Region_This_Week_std,1031.6
Bureaucratic_Code_Opportunity_Taxable_Rate_mean,1026.8
Account_Name_Planned_Time_Until_Deliver_min,1026.4
