# Binary classificator problem

In [1]:
import pandas as pd # data processing
from fuzzywuzzy import fuzz # fuzzy logic, compare strings
from collections import defaultdict # dictionary with default value
# Functions I created for this task
from functions import find_similar_columns, investigate_similar_columns, swap_columns

from sklearn.preprocessing import LabelEncoder # label encoder
from sklearn.preprocessing import OneHotEncoder # one hot encoder

## Part 1: Load the data

In [2]:
# Read and inspect train and test datasets
train = pd.read_csv('../Data/train.csv', index_col=0)
test = pd.read_csv('../Data/test.csv', index_col=0)

display(train.head())

test.head()

Unnamed: 0_level_0,FRAUDE,VALOR,HORA_AUX,Dist_max_NAL,Canal1,FECHA,COD_PAIS,CANAL,DIASEM,DIAMES,...,INGRESOS,EGRESOS,NROPAISES,Dist_Sum_INTER,Dist_Mean_INTER,Dist_Max_INTER,NROCIUDADES,Dist_Mean_NAL,Dist_HOY,Dist_sum_NAL
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9000000001,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,...,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
9000000002,1,0.0,17,594.77,ATM_INT,20150515,US,ATM_INT,5,15,...,5643700.0,500000.0,1,,,,5,289.99,4552.41,2029.9
9000000003,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,...,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
9000000004,1,0.0,13,659.13,ATM_INT,20150501,US,ATM_INT,5,1,...,1200000.0,1200000.0,1,,,,6,474.94,4552.41,5224.36
9000000005,1,0.0,0,1.0,ATM_INT,20150510,CR,ATM_INT,0,10,...,0.0,0.0,1,,,,1,,1482.35,1.0


Unnamed: 0_level_0,FRAUDE,VALOR,HORA_AUX,Dist_max_COL,Dist_max_INTER,Canal1,FECHA_FRAUDE,COD_PAIS,CANAL,FECHA,...,Dist_Mean_INTER,Dist_Max_INTER,NROCIUDADES,Dist_Sum_NAL,Dist_Mean_NAL,Dist_HOY,Dist_sum_NAL,Dist_mean_NAL,Dist_sum_INTER,Dist_mean_INTER
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98523068,,42230.09,18,1.0,1.0,POS,20150515,US,POS,20150515,...,,,1,,,4552.41,1.0,1.0,1.0,1.0
300237898,,143202.65,20,614.04,7632.97,POS,20150506,US,MCI,20150506,...,6092.69,7632.97,2,1228.07,614.04,4552.41,1228.07,614.04,24370.75,6092.69
943273308,,243591.25,2,286.84,2443.14,ATM_INT,20150517,EC,ATM_INT,20150517,...,1743.52,2443.14,7,1944.35,138.88,5083.41,1944.35,138.88,6974.09,1743.52
951645809,,238267.4,20,1.0,1.0,ATM_INT,20150508,EC,ATM_INT,20150508,...,,,1,,,904.81,1.0,1.0,1.0,1.0
963797516,,490403.58,13,1.0,1.0,ATM_INT,20150501,US,ATM_INT,20150501,...,,,1,,,4552.41,1.0,1.0,1.0,1.0


In [3]:
# Separate the target variable from the features
y_train = train['FRAUDE']
X_train = train.drop('FRAUDE', axis=1)

X_test = test.drop('FRAUDE', axis=1)

## Part 2: Analyze the data

First thing is to check what the columns in common between `X_train` and `X_test` are. Also it is useful to check the columns belonging to `X_train` that are not in `X_test` and viceversa. The two sets used should have exactly the same columns.

In [4]:
# Column names for features
train_columns = list(X_train.columns)
test_columns = list(X_test.columns)

# Extract columns in common
common_columns = list(set(train_columns) & set(test_columns))

# Extract columns in train but not in test and viceversa
train_not_test = list(set(train_columns) - set(test_columns))
test_not_train = list(set(test_columns) - set(train_columns))

In [5]:
print('Columns in common: ', common_columns)
print('Columns in train but not in test: ', train_not_test)
print('Columns in test but not in train: ', test_not_train)

Columns in common:  ['NROPAISES', 'Dist_Sum_INTER', 'SEGMENTO', 'COD_PAIS', 'NROCIUDADES', 'HORA_AUX', 'DIASEM', 'Dist_Max_INTER', 'OFICINA_VIN', 'Dist_HOY', 'Dist_Mean_NAL', 'CANAL', 'FECHA', 'SEXO', 'EDAD', 'EGRESOS', 'DIAMES', 'VALOR', 'INGRESOS', 'FECHA_VIN', 'Dist_sum_NAL', 'Canal1', 'Dist_Mean_INTER']
Columns in train but not in test:  ['Dist_max_NAL']
Columns in test but not in train:  ['FECHA_FRAUDE', 'Dist_Sum_NAL', 'Dist_sum_INTER', 'Dist_mean_NAL', 'Dist_max_INTER', 'Dist_mean_INTER', 'Dist_max_COL']


## Part 3: Investigate columns with similar names


In [6]:
# Analyze equality of columns "FECHA" and "FECHA_FRAUDE"
fecha_equals_fechafraude = X_test["FECHA"].equals(X_test["FECHA_FRAUDE"])

# If they're equal, drop "FECHA_FRAUDE" from X_test since it's redundant
if fecha_equals_fechafraude:
    X_test = X_test.drop('FECHA_FRAUDE', axis=1)

# Print if they're equal
print('Are columns "FECHA" and "FECHA_FRAUDE" equal? ', fecha_equals_fechafraude)

Are columns "FECHA" and "FECHA_FRAUDE" equal?  True


In [7]:
# Find similar column names in test
similar_columns = find_similar_columns(X_test, threshold=92)

In [8]:
# Print similar_columns
print(similar_columns.keys())
print(similar_columns)

dict_keys(['Dist_max_INTER', 'Dist_Sum_INTER', 'Dist_Mean_INTER', 'Dist_Sum_NAL', 'Dist_Mean_NAL'])
{'Dist_max_INTER': ['Dist_max_INTER', 'Dist_Max_INTER'], 'Dist_Sum_INTER': ['Dist_Sum_INTER', 'Dist_sum_INTER'], 'Dist_Mean_INTER': ['Dist_Mean_INTER', 'Dist_mean_INTER'], 'Dist_Sum_NAL': ['Dist_Sum_NAL', 'Dist_sum_NAL'], 'Dist_Mean_NAL': ['Dist_Mean_NAL', 'Dist_mean_NAL']}


In [9]:
# Iterate over the keys in similar_columns and apply the function investigate_similar_columns
for key in similar_columns.keys():
    print("\n")
    investigate_similar_columns(X_test, similar_columns[key][0], similar_columns[key][1],
                                common_columns)




Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_max_INTER
Column to drop:  Dist_Max_INTER
Column Dist_max_INTER is not in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_sum_INTER
Column to drop:  Dist_Sum_INTER
Column Dist_sum_INTER is not in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_mean_INTER
Column to drop:  Dist_Mean_INTER
Column Dist_mean_INTER is not in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_sum_NAL
Column to drop:  Dist_Sum_NAL
Column Dist_sum_NAL is in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_mean_NAL
Column to drop:  Dist_Mean_NAL
Column Dist_mean_NAL is not in common_columns


In [10]:
help(swap_columns)

Help on function swap_columns in module functions:

swap_columns(df: pandas.core.frame.DataFrame, first_column: str, second_column: str)
    Function to swap two column names in a dataframe
    
    Args:
        df (pd.DataFrame): Dataframe to swap columns in
        first_column (str): Name of the first column to swap
        second_column (str): Name of the second column to swap
    
    Returns:
        pd.DataFrame: Dataframe with swapped columns



In [11]:
# Create a list with the keys of similar_columns excluding the key "Dist_Sum_NAL"
keys_to_swap = [key for key in similar_columns.keys() if key != 'Dist_Sum_NAL']

# Iterate over the elements in keys_to_swap and apply the function swap_columns to X_test
for key in keys_to_swap:
    X_test = swap_columns(X_test, similar_columns[key][0], similar_columns[key][1])

In [12]:
# Run again to check if the swapping worked
# Iterate over the keys in similar_columns and apply the function investigate_similar_columns
for key in similar_columns.keys():
    print("\n")
    investigate_similar_columns(X_test, similar_columns[key][0], similar_columns[key][1],
                                common_columns)




Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_Max_INTER
Column to drop:  Dist_max_INTER
Column Dist_Max_INTER is in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_Sum_INTER
Column to drop:  Dist_sum_INTER
Column Dist_Sum_INTER is in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_Mean_INTER
Column to drop:  Dist_mean_INTER
Column Dist_Mean_INTER is in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_sum_NAL
Column to drop:  Dist_Sum_NAL
Column Dist_sum_NAL is in common_columns


Columns are equal:  False
Non-null values are the same:  True
Column with the non-null (imputed) values:  Dist_Mean_NAL
Column to drop:  Dist_mean_NAL
Column Dist_Mean_NAL is in common_columns


In [13]:
# For X_train and X_test keep only the columns in common
X_train = X_train[common_columns]
X_test = X_test[common_columns]

In [14]:
# Check numerical columns of X_test with null values
X_test.select_dtypes(include=['int64', 'float64']).isnull().sum()

Dist_Mean_NAL      0
EDAD               0
INGRESOS           0
Dist_Max_INTER     0
EGRESOS            0
FECHA              0
VALOR              0
Dist_Sum_INTER     0
Dist_sum_NAL       0
Dist_Mean_INTER    0
Dist_HOY           0
OFICINA_VIN        0
HORA_AUX           0
DIAMES             0
NROCIUDADES        0
NROPAISES          0
DIASEM             0
FECHA_VIN          0
dtype: int64

In [15]:
# Check numerical columns of X_train with null values
print(X_train.select_dtypes(include=['int64', 'float64']).isnull().sum())
print(X_train.shape)

print("\n")
# Percentage of null values in each numeric column
print(X_train.select_dtypes(include=['int64', 'float64']).isnull().sum() * 100. / X_train.shape[0])

Dist_Mean_NAL       457
EDAD                 24
INGRESOS             24
Dist_Max_INTER     1547
EGRESOS              24
FECHA                 0
VALOR                 0
Dist_Sum_INTER     1547
Dist_sum_NAL          0
Dist_Mean_INTER    1547
Dist_HOY              0
OFICINA_VIN          24
HORA_AUX              0
DIAMES                0
NROCIUDADES           0
NROPAISES             0
DIASEM                0
FECHA_VIN            24
dtype: int64
(2965, 23)


Dist_Mean_NAL      15.413153
EDAD                0.809444
INGRESOS            0.809444
Dist_Max_INTER     52.175379
EGRESOS             0.809444
FECHA               0.000000
VALOR               0.000000
Dist_Sum_INTER     52.175379
Dist_sum_NAL        0.000000
Dist_Mean_INTER    52.175379
Dist_HOY            0.000000
OFICINA_VIN         0.809444
HORA_AUX            0.000000
DIAMES              0.000000
NROCIUDADES         0.000000
NROPAISES           0.000000
DIASEM              0.000000
FECHA_VIN           0.809444
dtype: float64


In [16]:
# Null percentages
null_percentages = X_train.select_dtypes(include=['int64', 'float64']).isnull().sum() * 100. / X_train.shape[0]

# Columns with more than 50% null values
null_columns = list(null_percentages[null_percentages > 50].index)

# Drop columns with more than 50% null values in X_train and X_test
X_train = X_train.drop(null_columns, axis=1)
X_test = X_test.drop(null_columns, axis=1)


In [17]:
# Percentage of null values in each categorical column
print(X_train.select_dtypes(include=['object']).isnull().sum() * 100. / X_train.shape[0])
print("\n")

# Check cardinality of categorical columns
print(X_train.select_dtypes(include=['object']).nunique())

SEGMENTO    0.809444
CANAL       0.000000
COD_PAIS    0.000000
SEXO        1.854975
Canal1      0.000000
dtype: float64


SEGMENTO     6
CANAL        3
COD_PAIS    29
SEXO         2
Canal1       2
dtype: int64


In [18]:
# Percentage of null values in each categorical column
print(X_test.select_dtypes(include=['object']).isnull().sum() * 100. / X_test.shape[0])
print("\n")

# Check cardinality of categorical columns
print(X_test.select_dtypes(include=['object']).nunique())

SEGMENTO    0.0
CANAL       0.0
COD_PAIS    0.0
SEXO        0.0
Canal1      0.0
dtype: float64


SEGMENTO    5
CANAL       3
COD_PAIS    8
SEXO        2
Canal1      2
dtype: int64


In [19]:
categorical_binary_features = ["SEXO", "Canal1"]
categorical_to_drop = "COD_PAIS"
categorical_non_binary_features = ["CANAL", "SEGMENTO"]

In [20]:
# Print unique values of "CANAL" and "SEGMENTO" in X_train and X_test
print(X_train["CANAL"].unique())
print(X_test["CANAL"].unique())
print("\n")
print(X_train["SEGMENTO"].unique())
print(X_test["SEGMENTO"].unique())

['ATM_INT' 'POS' 'MCI']
['POS' 'MCI' 'ATM_INT']


['Personal Plus' 'Personal' 'Emprendedor' nan 'PYME' 'Preferencial'
 'Empresarial']
['Personal Plus' 'Preferencial' 'Personal' 'Emprendedor' 'PYME']


In [21]:
# Impute column "SEGMENTO" in X_train with the most frequent value
X_train["SEGMENTO"] = X_train["SEGMENTO"].fillna(X_train["SEGMENTO"].mode()[0])

In [22]:
# Save the index for X_test
X_test_index = X_test.index

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

In [23]:
# Drop column "COD_PAIS" from X_train and X_test
X_train = X_train.drop("COD_PAIS", axis=1)
X_test = X_test.drop("COD_PAIS", axis=1)

In [24]:
# Apply LabelEncoder to categorical features in categorical_binary_features
le = LabelEncoder()

for feature in categorical_binary_features:
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])

# Apply OneHotEncoder to categorical features in categorical_non_binary_features
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

for feature in categorical_non_binary_features:
    X_train_ohe = ohe.fit_transform(X_train[[feature]])
    X_test_ohe = ohe.transform(X_test[[feature]])

    # Create column names for each OHE column
    ohe_categories = [f'{feature}_{category}' for category in ohe.categories_[0]]

    # Create a DataFrame with OHE columns
    X_train_ohe = pd.DataFrame(X_train_ohe, columns=ohe_categories)
    X_test_ohe = pd.DataFrame(X_test_ohe, columns=ohe_categories)

    # Concatenate X_train_ohe to X_train and X_test_ohe to X_test
    X_train = pd.concat([X_train, X_train_ohe], axis=1)
    X_test = pd.concat([X_test, X_test_ohe], axis=1)

    # Drop feature from X_train and X_test
    X_train = X_train.drop(feature, axis=1)
    X_test = X_test.drop(feature, axis=1)



## Check categorical data

In [25]:
# Print columns in X_train
print(X_train.columns)

# Print columns in X_test
print(X_test.columns)

# Verify that the columns in X_train and X_test are the same and have the same order
print(X_train.columns == X_test.columns)

Index(['Dist_Mean_NAL', 'EDAD', 'INGRESOS', 'EGRESOS', 'FECHA', 'VALOR',
       'Dist_sum_NAL', 'Dist_HOY', 'OFICINA_VIN', 'HORA_AUX', 'DIAMES', 'SEXO',
       'NROCIUDADES', 'NROPAISES', 'DIASEM', 'Canal1', 'FECHA_VIN',
       'CANAL_ATM_INT', 'CANAL_MCI', 'CANAL_POS', 'SEGMENTO_Emprendedor',
       'SEGMENTO_Empresarial', 'SEGMENTO_PYME', 'SEGMENTO_Personal',
       'SEGMENTO_Personal Plus', 'SEGMENTO_Preferencial'],
      dtype='object')
Index(['Dist_Mean_NAL', 'EDAD', 'INGRESOS', 'EGRESOS', 'FECHA', 'VALOR',
       'Dist_sum_NAL', 'Dist_HOY', 'OFICINA_VIN', 'HORA_AUX', 'DIAMES', 'SEXO',
       'NROCIUDADES', 'NROPAISES', 'DIASEM', 'Canal1', 'FECHA_VIN',
       'CANAL_ATM_INT', 'CANAL_MCI', 'CANAL_POS', 'SEGMENTO_Emprendedor',
       'SEGMENTO_Empresarial', 'SEGMENTO_PYME', 'SEGMENTO_Personal',
       'SEGMENTO_Personal Plus', 'SEGMENTO_Preferencial'],
      dtype='object')
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True 

In [26]:
y_train.reset_index(drop=True, inplace=True)

In [27]:
from xgboost import XGBClassifier

In [28]:
# Assuming y_train are the labels for the training set
model = XGBClassifier(objective='binary:logistic')  # for binary classification
model.fit(X_train, y_train)

In [29]:
# Create empty DataFrame y_test
y_test = pd.DataFrame()

# Add index to y_test using X_test_index
y_test.index = X_test_index

# Add column "FRAUDE" with the predicitions of the model using X_test
y_test["FRAUDE"] = model.predict(X_test)

In [30]:
y_test.to_csv("y_test.csv")

In [1]:
from functions import function1, function2


In [2]:
function1()

The function is working
