
# <font color='white gray'>panData</font>
## <font color='white gray'>Data Science for Multivariate Data Analysis</font>
### <font color='white gray'>Machine Learning Model Selection in Multivariate Analysis with Anonymized Data</font>


### Installing and Loading Packages


In [2]:
# To update a package, run the following command in the terminal or command prompt:
# pip install -U package_name

# To install the exact version of a package, run the following command in the terminal or command prompt:
# !pip install package_name==desired_version

# After installing or updating the package, restart the jupyter notebook.

# Install the watermark package.
# This package is used to record the versions of other packages used in this jupyter notebook.
!pip install -q -U watermark


In [3]:
# Imports
import pandas as pd
import numpy as np
import pickle

In [4]:
# (Re)load the watermark extension and print the notebook author.
# Interpreter and package versions are printed by the %watermark cells at the end of the notebook.
%reload_ext watermark
%watermark -a "panData"

Author: panData



## Loading the Data


In [5]:
# Loading the data
# "dataset.csv" is a relative path, so the file is looked up in the notebook's working directory.
df = pd.read_csv("dataset.csv")

In [6]:
# Shape
df.shape

(11500, 179)

In [7]:
# Viewing some records
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,False
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,True
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,False
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,False
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,False


## Exploratory Data Analysis and Data Cleaning


In [8]:
# Categories of the target variable
df.LABEL_TARGET.value_counts()

Unnamed: 0_level_0,count
LABEL_TARGET,Unnamed: 1_level_1
False,9200
True,2300


In [9]:
# Convert the target to integer codes: False -> 0, True -> 1.
# The value counts above show False/True, so the column was presumably parsed as
# boolean rather than string (confirm with df.dtypes).
df["LABEL_TARGET"] = df["LABEL_TARGET"].astype(int)

In [10]:
# Viewing some records
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
0,135,190,229,223,192,125,55,-9,-33,-38,...,-17,-15,-31,-77,-103,-127,-116,-83,-51,0
1,386,382,356,331,320,315,307,272,244,232,...,164,150,146,152,157,156,154,143,129,1
2,-32,-39,-47,-37,-32,-36,-57,-73,-85,-94,...,57,64,48,19,-12,-30,-35,-35,-36,0
3,-105,-101,-96,-92,-89,-95,-102,-100,-87,-79,...,-82,-81,-80,-77,-85,-77,-72,-69,-65,0
4,-9,-65,-98,-102,-78,-48,-16,0,-21,-59,...,4,2,-12,-32,-41,-65,-83,-89,-73,0


In [11]:
print("Number of columns:", len(df.columns))

Number of columns: 179


In [12]:
# Statistical summary
df.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
count,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,...,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0
mean,-11.581391,-10.911565,-10.18713,-9.143043,-8.009739,-7.003478,-6.502087,-6.68713,-6.558,-6.168435,...,-10.145739,-11.630348,-12.943478,-13.66887,-13.363304,-13.045043,-12.70513,-12.426,-12.195652,0.2
std,165.626284,166.059609,163.524317,161.269041,160.998007,161.328725,161.467837,162.11912,162.03336,160.436352,...,164.652883,166.14979,168.554058,168.556486,167.25729,164.241019,162.895832,162.886311,164.852015,0.400017
min,-1839.0,-1838.0,-1835.0,-1845.0,-1791.0,-1757.0,-1832.0,-1778.0,-1840.0,-1867.0,...,-1867.0,-1865.0,-1642.0,-1723.0,-1866.0,-1863.0,-1781.0,-1727.0,-1829.0,0.0
25%,-54.0,-55.0,-54.0,-54.0,-54.0,-54.0,-54.0,-55.0,-55.0,-54.0,...,-55.0,-56.0,-56.0,-56.0,-55.0,-56.0,-55.0,-55.0,-55.0,0.0
50%,-8.0,-8.0,-7.0,-8.0,-8.0,-8.0,-8.0,-8.0,-7.0,-7.0,...,-9.0,-10.0,-10.0,-10.0,-10.0,-9.0,-9.0,-9.0,-9.0,0.0
75%,34.0,35.0,36.0,36.0,35.0,36.0,35.0,36.0,36.0,35.25,...,34.0,34.0,33.0,33.0,34.0,34.0,34.0,34.0,34.0,0.0
max,1726.0,1713.0,1697.0,1612.0,1518.0,1816.0,2047.0,2047.0,2047.0,2047.0,...,1777.0,1472.0,1319.0,1436.0,1733.0,1958.0,2047.0,2047.0,1915.0,1.0


In [13]:
# Checking for missing values
df.isnull().values.any()

False

In [14]:
# Materialize the column labels of df as a plain Python list
column_list = list(df.columns)

In [15]:
# Input variable columns: every column except the target.
# Selecting by name (instead of slicing the first 178 positions) keeps this
# correct if the number or the order of columns in the CSV changes.
input_columns = [col for col in column_list if col != "LABEL_TARGET"]

In [52]:
print(input_columns)

['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100', 'X101', 'X102', 'X103', 'X104', 'X105', 'X106', 'X107', 'X108', 'X109', 'X110', 'X111', 'X112', 'X113', 'X114', 'X115', 'X116', 'X117', 'X118', 'X119', 'X120', 'X121', 'X122', 'X123', 'X124', 'X125', 'X126', 'X127', 'X128', 'X129', 'X130', 'X131', 'X132', 'X133', 'X134', 'X135', 'X136', 'X137', 'X138', 'X1

In [17]:
# Checking if there are duplicate columns in the input data:
# collect every name that occurs more than once, show it, and stop if any exists.
# NOTE(review): pd.read_csv renames repeated header names (X, X.1, ...), so for
# columns that come straight from read_csv this check is not expected to fire.
dup_cols = {name for name in input_columns if input_columns.count(name) > 1}
print(dup_cols)
assert not dup_cols, "You have duplicated columns in input_columns"

set()


In [18]:
# Checking if there are duplicate columns in the complete dataset:
# collect every name that occurs more than once, show it, and stop if any exists.
# NOTE(review): pd.read_csv renames repeated header names (X, X.1, ...), so for
# columns that come straight from read_csv this check is not expected to fire.
dup_cols = {name for name in column_list if column_list.count(name) > 1}
print(dup_cols)
assert not dup_cols, 'You have duplicated columns in column_list'

set()


In [19]:
# Categories of the target variable
df.LABEL_TARGET.value_counts()

Unnamed: 0_level_0,count
LABEL_TARGET,Unnamed: 1_level_1
0,9200
1,2300


The prevalence is the percentage of samples that have the characteristic you are trying to predict. In our case, this means that the people who renewed their insurance are in the positive class (event occurrence), and those who did not are in the negative class (no event occurred).

The terms positive and negative do not carry a good or bad connotation. They are simply the terminology used to indicate the occurrence or non-occurrence of the event.

The rate is calculated by (number of positive samples / total number of samples). Therefore, a prevalence rate of 0.2 means that 20% of our sample renewed their car insurance.

In [20]:
# This function calculates the prevalence of the positive class (label = 1)
def calculate_prevalence(y_actual):
    """Return the share of positive labels in ``y_actual``.

    Parameters
    ----------
    y_actual : sequence of numbers
        Labels encoded so that summing them counts the positive samples
        (1 for the positive class, 0 for the negative class).

    Returns
    -------
    The sum of the labels divided by the number of labels.

    Raises
    ------
    ZeroDivisionError
        If ``y_actual`` is empty.
    """
    positives = sum(y_actual)
    total = len(y_actual)
    return positives / total

In [21]:
print("Prevalence of the positive class: %.3f" % calculate_prevalence(df["LABEL_TARGET"].values))

Prevalence of the positive class: 0.200


> Class imbalance is a problem that will need to be addressed during the data preprocessing phase.

## Data Splitting While Maintaining Class Prevalence


In [22]:
# Shuffle all rows: sampling n = len(df) rows without replacement is a permutation.
# A fixed random_state makes the shuffle identical on every run
# (same seed value as the class-balancing cells further down).
df_data = df.sample(n = len(df), random_state = 69)

In [23]:
df_data

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
10828,15,16,6,15,3,-8,-22,-56,-77,-83,...,38,16,16,24,16,5,-1,-5,-23,0
2731,-58,-73,-117,-170,-194,-185,-145,-90,-49,-2,...,4,-6,-8,-13,-16,-19,-21,-27,-38,0
1018,-41,-30,-26,-18,-11,-6,1,-4,-7,-10,...,-93,-83,-74,-66,-71,-72,-76,-80,-79,0
9192,53,37,14,-10,-18,-21,-32,-38,-68,-80,...,-6,-11,-26,-38,-41,-31,-15,-4,-10,0
2394,-290,-291,-299,-301,-299,-298,-298,-292,-288,-288,...,-211,-217,-229,-244,-260,-266,-276,-281,-285,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6604,-9,-15,-15,-25,-25,-19,-16,-15,-11,-14,...,30,31,34,31,28,24,23,23,24,0
10404,-47,-62,-87,-119,-131,-132,-115,-95,-69,-39,...,-140,-168,-177,-164,-144,-110,-72,-40,-17,0
5432,-44,-15,-14,-16,-23,-26,-25,-41,-39,-51,...,-34,-28,-15,-13,-20,-23,-33,-23,-29,0
10848,7,15,10,17,15,21,15,11,3,-5,...,-27,-29,-32,-41,-40,-47,-46,-59,-39,0


In [24]:
# Adjusting the dataset indices: discard the shuffled row labels and renumber the rows from 0.
# drop=True prevents the old labels from being kept as an extra column.
df_data = df_data.reset_index(drop=True)

In [25]:
df_data

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
0,15,16,6,15,3,-8,-22,-56,-77,-83,...,38,16,16,24,16,5,-1,-5,-23,0
1,-58,-73,-117,-170,-194,-185,-145,-90,-49,-2,...,4,-6,-8,-13,-16,-19,-21,-27,-38,0
2,-41,-30,-26,-18,-11,-6,1,-4,-7,-10,...,-93,-83,-74,-66,-71,-72,-76,-80,-79,0
3,53,37,14,-10,-18,-21,-32,-38,-68,-80,...,-6,-11,-26,-38,-41,-31,-15,-4,-10,0
4,-290,-291,-299,-301,-299,-298,-298,-292,-288,-288,...,-211,-217,-229,-244,-260,-266,-276,-281,-285,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,-9,-15,-15,-25,-25,-19,-16,-15,-11,-14,...,30,31,34,31,28,24,23,23,24,0
11496,-47,-62,-87,-119,-131,-132,-115,-95,-69,-39,...,-140,-168,-177,-164,-144,-110,-72,-40,-17,0
11497,-44,-15,-14,-16,-23,-26,-25,-41,-39,-51,...,-34,-28,-15,-13,-20,-23,-33,-23,-29,0
11498,7,15,10,17,15,21,15,11,3,-5,...,-27,-29,-32,-41,-40,-47,-46,-59,-39,0


In [26]:
# Extracting a 30% random sample from the data (later divided into validation and test).
# random_state fixes the draw so the same rows are held out on every run.
df_sample_30 = df_data.sample(frac=0.3, random_state=69)

In [27]:
df_sample_30

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
6951,8,-8,-29,-28,-9,8,15,7,-4,-17,...,38,17,22,65,102,123,99,59,22,0
9737,42,50,35,24,11,-2,-16,-10,-12,-1,...,86,87,81,66,62,76,79,78,72,0
11371,-8,-3,5,18,29,28,27,25,33,25,...,14,-26,-61,-93,-98,-91,-85,-82,-79,0
2493,45,45,50,47,49,43,26,10,10,20,...,30,27,32,40,42,44,47,49,54,0
3331,2,9,14,16,20,30,32,9,-12,-34,...,-46,-46,-46,-44,-45,-48,-47,-49,-55,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7081,-150,-101,-44,-10,23,76,154,215,218,173,...,334,294,220,146,99,82,81,71,42,1
10526,-84,-74,-56,-45,-28,-18,-7,-14,-9,-7,...,44,40,28,8,-10,-29,-35,-32,-32,0
5013,-98,-106,-97,-71,-37,15,33,55,63,45,...,-148,-170,-152,-147,-140,-115,-67,-31,-35,0
10742,15,9,6,15,13,17,20,14,4,1,...,-13,-9,-6,-12,-8,-6,-3,6,12,0


In [28]:
print("Validation/Test split size: %.1f" % (len(df_sample_30) / len(df_data)))

Validation/Test split size: 0.3


In [29]:
# Performing the split
# NOTE: this is plain random sampling, not stratified sampling; class prevalence
# is not enforced here, only checked after the split.

# Test data: a random half of the 30% sample.
# random_state makes the test/validation partition identical on every run.
df_test = df_sample_30.sample(frac=0.5, random_state=69)

# Validation data: the rows of the 30% sample that were not drawn for the test set
df_valid = df_sample_30.drop(df_test.index)

# Training data: every row outside the 30% sample
df_train = df_data.drop(df_sample_30.index)

# Sanity check: the subset sizes must add up to the full dataset. This fails if
# index labels are not unique, because drop() would then remove extra rows.
assert len(df_test) + len(df_valid) + len(df_train) == len(df_data), \
    "Train/validation/test sizes do not add up to the full dataset"

In [30]:
df_test

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
3600,82,53,-3,-36,-63,-74,-59,-57,-67,-64,...,17,49,69,63,40,8,-10,7,17,0
9519,-38,-30,-21,-23,-17,-18,-19,-24,-29,-39,...,-97,-87,-80,-73,-74,-74,-73,-71,-65,0
2436,38,10,-11,-2,-2,-3,-12,-12,34,86,...,-56,-4,11,46,86,95,92,61,30,0
546,-2,-4,-9,-19,-22,-27,-26,-29,-26,-33,...,-29,-31,-30,-26,-28,-16,-15,-6,-7,0
6097,47,96,171,190,173,130,64,21,16,20,...,-1,-6,-36,-96,-131,-131,-107,-115,-141,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,-8,-8,-10,-19,-18,-7,3,19,12,0,...,2,-5,-11,-21,-32,-36,-34,-31,-21,0
6960,14,14,11,19,27,28,25,30,31,17,...,-11,-20,-25,-27,-26,-16,-10,-9,1,0
5542,-37,-34,-38,-44,-38,-26,-20,-6,-13,-20,...,-45,-19,5,9,15,32,40,33,11,0
10339,-146,-139,-117,-97,-81,-72,-75,-83,-101,-111,...,-64,-55,-55,-50,-48,-44,-28,-19,-11,0


In [31]:
df_valid

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
11371,-8,-3,5,18,29,28,27,25,33,25,...,14,-26,-61,-93,-98,-91,-85,-82,-79,0
2493,45,45,50,47,49,43,26,10,10,20,...,30,27,32,40,42,44,47,49,54,0
3331,2,9,14,16,20,30,32,9,-12,-34,...,-46,-46,-46,-44,-45,-48,-47,-49,-55,1
4459,212,198,175,132,80,29,18,72,199,377,...,-217,-229,-154,-23,96,131,87,-19,-120,1
11359,59,61,63,61,59,61,66,76,75,71,...,-75,-71,-65,-64,-58,-50,-36,-23,-11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4983,60,77,90,97,101,107,106,103,94,82,...,26,20,22,17,13,4,-2,-8,-14,0
5038,62,70,73,80,77,69,66,67,68,62,...,-73,-67,-70,-81,-95,-85,-69,-46,-31,0
7081,-150,-101,-44,-10,23,76,154,215,218,173,...,334,294,220,146,99,82,81,71,42,1
5013,-98,-106,-97,-71,-37,15,33,55,63,45,...,-148,-170,-152,-147,-140,-115,-67,-31,-35,0


In [32]:
df_train

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X170,X171,X172,X173,X174,X175,X176,X177,X178,LABEL_TARGET
0,15,16,6,15,3,-8,-22,-56,-77,-83,...,38,16,16,24,16,5,-1,-5,-23,0
1,-58,-73,-117,-170,-194,-185,-145,-90,-49,-2,...,4,-6,-8,-13,-16,-19,-21,-27,-38,0
2,-41,-30,-26,-18,-11,-6,1,-4,-7,-10,...,-93,-83,-74,-66,-71,-72,-76,-80,-79,0
3,53,37,14,-10,-18,-21,-32,-38,-68,-80,...,-6,-11,-26,-38,-41,-31,-15,-4,-10,0
4,-290,-291,-299,-301,-299,-298,-298,-292,-288,-288,...,-211,-217,-229,-244,-260,-266,-276,-281,-285,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11493,22,43,33,1,-28,-41,-57,-52,-60,-58,...,15,-45,-94,-76,-56,-13,0,21,64,0
11494,24,23,28,23,25,25,16,14,5,-4,...,-60,-58,-52,-58,-58,-46,-43,-36,-35,0
11495,-9,-15,-15,-25,-25,-19,-16,-15,-11,-14,...,30,31,34,31,28,24,23,23,24,0
11497,-44,-15,-14,-16,-23,-26,-25,-41,-39,-51,...,-34,-28,-15,-13,-20,-23,-33,-23,-29,0


In [33]:
# Check the prevalence of each subset: one report line per subset,
# showing its row count and its share of positive labels.
for template, subset in (
    ("Test (n = %d): %.3f", df_test),
    ("Validation (n = %d): %.3f", df_valid),
    ("Train (n = %d): %.3f", df_train),
):
    print(template % (len(subset), calculate_prevalence(subset.LABEL_TARGET.values)))

Test (n = 1725): 0.202
Validation (n = 1725): 0.202
Train (n = 8050): 0.199


## Class Balancing

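We will apply the undersampling strategy: rows of the majority class are randomly discarded from the training set until both classes have the same number of rows. Only the training data is balanced; the validation and test sets keep their original class prevalence. Refer to Chapter 8 for the definition of these concepts.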


In [34]:
df_test.shape

(1725, 179)

In [35]:
df_valid.shape

(1725, 179)

In [36]:
df_train.shape

(8050, 179)

In [37]:
df_train.LABEL_TARGET.value_counts()

Unnamed: 0_level_0,count
LABEL_TARGET,Unnamed: 1_level_1
0,6447
1,1603


In [38]:
# Boolean mask over the training rows: True where the label is the positive class (1)
index = df_train["LABEL_TARGET"].eq(1)

In [39]:
# Split the training rows by class using the boolean mask
df_train_neg = df_train[~index]
df_train_pos = df_train[index]

In [40]:
# Number of rows in the smaller of the two classes
min_value = np.array([len(df_train_pos), len(df_train_neg)]).min()

In [41]:
min_value

1603

In [42]:
# Draw min_value random rows from each class (positives first, then negatives)
# and stack them into one frame with a fresh 0..n-1 index
df_train_final = pd.concat(
    [
        class_rows.sample(n = min_value, random_state = 69)
        for class_rows in (df_train_pos, df_train_neg)
    ],
    axis = 0,
    ignore_index = True,
)

In [43]:
# Shuffle the balanced training rows (frac = 1 draws every row exactly once)
# so the two classes are interleaved, then renumber the rows from 0
df_train_final = df_train_final.sample(frac = 1, random_state = 69).reset_index(drop=True)

In [44]:
df_train_final.shape

(3206, 179)

In [45]:
df_train_final.LABEL_TARGET.value_counts()

Unnamed: 0_level_0,count
LABEL_TARGET,Unnamed: 1_level_1
0,1603
1,1603


In [46]:
# Dataset balancing
print('Balance in Train (n = %d): %.3f' % (len(df_train_final),
                                           calculate_prevalence(df_train_final.LABEL_TARGET.values)))

Balance in Train (n = 3206): 0.500


## Saving the Preprocessing Results

In [48]:
# We save all datasets to disk in CSV format (row index not written)
for csv_name, frame in (
    ('train_data.csv', df_train),
    ('train_final_data.csv', df_train_final),
    ('validation_data.csv', df_valid),
    ('test_data.csv', df_test),
):
    frame.to_csv(csv_name, index=False)

In [49]:
# We save the names of the input data (predictor columns) to facilitate later use.
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving an unclosed handle from a bare open() call.
with open('input_columns.sav', 'wb') as columns_file:
    pickle.dump(input_columns, columns_file)

In [53]:
%watermark -v -m

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [54]:
%watermark --iversions

pandas: 2.2.2
numpy : 1.26.4

