In [1]:
#!pip install scikit-learn
!pip install --force-reinstall numpy pandas
!pip install numpy==1.21 pandas==1.3
#!pip install feature-engine

Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl (21.2 MB)
Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl (12.6 MB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting pytz>=2020.1
  Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Collecting six>=1.5
  Using cached six-1.17.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas
  Attempting uninstall: pytz
    Found existing installation: pytz 2024.2
    Uninstalling pytz-2024.2:
      Successfully uninstalled pytz-2024.2
  Attempting uninstall: tzdata
    Found existing installation: tzdata 2024.2
    Uninstalling tzdata-2024.2:
      Successfully uninstalled tzdata-2024.2
  Attempting uninstall: six
    Found existing installation: six 1.17.0
    Uninstalling six-

In [2]:
# Report the library versions the kernel actually loaded
import numpy as np
import pandas as pd

for _label, _lib in (("Numpy version:", np), ("Pandas version:", pd)):
    print(_label, _lib.__version__)

Numpy version: 1.21.0
Pandas version: 1.3.0


In [3]:
# importing libs
import os
import pandas as pd

# machine learning libs
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# train-test-split lib
from sklearn.model_selection import train_test_split

# pipeline lib
from sklearn.pipeline import Pipeline

# feature-engine libs
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper

# model lib
import pickle

  from scipy.sparse import issparse


In [4]:
# Get the current working directory of the kernel process and show it,
# so the reader can see where the notebook is being run from
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/dellacorte/py-projects/data-science/supervised-learning-classification-pipeline


In [5]:
# Read the dataset.
# The path is built from the working directory instead of a user-specific
# absolute path, so the notebook runs on any machine as long as the kernel is
# started from the project root (the directory that contains `databases/`).
abt_path = os.path.join(os.getcwd(), 'databases', 'propensao_revenda_abt.csv')
df_abt = pd.read_csv(abt_path)

# preview of the first rows
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [6]:
# --- column roles ---------------------------------------------------------

# key columns: not used as model inputs
key_vars = ['data_ref_safra', 'seller_id']

# numeric predictors
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]

# categorical predictors
cat_vars = ['uf']

# column the models are trained to predict
target = 'nao_revendeu_next_6m'

# model inputs: categorical columns first, then numeric ones
# (the key variables are deliberately left out)
features = [*cat_vars, *num_vars]

# feature table: the selected columns of df_abt, in the order of `features`
X = df_abt.loc[:, features]
X.head()

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
0,SP,3,3,1,2685.0,74
1,ES,171,207,9,21275.23,2
2,SP,38,42,15,781.8,2
3,GO,1,1,1,120.0,16
4,SP,130,141,75,16228.88,8


In [7]:
# target vector: the `target` column of df_abt, selected as a Series
y = df_abt[target]
# preview of the first values
y.head()

0    1
1    0
2    0
3    1
4    0
Name: nao_revendeu_next_6m, dtype: int64

In [8]:
# creating the logistic regression model (random_state fixed at 42)
lr_model = LogisticRegression(random_state=42)

# fitting the model: X = numeric feature columns only, y = target
# NOTE(review): the recorded output of this cell shows the solver stopping at
# its iteration limit; the message suggests raising max_iter or scaling the data.
lr_model.fit(X=X[num_vars], y=y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [9]:
# intercept of the fitted model
lr_model.intercept_

array([-1.48739767])

In [10]:
# coefficients of the fitted model (the model was fitted on the num_vars columns)
lr_model.coef_

array([[-1.05500593e-02,  6.67061909e-03, -4.34068580e-02,
        -4.50547976e-05,  2.20512256e-02]])

In [11]:
# model prediction on the same numeric columns the model was fitted on
y_pred = lr_model.predict(X[num_vars])
# display the predicted labels
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
# predicting for a single, manually written observation
# The values are wrapped in a one-row DataFrame labelled with num_vars: the
# model was fitted on a DataFrame, so the input now carries the same column
# names instead of being a bare positional list.
manual_row = pd.DataFrame([[3, 3, 1, 2685.0, 74]], columns=num_vars)
lr_model.predict(manual_row)



array([0])

In [13]:
# side-by-side table: expected target next to the model's prediction
df_res = pd.DataFrame({
    'y_expected': y.copy(),
    'y_predicted': y_pred.copy(),
})

df_res

Unnamed: 0,y_expected,y_predicted
0,1,0
1,0,0
2,0,0
3,1,0
4,0,0
...,...,...
5364,1,0
5365,0,0
5366,0,0
5367,0,0


In [14]:
# appending the expected/predicted columns to the full table (df_abt)
# Only the two result columns are taken from df_res, so re-running this cell
# produces the same frame instead of stacking another copy of every column.
df_res = pd.concat([df_abt, df_res[['y_expected', 'y_predicted']]], axis=1)

df_res

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m,y_expected,y_predicted
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.00,74,1,1,0
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0,0,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.80,2,0,0,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.00,16,1,1,0
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
5364,2018-03-01,ff82e8873fba613f2261a9acc896fd84,MG,4,4,3,124.60,12,1,1,0
5365,2018-03-01,ffc470761de7d0232558ba5e786e57b7,SP,5,5,5,385.59,0,0,0,0
5366,2018-03-01,ffdd9f82b9a447f6f8d4b91554cc7dd3,PR,11,12,8,1450.20,7,0,0,0
5367,2018-03-01,ffeee66ac5d5a62fe688b9d26f83f534,SP,13,13,3,1709.87,0,0,0,0


## Step by step on how to run a classification model in Python.

In the following example, we use Logistic Regression

In [15]:
# creating logistic regression model
lr_model = LogisticRegression(random_state=42)

# fitting the model: X = numeric feature columns, y = target
lr_model.fit(X=X[num_vars], y=y)

# model prediction on the same rows used for fitting
y_pred = lr_model.predict(X[num_vars])

# computing accuracy: share of predictions that match the expected target
acc = accuracy_score(y_true=y, y_pred=y_pred)
acc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8252933507170795

### Train-Test split

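We will learn how to split the dataset into training and test sets, so that the model can be evaluated on data it did not see during fitting. Comparing the two results helps us reason about the Bias-Variance tradeoff.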

In [16]:
# relative frequency of each class of the target variable
y.value_counts(normalize=True)

0    0.61762
1    0.38238
Name: nao_revendeu_next_6m, dtype: float64

In [17]:
# splitting the dataset
# train_size parameter = share of the data placed in the training set (0 - 1)
# test_size parameter = share of the data placed in the test set (0 - 1)
# random_state is fixed at 42
train_test_split(X, y, test_size=0.2, random_state=42)

# the result is a list of 4 objects; it is only displayed here, not stored

[      uf  tot_orders_12m  tot_items_12m  tot_items_dist_12m  receita_12m  \
 4776  SP               5              6                   5       309.90   
 4578  SC               5              5                   1       995.00   
 2948  SP              14             14                   2     21519.50   
 4416  SP              17             17                   8      8563.93   
 2459  PR               4              4                   3       299.69   
 ...   ..             ...            ...                 ...          ...   
 3092  SE               3              3                   2       329.70   
 3772  SP              94             98                  61     23162.00   
 5191  SP              11             11                   6      4299.19   
 5226  SP               1              1                   1        10.90   
 860   SP              13             13                   2      2127.80   
 
       recencia  
 4776       246  
 4578         4  
 2948       174  
 4

In [18]:
# 80/20 split, stratified on the target and stored in four named variables
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# checking the size of the split: (rows, columns) of the train and test features
X_train.shape, X_test.shape

((4295, 6), (1074, 6))

In [20]:
# checking the size of the split: shape of the train and test targets
y_train.shape, y_test.shape

((4295,), (1074,))

In [21]:
# class proportions of the target in the training set
y_train.value_counts(normalize=True)

0    0.617695
1    0.382305
Name: nao_revendeu_next_6m, dtype: float64

In [22]:
# class proportions of the target in the test set
y_test.value_counts(normalize=True)

0    0.617318
1    0.382682
Name: nao_revendeu_next_6m, dtype: float64

In [23]:
# creating logistic regression model
lr_model = LogisticRegression(random_state=42)

# fitting the model on the training set only (numeric columns)
lr_model.fit(X=X_train[num_vars], y=y_train)

# model prediction on the train set
# (the stray bare `y_pred` that followed was removed: it referred to a variable
# from an earlier cell and had no effect on this cell's result)
y_pred_train = lr_model.predict(X_train[num_vars])

# computing accuracy: share of train-set predictions that match the expected target
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_train

0.8279394644935972

In [24]:
# model prediction in the test set
# (the stray bare `y_pred` that followed was removed: it referred to a variable
# from an earlier cell and had no effect on this cell's result)
y_pred_test = lr_model.predict(X_test[num_vars])

# computing accuracy: share of test-set predictions that match the expected target
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
acc_test

0.8081936685288641

Comparing the model's accuracy on the two data sets (train and test), a slight reduction in our metric is observed on the test set. This comparison is used to assess the Bias-Variance tradeoff.

In [25]:
# 80/20 split, stratified on the target
# (test_size = share of the data placed in the test set, between 0 and 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# logistic regression fitted on the numeric columns of the training set
model = LogisticRegression(random_state=42)
model.fit(X=X_train[num_vars], y=y_train)

# predictions for both sets
y_pred_train = model.predict(X_train[num_vars])
y_pred_test = model.predict(X_test[num_vars])

# accuracy: share of predictions that match the expected target, per set
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f'Accuracy for Training {acc_train * 100:.2f}%')
print(f'Accuracy for Test {acc_test * 100:.2f}%')

Accuracy for Training 82.79%
Accuracy for Test 80.82%


#### LogisticRegression model:

- Training Accuracy 82.79%
- Test Accuracy 80.82%

In [26]:
# 80/20 split, stratified on the target
# (test_size = share of the data placed in the test set, between 0 and 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# decision tree classifier (depth limited to 4) fitted on the numeric columns
# of the training set
model = DecisionTreeClassifier(random_state=42, max_depth=4)
model.fit(X=X_train[num_vars], y=y_train)

# predictions for both sets
y_pred_train = model.predict(X_train[num_vars])
y_pred_test = model.predict(X_test[num_vars])

# accuracy: share of predictions that match the expected target, per set
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f'Accuracy for Training {acc_train * 100:.2f}%')
print(f'Accuracy for Test {acc_test * 100:.2f}%')

Accuracy for Training 84.19%
Accuracy for Test 81.75%


#### DecisionTreeClassifier:

- Training Accuracy 84.19%
- Test Accuracy 81.75%

### Applying feature engineering

In [27]:
# viewing the first rows of our training features (before any transformation)
X_train.head()

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
2658,SP,2,2,2,85.29,258
876,MG,2,2,2,223.9,50
959,SP,1,1,1,670.0,1
1143,SP,1,4,1,143.6,61
3787,SP,2,2,2,115.0,4


In [28]:
# viewing the first rows of our test features (before any transformation)
X_test.head()

Unnamed: 0,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia
72,SP,4,4,4,269.6,42
5029,SP,58,78,25,3088.82,10
126,SP,4,4,3,1094.0,44
4399,PR,1,1,1,343.0,173
166,GO,3,3,3,594.99,62


In [29]:
# preparing the DataFrame for use in the ML model

# one-hot encoder for the categorical variables, fitted on the training set
ohe = OneHotEncoder(variables=cat_vars).fit(X_train)
ohe

OneHotEncoder(variables=['uf'])

In [30]:
# applying the fitted OneHotEncoder to the training set (variable ['uf']);
# the result is only displayed here, not stored
ohe.transform(X_train)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_MG,uf_SC,uf_DF,uf_PR,...,uf_PA,uf_MS,uf_GO,uf_RN,uf_MT,uf_CE,uf_RO,uf_AM,uf_PI,uf_MA
2658,2,2,2,85.29,258,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
876,2,2,2,223.90,50,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
959,1,1,1,670.00,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1143,1,4,1,143.60,61,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3787,2,2,2,115.00,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4785,16,21,13,6821.43,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
691,13,13,2,4387.00,48,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1113,2,2,2,677.00,26,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4580,59,69,10,5633.62,211,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# columns of the encoded training set
# NOTE(review): compare with the same call on X_test in the next cell; the
# recorded outputs list identical columns for both sets, so no set has extra columns.
ohe.transform(X_train).columns

Index(['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m',
       'recencia', 'uf_SP', 'uf_MG', 'uf_SC', 'uf_DF', 'uf_PR', 'uf_BA',
       'uf_RS', 'uf_RJ', 'uf_PB', 'uf_ES', 'uf_SE', 'uf_PE', 'uf_PA', 'uf_MS',
       'uf_GO', 'uf_RN', 'uf_MT', 'uf_CE', 'uf_RO', 'uf_AM', 'uf_PI', 'uf_MA'],
      dtype='object')

In [32]:
# columns of the encoded test set, produced by the encoder fitted on X_train
# NOTE(review): the recorded output lists the same columns as for the encoded
# training set in the previous cell, so no set has extra columns.
ohe.transform(X_test).columns

Index(['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m',
       'recencia', 'uf_SP', 'uf_MG', 'uf_SC', 'uf_DF', 'uf_PR', 'uf_BA',
       'uf_RS', 'uf_RJ', 'uf_PB', 'uf_ES', 'uf_SE', 'uf_PE', 'uf_PA', 'uf_MS',
       'uf_GO', 'uf_RN', 'uf_MT', 'uf_CE', 'uf_RO', 'uf_AM', 'uf_PI', 'uf_MA'],
      dtype='object')

In [33]:
# preparing the DataFrame for use in the ML model

# one-hot encoder for the categorical variables, fitted on the training set
ohe = OneHotEncoder(variables=cat_vars).fit(X_train)

# encoded training features, kept for the model cells that follow
X_train_transformed = ohe.transform(X_train)
X_train_transformed.head(3)

Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_SP,uf_MG,uf_SC,uf_DF,uf_PR,...,uf_PA,uf_MS,uf_GO,uf_RN,uf_MT,uf_CE,uf_RO,uf_AM,uf_PI,uf_MA
2658,2,2,2,85.29,258,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
876,2,2,2,223.9,50,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
959,1,1,1,670.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# logistic regression fitted on the one-hot-encoded training features
model = LogisticRegression(random_state=42)
model.fit(X=X_train_transformed, y=y_train)

# train set: predictions and accuracy
y_pred_train = model.predict(X_train_transformed)
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)

# test set: it must go through the same encoder before prediction
encoded_test = ohe.transform(X_test)
y_pred_test = model.predict(encoded_test)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

print(f'Accuracy for Training {acc_train * 100:.2f}%')
print(f'Accuracy for Test {acc_test * 100:.2f}%')

Accuracy for Training 82.77%
Accuracy for Test 80.73%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# end-to-end run: split -> one-hot encode categorical variables -> logistic regression

# 80/20 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# encoder fitted on the training set, then applied to both sets
ohe = OneHotEncoder(variables=cat_vars).fit(X_train)
X_train_transformed = ohe.transform(X_train)
X_test_transformed = ohe.transform(X_test)

# model fitted on the encoded training features
model = LogisticRegression(random_state=42)
model.fit(X=X_train_transformed, y=y_train)

# accuracy on the training set
y_pred_train = model.predict(X_train_transformed)
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f'Accuracy for Training {acc_train * 100:.2f}%')

# accuracy on the test set
y_pred_test = model.predict(X_test_transformed)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f'Accuracy for Test {acc_test * 100:.2f}%')

Accuracy for Training 82.77%
Accuracy for Test 80.73%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# applying StandardScaler to my numeric variables

# dividing my dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# transformation of categorical data
# The encoding is recomputed from the split made in this cell, so the scaler
# below no longer depends on X_train_transformed / X_test_transformed left over
# from an earlier cell.
ohe = OneHotEncoder(variables=cat_vars)
ohe.fit(X_train)
X_train_transformed = ohe.transform(X_train)
X_test_transformed = ohe.transform(X_test)

# transformation of numeric data: the scaler is applied to the num_vars columns
# only and is fitted on the training set
std_scaler = SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars)
std_scaler.fit(X_train_transformed)

# applying the transformation to the training and test sets
X_train_transformed_with_std_scaler = std_scaler.transform(X_train_transformed)
X_test_transformed_with_std_scaler = std_scaler.transform(X_test_transformed)

# creation of the machine learning model on the encoded and scaled features
model = LogisticRegression(random_state=42)
model.fit(X=X_train_transformed_with_std_scaler, y=y_train)

# Result based on training
y_pred_train = model.predict(X_train_transformed_with_std_scaler)
acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f'Accuracy for Training {acc_train * 100:.2f}%')

# Result in the test base
y_pred_test = model.predict(X_test_transformed_with_std_scaler)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print(f'Accuracy for Test {acc_test * 100:.2f}%')

Accuracy for Training 83.07%
Accuracy for Test 80.73%
