In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score



In [2]:
# Load the transfusion dataset from the working directory and preview the first five rows.
DATA_PATH = 'transfusion.data'
data = pd.read_csv(DATA_PATH)
data.head(5)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [3]:
# Print column names, dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [4]:
# Give the label column a short name that the rest of the notebook refers to as `Target`.
# Rebinding the result (instead of `inplace=True`) avoids mutating the frame
# that earlier cells already displayed, and the cell stays safe to re-run:
# `rename` ignores keys that are no longer present.
data = data.rename(
    columns={'whether he/she donated blood in March 2007': 'Target'}
)

data.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),Target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [5]:
# Share of each Target value (class balance), rounded to three decimals.
data['Target'].value_counts(normalize=True).round(3)

0    0.762
1    0.238
Name: Target, dtype: float64

In [6]:
# Separate predictors from the label, then hold out 25% of the rows.
# Stratifying on the label keeps its class proportions in both splits.
features = data.drop(columns='Target')
target = data.Target

xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.25, random_state=42, stratify=target
)

In [7]:
# Preview the first two rows of the training features.
xtrain.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


In [8]:
# Preview the first two rows of the test features.
xtest.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
41,2,5,1250,16
682,11,2,500,25


In [9]:
# Preview the first two training labels.
ytrain.head(2)

334    0
99     1
Name: Target, dtype: int64

In [10]:
# Preview the first two test labels.
ytest.head(2)

41     0
682    0
Name: Target, dtype: int64

In [11]:

# Configure and run TPOT's automated pipeline search, scoring candidates by ROC AUC.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    scoring='roc_auc',
    generations=5,
    population_size=20,
    random_state=42,
    verbosity=2,
    disable_update_check=True,
)
tpot.fit(xtrain, ytrain)

# Evaluate on the held-out split using the predicted probability in column 1.
tpot_test_proba = tpot.predict_proba(xtest)[:, 1]
tpot_auc_score = roc_auc_score(ytest, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')

# List the steps of the selected pipeline, numbered from 1; the step name is not shown.
print('\nBest pipeline steps:')
for position, (_, step) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{position}. {step}')

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)

AUC score: 0.7637

Best pipeline steps:
1. Normalizer(copy=True, norm='l2')
2. MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)


In [12]:
# Per-feature variance of the training split, rounded to three decimals.
xtrain.var().round(3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [13]:
col_to_normalize = 'Monetary (c.c. blood)'


def _replace_with_log(frame, column):
    """Return a new frame where `column` is dropped and its natural log is kept as 'monetary_log'.

    The input frame is left untouched; the new column is appended last.
    """
    # NOTE(review): np.log gives -inf for zeros; the rows displayed in this
    # notebook are all positive -- confirm the full column has no zeros.
    return (
        frame
        .assign(monetary_log=np.log(frame[column]))
        .drop(columns=column)
    )


# Apply the same transform to both splits so they keep identical columns.
xtrain_normed = _replace_with_log(xtrain, col_to_normalize)
xtest_normed = _replace_with_log(xtest, col_to_normalize)

# Re-check the variances after the transform.
xtrain_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [14]:
from sklearn import linear_model

# Fit a logistic regression on the log-transformed training features.
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42)
logreg.fit(xtrain_normed, ytrain)

# Score the held-out split using the predicted probability in column 1.
logreg_test_proba = logreg.predict_proba(xtest_normed)[:, 1]
logreg_auc_score = roc_auc_score(ytest, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


In [15]:
from operator import itemgetter

# Rank the two models by their test-set AUC, highest first.
auc_by_model = [('tpot', tpot_auc_score), ('logreg', logreg_auc_score)]
sorted(auc_by_model, key=itemgetter(1), reverse=True)

[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]

In [18]:
import pickle

# Serialize the fitted logistic-regression model; the context manager closes the file.
# NOTE(review): logreg was fit on xtrain_normed, where the Monetary column is
# replaced by its log ('monetary_log'), so anything loading this pickle must
# apply the same transform before calling predict -- confirm with consumers.
with open('model.pickle', mode='wb') as model_file:
    pickle.dump(logreg, model_file)
print("Success")

Success
