In [1]:
# Import pandas (plus os/pathlib, used only to locate the dataset)
import os
from pathlib import Path

import pandas as pd

# Location of the dataset.
# Defaults to `datasets/transfusion.data` relative to the working directory
# (assumes the notebook is launched from the repository root -- adjust if not).
# Set the TRANSFUSION_DATA environment variable to point somewhere else rather
# than hard-coding a machine-specific absolute path in the notebook.
DATA_PATH = Path(os.environ.get("TRANSFUSION_DATA", Path("datasets") / "transfusion.data"))

# Read in dataset
transfusion = pd.read_csv(DATA_PATH)

# Print out the first rows of our dataset
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [2]:
# Print a concise summary of transfusion DataFrame:
# column names, non-null counts and dtypes -- a quick check for missing
# values and non-numeric columns before any modelling.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [3]:
# Rename target column as 'target' for brevity.
# Assign the result instead of using `inplace=True`: the cell stays a plain
# "inputs -> new value" expression and remains chainable.
transfusion = transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)

# Print out the first 2 rows
transfusion.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1


In [4]:
# Print target incidence proportions, rounding output to 3 decimal places
# (normalize=True returns class shares instead of raw counts)
transfusion.target.value_counts(normalize=True).round(3)

target
0    0.762032
1    0.237968
Name: proportion, dtype: float64

In [5]:
# 1. Import required modules
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

In [6]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


In [7]:
# Separate the label column from the predictors.
# `transfusion` must already have its label column renamed to 'target'.
target_col = 'target'
y = transfusion[target_col]
X = transfusion.drop(target_col, axis=1)


In [8]:
# Hold out 25% of the rows for evaluation.
# `stratify=y` keeps the class proportions of the target the same in both
# partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [9]:
# Configure the TPOT search: 5 generations, 20 candidate pipelines per
# generation, seeded for repeatability. Everything else is left at the
# library defaults.
tpot = TPOTClassifier(generations=5, population_size=20, random_state=42)


In [10]:
# Fit TPOT (will use default scoring internally)
# No scoring argument was passed when `tpot` was constructed, so the search
# is driven by whatever metric the installed TPOT version defaults to.
# This is the slow cell of the notebook (several minutes in the recorded run).
tpot.fit(X_train, y_train)


Generation: 100%|████████████████████████████████████████████████████████████████████████| 5/5 [05:42<00:00, 68.57s/it]


In [None]:
# Alternative TPOT configuration with the wall-clock limit switched off.
#
# It is deliberately bound to its own name. This cell sits between
# `tpot.fit(...)` and `tpot.predict_proba(...)`; rebinding `tpot` here would
# replace the fitted estimator with an unfitted one and break every
# downstream cell on "Restart & Run All".
# To use this configuration, fit it explicitly
# (`tpot_no_time_limit.fit(X_train, y_train)`) before predicting with it.
tpot_no_time_limit = TPOTClassifier(
    generations=5,
    population_size=20,
    random_state=42,
    max_time_mins=None,  # Disable max time stopping
)


In [11]:
# Score the held-out rows with the fitted TPOT pipeline.
# predict_proba yields one column per class; index 1 is kept as the score for
# the positive class (assumes the usual 0/1 class ordering of the target).
test_class_probabilities = tpot.predict_proba(X_test)
y_proba = test_class_probabilities[:, 1]

In [12]:
# ROC AUC of the TPOT pipeline on the test split, computed from the
# positive-class probabilities rather than hard predictions.
auc = roc_auc_score(y_true=y_test, y_score=y_proba)
print(f"ROC AUC: {auc:.4f}")


ROC AUC: 0.7637


In [13]:
# List every stage of the pipeline TPOT selected, numbered from 1.
print("\nBest pipeline steps:")
# `steps` holds (name, estimator) pairs; only the estimator repr is shown.
fitted_stages = (stage for _, stage in tpot.fitted_pipeline_.steps)
for i, step in enumerate(fitted_stages, start=1):
    print(f"{i}. {step}")



Best pipeline steps:
1. Normalizer()
2. SelectFromModel(estimator=ExtraTreesClassifier(bootstrap=True,
                                               class_weight='balanced',
                                               criterion='entropy',
                                               max_features=0.1035557631159,
                                               min_samples_leaf=11,
                                               min_samples_split=13, n_jobs=1,
                                               random_state=42),
                threshold=0.027324438513)
3. FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()),
                               ('passthrough', Passthrough())])
4. FeatureUnion(transformer_list=[('skiptransformer', SkipTransformer()),
                               ('passthrough', Passthrough())])
5. MultinomialNB(alpha=0.0106940790452, fit_prior=False)


In [14]:
# X_train's variance, rounding the output to 3 decimal places.
# Used to spot features whose variance is orders of magnitude larger than the
# rest (here 'Monetary (c.c. blood)'), which motivates the normalization below.
X_train.var().round(3)



Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [15]:
# numpy supplies the log transform used for normalization
import numpy as np

# Work on copies so the original train/test feature frames stay untouched
X_train_normed = X_train.copy()
X_test_normed = X_test.copy()


In [16]:
# Specify which column to normalize:
# the feature with by far the largest variance in X_train.
col_to_normalize = 'Monetary (c.c. blood)'

In [17]:
# Log normalization
# Both normalized frames are rebuilt from the untouched X_train / X_test, so
# this cell can be re-executed safely. Mutating the frames in place (add the
# log column, then drop the source column) raises KeyError on a second run
# because the source column is already gone.
X_train_normed, X_test_normed = (
    df_
    # Add log normalized column (appended as the last column)
    .assign(monetary_log=np.log(df_[col_to_normalize]))
    # Drop the original column
    .drop(columns=col_to_normalize)
    for df_ in (X_train, X_test)
)


In [18]:
# Check the variance for X_train_normed:
# confirms the log-transformed column no longer dwarfs the other features.
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

In [19]:
# Linear models live in sklearn.linear_model
from sklearn import linear_model

# Build the logistic regression baseline and train it on the
# log-normalized training features in one expression (fit returns the estimator).
logreg = linear_model.LogisticRegression(solver='liblinear', random_state=42).fit(
    X_train_normed, y_train
)

# AUC score for the logistic regression model on the held-out split,
# computed from the predicted probability of the positive class.
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.7891


In [22]:
from operator import itemgetter

# Collect (model name, test AUC) pairs, then order them best-first.
sorted_models = [('tpot', auc), ('logreg', logreg_auc_score)]
sorted_models.sort(key=itemgetter(1), reverse=True)  # index 1 is the AUC

print(sorted_models)


[('logreg', 0.7890972663699937), ('tpot', 0.7637476160203432)]
