In [None]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import TomekLinks 
from collections import Counter

In [None]:
# Location of the processed no-show CSV (presumably a Google Drive folder mounted in Colab).
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/Cancellation/data/noshow_processed_data.csv'

# The CSV column labelled 'Unnamed: 0' becomes the frame's index.
dataset = pd.read_csv(DATA_PATH, index_col='Unnamed: 0')
dataset.head()

Unnamed: 0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,NoShow,BookHour,DaysToAppointment,AppointmentWeekDay
0,29872499824296,0,2016-04-29,2016-04-29,62,0,0,1,0,0,0,0,0,19,0,4
1,558997776694438,1,2016-04-29,2016-04-29,56,0,0,0,0,0,0,0,0,16,0,4
2,4262962299951,0,2016-04-29,2016-04-29,62,1,0,0,0,0,0,0,0,16,0,4
3,867951213174,0,2016-04-29,2016-04-29,8,2,0,0,0,0,0,0,0,17,0,4
4,8841186448183,0,2016-04-29,2016-04-29,56,0,0,1,1,0,0,0,0,16,0,4


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110521 entries, 0 to 110526
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   PatientId           110521 non-null  int64 
 1   Gender              110521 non-null  int64 
 2   ScheduledDay        110521 non-null  object
 3   AppointmentDay      110521 non-null  object
 4   Age                 110521 non-null  int64 
 5   Neighbourhood       110521 non-null  int64 
 6   Scholarship         110521 non-null  int64 
 7   Hypertension        110521 non-null  int64 
 8   Diabetes            110521 non-null  int64 
 9   Alcoholism          110521 non-null  int64 
 10  Handicap            110521 non-null  int64 
 11  SMSReceived         110521 non-null  int64 
 12  NoShow              110521 non-null  int64 
 13  BookHour            110521 non-null  int64 
 14  DaysToAppointment   110521 non-null  int64 
 15  AppointmentWeekDay  110521 non-null  int64 
dtypes:

In [None]:
from collections import Counter

In [None]:
# Target labels: the NoShow column, later passed as the target to fit_resample.
y = dataset.loc[:, "NoShow"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: NoShow, dtype: int64

In [None]:
# Encode both date columns as float nanoseconds since the Unix epoch
# (presumably so that every feature handed to the samplers is numeric).
#
# The dtype strings are spelled out on purpose:
#   * 'datetime64[ns]' - pandas 2.x rejects the unit-less 'datetime64' cast.
#   * 'int64'          - plain `int` follows the platform's default integer
#                        width, which may be too narrow for nanosecond
#                        timestamps on some platforms.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    dataset[date_col] = (
        dataset[date_col]
        .astype('datetime64[ns]')
        .astype('int64')
        .astype(float)
    )

In [None]:
# Feature matrix: every column of the frame except the NoShow target.
# NOTE(review): X still contains the PatientId identifier and the
# nanosecond-scale date columns. TomekLinks works on nearest-neighbour
# distances, so these large-magnitude columns likely dominate the result -
# consider dropping or scaling them; confirm with the analysis owner.
feature_columns = [name for name in dataset.columns if name != 'NoShow']
X = dataset[feature_columns]
X.head()

Unnamed: 0,PatientId,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hypertension,Diabetes,Alcoholism,Handicap,SMSReceived,BookHour,DaysToAppointment,AppointmentWeekDay
0,29872499824296,0,1.461888e+18,1.461888e+18,62,0,0,1,0,0,0,0,19,0,4
1,558997776694438,1,1.461888e+18,1.461888e+18,56,0,0,0,0,0,0,0,16,0,4
2,4262962299951,0,1.461888e+18,1.461888e+18,62,1,0,0,0,0,0,0,16,0,4
3,867951213174,0,1.461888e+18,1.461888e+18,8,2,0,0,0,0,0,0,17,0,4
4,8841186448183,0,1.461888e+18,1.461888e+18,56,0,0,1,1,0,0,0,16,0,4


In [None]:
from matplotlib import pyplot

In [None]:
# One TomekLinks under-sampler per sampling strategy, so their effect on the
# class distribution can be compared side by side.

# 'auto' (the library default) is equivalent to 'not minority':
# every class except the smallest one is resampled.
TomekLinksAuto = TomekLinks(sampling_strategy='auto')

# 'majority': only the largest class is resampled.
TomekLinksMajority = TomekLinks(sampling_strategy='majority')

# 'not majority': every class except the largest one is resampled.
TomekLinksNotMajority = TomekLinks(sampling_strategy='not majority')

# 'all': every class is resampled.
TomekLinksAll = TomekLinks(sampling_strategy='all')

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
def _resample_and_report(sampler, features, target, label):
    """Resample with ``sampler``, print the resulting class counts, return everything.

    Parameters
    ----------
    sampler : object
        Any object exposing ``fit_resample(features, target)``.
    features, target
        Passed unchanged to ``sampler.fit_resample``.
    label : str
        Text printed in front of the class counts.

    Returns
    -------
    tuple
        ``(resampled_features, resampled_target, class_counts)`` where
        ``class_counts`` is a ``Counter`` over the resampled target.
    """
    resampled_features, resampled_target = sampler.fit_resample(features, target)
    class_counts = Counter(resampled_target)
    print(label, class_counts)
    return resampled_features, resampled_target, class_counts


# Baseline: class distribution before any under-sampling.
counter = Counter(y)
print("raw count: ",  counter)

# Apply each sampling strategy to the same (X, y) and report the new class
# distribution. `counter` is rebound on every call, so after this cell it
# holds the counts of the last ('all') strategy, as in the original cell.
XAuto, yAuto, counter = _resample_and_report(
    TomekLinksAuto, X, y, "auto/not minority count: ")
XMajority, yMajority, counter = _resample_and_report(
    TomekLinksMajority, X, y, "Majority: ")
XNotMajority, yNotMajority, counter = _resample_and_report(
    TomekLinksNotMajority, X, y, "Not Majority: ")
XAll, yAll, counter = _resample_and_report(
    TomekLinksAll, X, y, "All: ")

raw count:  Counter({0: 88207, 1: 22314})
auto/not minority count:  Counter({0: 78925, 1: 22314})
Majority:  Counter({0: 78925, 1: 22314})
Not Majority:  Counter({0: 88207, 1: 13032})
All:  Counter({0: 78925, 1: 13032})


Bibliography:


*   https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.TomekLinks.html
*   https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/under-sampling/plot_illustration_tomek_links.html#sphx-glr-auto-examples-under-sampling-plot-illustration-tomek-links-py


*   https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/



