# Handling Data Imbalance in Classification Models

- Building a model to predict who will donate (TargetB) and how much they will give (TargetD) (will be used for lab on Friday). You will be using files_for_lab/learningSet.csv file which you have already downloaded from class.

Begin the Modeling here:

- Look critically at the dtypes of numerical and categorical columns and make changes where appropriate.

In [50]:
#import libraries

import pandas as pd
import numpy as np
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.impute import SimpleImputer

In [2]:
numerical = pd.read_csv('numerical.csv')
numerical.head()
#numerical.shape

Unnamed: 0.1,Unnamed: 0,ODATEDW,TCODE,DOB,AGE,NUMCHLD,INCOME,WEALTH1,HIT,MBCRAFT,...,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,TARGET_B,TARGET_D,HPHONE_D,RFA_2F,CLUSTER2
0,0,8901,0,3712,60.0,,5.0,5.0,0,,...,8911,9003.0,4.0,7.741935,95515,0,0.0,0,4,39.0
1,1,9401,1,5202,46.0,1.0,6.0,9.0,16,0.0,...,9310,9504.0,18.0,15.666667,148535,0,0.0,0,2,1.0
2,2,9001,1,0,61.611649,,3.0,1.0,2,0.0,...,9001,9101.0,12.0,7.481481,15078,0,0.0,1,4,60.0
3,3,8701,0,2801,70.0,,1.0,4.0,2,0.0,...,8702,8711.0,9.0,6.8125,172556,0,0.0,1,4,41.0
4,4,8601,0,2001,78.0,1.0,3.0,2.0,60,1.0,...,7903,8005.0,14.0,6.864865,7112,0,0.0,1,2,26.0


In [3]:
# numerical.dtypes
numerical.dtypes

Unnamed: 0      int64
ODATEDW         int64
TCODE           int64
DOB             int64
AGE           float64
               ...   
TARGET_B        int64
TARGET_D      float64
HPHONE_D        int64
RFA_2F          int64
CLUSTER2      float64
Length: 385, dtype: object

In [4]:
# Drop Unnamed
numerical = numerical.drop('Unnamed: 0', axis=1)

In [5]:
# Drop NUMCHLD
numerical = numerical.drop('NUMCHLD', axis=1)

In [6]:
#changing type of AGE
numerical['AGE'] = numerical['AGE'].astype(int)

In [7]:
numerical.dtypes


ODATEDW       int64
TCODE         int64
DOB           int64
AGE           int64
INCOME      float64
             ...   
TARGET_B      int64
TARGET_D    float64
HPHONE_D      int64
RFA_2F        int64
CLUSTER2    float64
Length: 383, dtype: object

In [8]:
numerical.isna().sum()
#we don't have nulls!!

ODATEDW     0
TCODE       0
DOB         0
AGE         0
INCOME      0
           ..
TARGET_B    0
TARGET_D    0
HPHONE_D    0
RFA_2F      0
CLUSTER2    0
Length: 383, dtype: int64

In [9]:
categorical = pd.read_csv('categorical.csv')
categorical.head()

Unnamed: 0,STATE,NOEXCH,MDMAUD,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2,DOMAIN_A,DOMAIN_B
0,IL,0,XXXX,36,U,F,Not matched,L,E,X,X,X,C,T,2
1,CA,0,XXXX,14,H,M,3,L,G,X,X,X,A,S,1
2,NC,0,XXXX,43,U,M,3,L,E,X,X,X,C,R,2
3,CA,0,XXXX,44,U,F,3,L,E,X,X,X,C,R,2
4,FL,0,XXXX,16,H,F,3,L,F,X,X,X,A,S,2


In [10]:
# categorical.dtypes
categorical.dtypes

STATE       object
NOEXCH      object
MDMAUD      object
CLUSTER      int64
HOMEOWNR    object
GENDER      object
DATASRCE    object
RFA_2R      object
RFA_2A      object
MDMAUD_R    object
MDMAUD_F    object
MDMAUD_A    object
GEOCODE2    object
DOMAIN_A    object
DOMAIN_B     int64
dtype: object

In [11]:
# Drop NOEXCH
categorical = categorical.drop('NOEXCH', axis=1)

In [12]:
categorical['DOMAIN_B'] = categorical['DOMAIN_B'].astype(str)

In [13]:
categorical['CLUSTER'] = categorical['CLUSTER'].astype(str)

In [14]:
categorical.dtypes


STATE       object
MDMAUD      object
CLUSTER     object
HOMEOWNR    object
GENDER      object
DATASRCE    object
RFA_2R      object
RFA_2A      object
MDMAUD_R    object
MDMAUD_F    object
MDMAUD_A    object
GEOCODE2    object
DOMAIN_A    object
DOMAIN_B    object
dtype: object

In [15]:
categorical.isna().sum()
#we don't have nulls!!

STATE       0
MDMAUD      0
CLUSTER     0
HOMEOWNR    0
GENDER      0
DATASRCE    0
RFA_2R      0
RFA_2A      0
MDMAUD_R    0
MDMAUD_F    0
MDMAUD_A    0
GEOCODE2    0
DOMAIN_A    0
DOMAIN_B    0
dtype: int64

- Concatenate numerical and categorical back together again for your X dataframe. Designate the Target B as y

In [16]:
donors = pd.concat([numerical, categorical], axis=1)
donors.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MBCRAFT,MBGARDEN,MBBOOKS,...,GENDER,DATASRCE,RFA_2R,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,GEOCODE2,DOMAIN_A,DOMAIN_B
0,8901,0,3712,60,5.0,5.0,0,,,,...,F,Not matched,L,E,X,X,X,C,T,2
1,9401,1,5202,46,6.0,9.0,16,0.0,0.0,3.0,...,M,3,L,G,X,X,X,A,S,1
2,9001,1,0,61,3.0,1.0,2,0.0,0.0,1.0,...,M,3,L,E,X,X,X,C,R,2
3,8701,0,2801,70,1.0,4.0,2,0.0,0.0,0.0,...,F,3,L,E,X,X,X,C,R,2
4,8601,0,2001,78,3.0,2.0,60,1.0,0.0,9.0,...,F,3,L,F,X,X,X,A,S,2


In [17]:
donors['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [18]:
X = donors.drop(columns=['TARGET_B','TARGET_D'])
y = donors['TARGET_B']

- Split the data into a training set and a test set.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [20]:
# Check the number of rows

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(76329, 395)
(19083, 395)
(76329,)
(19083,)


- Split further into train_num and train_cat. Also test_num and test_cat.

In [21]:
X_train_num = X_train.select_dtypes(include = np.number)
X_test_num  = X_test.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)
X_test_cat  = X_test.select_dtypes(include = object)

- Scale the features either by using MinMax Scaler or a Standard Scaler. (train_num, test_num)

In [22]:
transformer = MinMaxScaler().fit(X_train_num) 
X_train_scaled_arr = transformer.transform(X_train_num)
X_train_scaled = pd.DataFrame(X_train_scaled_arr, columns=X_train_num.columns)
#X_train_scaled.head()
X_train_scaled.shape

(76329, 381)

In [23]:
transformer = MinMaxScaler().fit(X_test_num) 
X_test_scaled_arr = transformer.transform(X_test_num)
X_test_scaled = pd.DataFrame(X_test_scaled_arr, columns=X_test_num.columns)
#X_test_scaled.head()
X_test_scaled.shape

(19083, 381)

- Encode the categorical features using One-Hot Encoding or Ordinal Encoding. (train_cat, test_cat)
     - fit only on train data, transform both train and test
     - Again re-concatenate train_num and train_cat as X_train as well as test_num and test_cat as X_test

In [24]:
encoder = OneHotEncoder(drop='first')

# Fit and Transform X_train
encoded_train_cat = encoder.fit_transform(X_train_cat).toarray()
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
onehot_encoded_train = pd.DataFrame(encoded_train_cat, columns=cols)

# Transform X_test
encoder.set_params(handle_unknown='ignore')
encoded_test_cat = encoder.transform(X_test_cat).toarray()
onehot_encoded_test = pd.DataFrame(encoded_test_cat, columns=cols)



In [25]:
# Dataframe from X_train
X_train_treated = pd.concat([X_train_scaled, onehot_encoded_train], axis=1)
X_train_treated

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MBCRAFT,MBGARDEN,MBBOOKS,...,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.498208,0.000000,0.000000,0.618557,0.166667,0.666667,0.000000,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.928315,0.000389,0.000000,0.618557,0.666667,0.555556,0.000000,,,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.784946,0.000014,0.185479,0.814433,0.666667,0.555556,0.000000,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.211470,0.000000,0.226674,0.773196,0.166667,0.666667,0.062241,0.0,0.0,0.222222,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.856631,0.000000,0.000000,0.618557,0.666667,0.555556,0.000000,,,,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,0.283154,0.000000,0.267971,0.731959,0.000000,0.000000,0.000000,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76325,0.784946,0.000028,0.133986,0.865979,0.166667,0.444444,0.004149,0.0,0.0,0.000000,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
76326,0.354839,0.000028,0.206076,0.793814,0.666667,0.555556,0.000000,,,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
76327,0.784946,0.000000,0.710711,0.288660,0.833333,0.888889,0.000000,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#X_train_treated.isna().sum()

In [27]:
# Dataframe from X_test
X_test_treated = pd.concat([X_test_scaled, onehot_encoded_test], axis=1)
X_test_treated

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MBCRAFT,MBGARDEN,MBBOOKS,...,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.692308,0.000051,0.000000,0.618557,0.666667,0.555556,0.000000,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.230769,0.000051,0.392276,0.597938,0.333333,0.555556,0.000000,,,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.769231,0.000051,0.535633,0.463918,0.500000,0.444444,0.012448,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.923077,0.000026,0.000000,0.618557,0.666667,0.555556,0.000000,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.307692,0.000000,0.412976,0.577320,0.666667,0.555556,0.000000,,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19078,0.153846,0.000000,0.299794,0.690722,0.666667,0.555556,0.000000,,,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
19079,0.307692,0.000000,0.103090,0.896907,0.166667,0.555556,0.000000,,,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19080,0.615385,0.000026,0.000000,0.618557,0.000000,0.666667,0.000000,,,,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
19081,0.615385,0.000051,0.401751,0.597938,0.333333,0.333333,0.000000,,,,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


- Fit a logistic regression model on the training data.

In [31]:
X_train_treated.isna().sum()

ODATEDW       0
TCODE         0
DOB           0
AGE           0
INCOME        0
             ..
DOMAIN_A_T    0
DOMAIN_A_U    0
DOMAIN_B_2    0
DOMAIN_B_3    0
DOMAIN_B_4    0
Length: 499, dtype: int64

In [37]:
X_train_treated.fillna(0, inplace=True)

In [35]:
classification = LogisticRegression()
classification.fit(X_train_treated, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
predictions = classification.predict(X_train_treated)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
classification.score(X_train_treated, y_train)


0.9492984317887042

In [40]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predictions)

array([[72459,     0],
       [ 3870,     0]])

In [41]:
array = confusion_matrix(y_train, predictions)
print('             Predicted Labels')
print('             |   A    |     B')
print('---------------------------------')
print('True label A | ',array[0][0],'|   ', array[0][1])
print('---------------------------------')
print('           B | ',array[1][0],' |   ', array[1][1])

             Predicted Labels
             |   A    |     B
---------------------------------
True label A |  72459 |    0
---------------------------------
           B |  3870  |    0


In [42]:
y_train.value_counts()

0    72459
1     3870
Name: TARGET_B, dtype: int64

- Check the accuracy on the test data.

In [43]:
# classification.score == accuracy of prediction
(72453+0)/len(predictions)

0.949219824706206

In [44]:
from sklearn.metrics import accuracy_score

y_pred = classification.predict(X_train_treated)
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy on test data: ", accuracy)
print("y_pred is: ", y_pred)

Accuracy on test data:  0.9492984317887042
y_pred is:  [0 0 0 ... 0 0 0]


# Managing imbalance in the dataset

- Check for the imbalance

In [45]:
print("Imbalance Check:")
print(y_train.value_counts())

Imbalance Check:
0    72459
1     3870
Name: TARGET_B, dtype: int64


In [46]:
train_data = pd.concat([X_train_treated, y_train], axis=1)

In [47]:
category_0 = train_data[train_data['TARGET_B'] == 0]
category_1 = train_data[train_data['TARGET_B'] == 1]

- Downsampling (undersampling)

In [48]:
category_0_undersampled = resample(category_0,
                                   replace=False,
                                   n_samples = len(category_1))

In [49]:
print(category_0_undersampled.shape)
print(category_1.shape)

(3870, 500)
(3870, 500)


In [51]:
imputer = SimpleImputer(strategy='mean')
category_0_undersampled_imputed = imputer.fit_transform(category_0_undersampled)

category_1_imputed = imputer.transform(category_1)

In [52]:
data_downsampled = pd.concat([pd.DataFrame(category_0_undersampled_imputed, columns=category_0_undersampled.columns),
                              pd.DataFrame(category_1_imputed, columns=category_1.columns)],
                             axis=0)

In [53]:
y_downsampled = data_downsampled['TARGET_B']
X_downsampled = data_downsampled.drop(['TARGET_B'], axis=1)

In [54]:
X_train_downsampled, X_test_downsampled, y_train_downsampled, y_test_downsampled = train_test_split(X_downsampled, y_downsampled, test_size=0.2)


In [55]:
classification_down = LogisticRegression(max_iter=100)
classification_down.fit(X_train_downsampled, y_train_downsampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
y_pred_downsampled = classification_down.predict(X_train_downsampled)

In [57]:
accuracy_downsampled = accuracy_score(y_train_downsampled, y_pred_downsampled)
print("Accuracy on test data (downsampled):", accuracy_downsampled)
print("y_pred is: ", y_pred_downsampled)

Accuracy on test data (downsampled): 0.583656330749354
y_pred is:  [1. 0. 1. ... 0. 1. 1.]


- Upsampling (oversampling)

In [58]:
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples = len(category_0))

In [59]:
print(category_0.shape)
print(category_1_oversampled.shape)

(72459, 500)
(72459, 500)


In [60]:
category_1_oversampled = resample(category_1, replace=True, n_samples=len(category_0))

imputer = SimpleImputer(strategy='mean')
category_0_imputed = imputer.fit_transform(category_0)
category_1_oversampled_imputed = imputer.transform(category_1_oversampled)

In [62]:
data_oversampled = pd.concat([pd.DataFrame(category_0_imputed, columns=category_0.columns),
                              pd.DataFrame(category_1_oversampled_imputed, columns=category_1_oversampled.columns)],
                             axis=0)

- Each time fit the model and see how the accuracy of the model has changed.

In [64]:
y_oversampled = data_oversampled['TARGET_B']
X_oversampled = data_oversampled.drop(['TARGET_B'], axis=1)

In [65]:
X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled = train_test_split(X_oversampled, y_oversampled, test_size=0.2)

In [66]:
classification_over = LogisticRegression(max_iter=100)
classification_over.fit(X_train_oversampled, y_train_oversampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
y_pred_oversampled = classification_over.predict(X_train_oversampled)

In [68]:
accuracy_oversampled = accuracy_score(y_train_oversampled, y_pred_oversampled)
print("Accuracy on test data (oversampled):", accuracy_oversampled)
print("y_pred is: ", y_pred_oversampled)

Accuracy on test data (oversampled): 0.5574119757793227
y_pred is:  [1. 1. 0. ... 1. 0. 1.]


# Conclusions

We have a very high unbalance in our sample, and when we perform two balance techniques, we opt for low accuracy in both sets, which indicates that we must improve the unbalance in other way.

In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10, 100],
    'fit_intercept': [True, False]
}

In [70]:
model = LogisticRegression(max_iter=100)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

In [71]:
grid_search.fit(X_train_treated, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [72]:
grid_search.best_params_

{'C': 0.1, 'fit_intercept': True, 'penalty': 'l2'}

In [73]:
grid_search.best_score_

0.9492984317538928