## Operations done on different df's
df_cleaned - cleaned > scaled

df_3 - cleaned > train-test split > scaled > NearMiss undersample

df_4 - cleaned > train-test split > scaled > SMOTE oversample

In [1]:
# For this practical example we will need the following libraries and modules
import numpy as np
import pandas as pd

In [2]:
# Show every column and every row when a DataFrame is displayed (no truncation)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### Split into targets and features


In [2]:
# Load the cleaned dataset from the working directory
df_cleaned = pd.read_csv(filepath_or_buffer="df_cleaned.csv")

In [3]:
# Target column first, then every remaining column as the feature matrix
y = df_cleaned["is_safe"]
X = df_cleaned.drop(columns=["is_safe"])

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=22,
)

#### Normalize the data

In [6]:
# Import the scaling module
from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

# Fit on the training features only (computes the per-feature mean and standard
# deviation). The same fitted scaler is reused to transform the test features in a
# later cell, so no test-set statistics enter the scaling.
scaler.fit(X_train)

StandardScaler()

In [7]:
# Apply the fitted scaler to the training features and wrap the result back into a
# DataFrame that keeps X_train's column names and row index.
X_scaled = pd.DataFrame(
    data=scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

In [28]:
# Sanity check: (rows, columns) of the scaled training features
X_scaled.shape

(6396, 20)

### Create Test df for later

In [36]:
# Scale the test features with the scaler that was fitted on X_train, so the test
# data goes through exactly the same transformation the models are trained on.
X_test_scaled = pd.DataFrame(
    data=scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [38]:
# Reattach the labels to the scaled test features; both share X_test's index,
# so the rows line up one-to-one.
test_df = pd.concat(objs=[X_test_scaled, y_test], axis="columns")
test_df.shape

(1600, 21)

In [40]:
# Class distribution of the held-out test set
test_df.loc[:, "is_safe"].value_counts()

0    1392
1     208
Name: is_safe, dtype: int64

In [42]:
# Persist the scaled test set (without the index column) for later model evaluation
test_df.to_csv(path_or_buf="test_df.csv", index=False)

### Fix Data Imbalance in 4 different ways
1. Near miss before splitting into train and test sets.
2. SMOTE before splitting into train and test sets.
3. Near miss after splitting into train and test sets.
4. SMOTE after splitting into train and test sets.

#### Create a function to display the shape of features and labels.

In [9]:
def display_shape_count(feature_name,label_name,features,label):
    """Print the shapes of a feature set and its labels, then the label class counts.

    Parameters
    ----------
    feature_name : name shown for ``features`` in the printed report.
    label_name : name shown for ``label`` in the printed report.
    features : object exposing ``.shape`` (e.g. a DataFrame).
    label : object exposing ``.shape`` and supporting element-wise ``== 1`` / ``== 0``
        comparisons (e.g. a Series).

    Returns
    -------
    None. The report is written to stdout.
    """
    # Shapes of the features and of the labels (blank line after the second one).
    print(f"Current shape of {feature_name} a shape of {features.shape}")
    print(f"Current shape of {label_name} a shape of {label.shape}\n")

    # Number of labels equal to 1, then equal to 0 (blank line after the second one).
    n_positive = sum(label == 1)
    print(f"Current counts of {label_name} '1': {n_positive}")
    n_negative = sum(label == 0)
    print(f"Current counts of {label_name} '0': {n_negative} \n")


##### Near miss instance 

In [10]:
# Create the NearMiss under-sampler with its default settings.
# It is only instantiated here; the resampling itself happens in a later cell.
from imblearn.under_sampling import NearMiss
nr = NearMiss()

#### 3. Near miss (Undersampling) after splitting into train and test sets.

In [11]:
# Import SMOTE from the imbalanced-learn library (imported as `imblearn`) and create
# the over-sampler with a fixed random_state.
# If the library is missing, install it first, e.g. `pip install imbalanced-learn`.
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 22)


In [12]:
# Report the sizes of the train/test feature and label sets
print("X_scaled shape is {},X_test shape is{}".format(X_scaled.shape, X_test.shape))
print("y_train shape is {},y_test shape is{}".format(y_train.shape, y_test.shape))

X_scaled shape is (6396, 20),X_test shape is(1600, 20)
y_train shape is (6396,),y_test shape is(1600,)


In [29]:
# Shapes and class counts of the training split before resampling
display_shape_count(
    feature_name="X_train", label_name="y_train", features=X_train, label=y_train
)

# Resample the scaled training data with the NearMiss under-sampler
X_3, y_3 = nr.fit_resample(X_scaled, y_train)

# Shapes and class counts after under-sampling
display_shape_count(feature_name="X_3", label_name="y_3", features=X_3, label=y_3)

Current shape of X_train a shape of (6396, 20)
Current shape of y_train a shape of (6396,)

Current counts of y_train '1': 704
Current counts of y_train '0': 5692 

Current shape of X_3 a shape of (1408, 20)
Current shape of y_3 a shape of (1408,)

Current counts of y_3 '1': 704
Current counts of y_3 '0': 704 



#### 4. SMOTE (Oversampling) after splitting into train and test sets

In [30]:
# Shapes and class counts of the training split before resampling
display_shape_count(
    feature_name="X_train", label_name="y_train", features=X_train, label=y_train
)

# Resample the scaled training data with the SMOTE over-sampler
X_4, y_4 = sm.fit_resample(X_scaled, y_train)

# Shapes and class counts after over-sampling
display_shape_count(feature_name="X_4", label_name="y_4", features=X_4, label=y_4)

Current shape of X_train a shape of (6396, 20)
Current shape of y_train a shape of (6396,)

Current counts of y_train '1': 704
Current counts of y_train '0': 5692 

Current shape of X_4 a shape of (11384, 20)
Current shape of y_4 a shape of (11384,)

Current counts of y_4 '1': 5692
Current counts of y_4 '0': 5692 



In [31]:
# Combine the under-sampled features and labels into one frame and preview it
df_3 = pd.concat(objs=[X_3, y_3], axis="columns")
df_3.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,-0.4627,0.417333,-0.360493,-1.030961,-0.632929,-0.67338,-0.64921,-1.078454,-0.411992,-0.971987,...,1.183202,-0.878598,-2.16261,-0.399717,-0.497788,-0.759258,0.716618,-0.677023,-1.287071,0
1,-0.29727,-0.067797,-0.559848,-0.858022,-0.994349,0.328087,-0.501059,0.509699,-0.93982,-0.579268,...,-0.496028,1.596866,-0.138347,-0.399717,1.303104,-0.314944,-0.32992,-0.95642,-0.915519,0
2,-0.517843,-0.447121,-0.360493,-0.858022,-0.910945,-0.818123,-0.575134,-1.108995,-0.870973,-0.820942,...,0.429262,-1.371156,-2.214962,-0.736909,-0.920961,-1.013768,0.018926,-0.95642,-0.915519,0
3,-0.4627,-0.824194,-0.519977,-1.278017,-0.076898,-0.841595,-0.575134,-1.21589,-0.802126,-0.971987,...,0.257912,-0.779,-1.464588,-0.062525,-0.774805,-0.224356,0.367772,-0.677023,0.570689,0
4,-0.454822,1.325685,-0.400364,-0.643906,0.201118,-0.829859,-0.612172,-0.803581,0.781358,-0.971987,...,-0.821594,-1.615624,-1.813599,0.949051,-0.906799,0.219958,0.367772,-0.397626,-0.915519,0


In [32]:
# Combine the over-sampled features and labels into one frame and preview it
df_4 = pd.concat(objs=[X_4, y_4], axis="columns")
df_4.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,-0.4627,-1.607606,-0.599719,-0.141559,1.035165,-0.716411,-0.908475,0.52497,-0.549686,1.021816,...,-0.204733,-0.215818,0.926137,-1.411293,-0.921528,0.228586,-1.725306,-0.607174,-0.915519,0
1,-0.478455,1.662237,-0.280751,-0.001561,1.035165,-0.802475,-0.612172,0.586052,-0.893922,1.565581,...,-0.615974,-0.480206,-0.57461,-0.062525,-0.762909,-0.220042,0.716618,-0.746872,0.570689,0
2,-0.446944,1.075804,-0.559848,0.994899,-1.105556,-0.423013,1.091568,0.830384,0.597766,-0.851151,...,-0.564569,0.731269,-0.033643,-1.748485,-0.230969,-0.211414,0.018926,0.580265,-1.658623,0
3,2.987691,1.546301,-0.559848,1.950182,-1.077754,1.153515,0.498962,-0.437084,-1.261107,1.112444,...,-0.615974,0.490422,0.07106,-0.399717,1.770464,1.604664,-0.678767,-0.048379,0.942241,0
4,2.948303,1.351573,3.14815,2.123122,-0.632929,2.2919,1.128605,1.242692,1.630473,2.018718,...,1.063257,-1.4617,0.140862,-0.736909,0.384813,2.139566,-1.027613,1.139059,-0.172415,0


In [33]:
# Class balance of each resampled frame (df_3 first, then df_4)
for resampled_frame in (df_3, df_4):
    print(resampled_frame["is_safe"].value_counts())

0    704
1    704
Name: is_safe, dtype: int64
0    5692
1    5692
Name: is_safe, dtype: int64


In [34]:
# Write each resampled frame to its own CSV: df_3.csv and df_4.csv
dfs = [df_3, df_4]

for file_number, frame in enumerate(dfs, start=3):
    frame.to_csv(f"df_{file_number}.csv", index=False)

In [35]:
# Preview the first and last rows of df_3 instead of dumping the whole frame:
# this notebook sets `display.max_rows = None`, so a bare `df_3` would render
# every row on a fresh run. The row cap applies only inside this `with` block.
with pd.option_context("display.max_rows", 10):
    display(df_3)

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,-0.462700,0.417333,-0.360493,-1.030961,-0.632929,-0.673380,-0.649210,-1.078454,-0.411992,-0.971987,...,1.183202,-0.878598,-2.162610,-0.399717,-0.497788,-0.759258,0.716618,-0.677023,-1.287071,0
1,-0.297270,-0.067797,-0.559848,-0.858022,-0.994349,0.328087,-0.501059,0.509699,-0.939820,-0.579268,...,-0.496028,1.596866,-0.138347,-0.399717,1.303104,-0.314944,-0.329920,-0.956420,-0.915519,0
2,-0.517843,-0.447121,-0.360493,-0.858022,-0.910945,-0.818123,-0.575134,-1.108995,-0.870973,-0.820942,...,0.429262,-1.371156,-2.214962,-0.736909,-0.920961,-1.013768,0.018926,-0.956420,-0.915519,0
3,-0.462700,-0.824194,-0.519977,-1.278017,-0.076898,-0.841595,-0.575134,-1.215890,-0.802126,-0.971987,...,0.257912,-0.779000,-1.464588,-0.062525,-0.774805,-0.224356,0.367772,-0.677023,0.570689,0
4,-0.454822,1.325685,-0.400364,-0.643906,0.201118,-0.829859,-0.612172,-0.803581,0.781358,-0.971987,...,-0.821594,-1.615624,-1.813599,0.949051,-0.906799,0.219958,0.367772,-0.397626,-0.915519,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1403,1.041923,-0.764538,-0.480106,-0.026266,-1.161159,-0.184382,0.387849,-1.170078,1.217390,-0.881360,...,0.172237,-1.593894,-0.417555,0.949051,1.599382,0.086233,0.716618,-0.118228,-0.172415,1
1404,0.553511,0.634572,-0.559848,0.731372,-1.133357,1.552536,0.387849,1.349587,1.332135,0.931189,...,0.103697,1.484592,-0.085995,1.623435,0.283410,0.530547,-0.329920,-0.188078,0.199137,1
1405,2.239319,-1.513056,-0.440235,-1.022726,-1.049953,-0.063111,1.017492,0.066849,1.332135,-0.971987,...,-0.701649,-0.568939,0.123412,-0.736909,2.305236,-0.522003,-0.678767,-0.746872,-0.543967,1
1406,3.074345,0.430840,-0.635602,0.171379,-1.161159,2.237133,0.424887,1.593918,-0.756228,-0.971987,...,-0.907269,0.724025,0.088511,-1.074101,-0.653575,0.448586,0.018926,2.186799,-1.658623,1


In [60]:
# Build the scaled version of the *whole* cleaned dataset.
# X_scaled only holds the training rows, while y holds the labels of every row;
# concatenating those two aligns on the index and leaves the held-out rows with
# all-NaN features. Instead, transform the full feature matrix X with the scaler
# that was fitted on X_train, so every label gets its matching scaled feature row.
X_all_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
df_scaled_inputs = pd.concat([X_all_scaled, y], axis=1)

# One row per original sample, all feature columns plus the target column.
assert df_scaled_inputs.shape == (len(X), X.shape[1] + 1)

df_scaled_inputs.head()

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,viruses,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,0.777403,-0.585491,-0.480875,1.054205,-0.993241,-0.711941,2.152993,-0.97305,-1.657446,-0.363345,-0.869387,-0.78107,1.129767,-0.348629,0.609178,1.203362,1.661842,1.053694,1.338736,-0.91704,1
1,1.306945,0.775118,-0.599632,1.432447,-1.131948,1.208552,1.598765,-0.223303,0.294799,1.00246,0.849782,0.009785,-1.409197,1.04696,-0.739005,0.892977,0.124811,1.053694,0.851134,0.198023,1
2,0.271572,-0.029083,-0.480875,-0.81234,-0.965499,0.803417,1.044536,-1.202564,0.501507,-0.818614,-0.861452,-0.36845,0.783298,-0.383518,0.272132,1.911763,1.786699,0.706129,2.03531,-1.288728,0
3,0.548198,-0.332067,-0.480875,1.144654,-1.15969,1.968179,-0.802892,1.306793,0.708215,1.184567,1.008475,-1.434385,-1.517469,-0.069511,-0.40196,-0.415275,-0.516695,-1.0317,2.104968,0.198023,1
4,0.200439,1.132165,-0.52046,-1.124801,-1.020982,0.19182,1.635713,-0.361012,-0.371261,-0.575804,-0.866742,0.302058,-0.555658,-0.383518,-0.739005,0.024578,-0.219622,-1.0317,-0.611672,-0.91704,1


In [62]:
# Persist the scaled dataset (without the index column)
df_scaled_inputs.to_csv(path_or_buf="df_scaled_inputs.csv", index=False)