# Imbalanced Dataset

* When the minority class is represented very scarcily in the data then it might lead to problems. 
So we need to balance the data

**SMOTE ->** **Synthetic Minority Oversampling Technique**

In [1]:
import pandas as pd 

from imblearn.over_sampling import SMOTE 

import warnings
warnings.filterwarnings('ignore')

In [2]:
 df = pd.read_csv('card_transdata.csv')

In [3]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [None]:
# df.dropna(inplace= True) in this dataset not null value found of found then use this

In [4]:
df.shape

(1000000, 8)

In [5]:
df.groupby('fraud').count()

Unnamed: 0_level_0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,912597,912597,912597,912597,912597,912597,912597
1.0,87403,87403,87403,87403,87403,87403,87403


In [6]:
df.groupby('fraud').count().sum()

distance_from_home                1000000
distance_from_last_transaction    1000000
ratio_to_median_purchase_price    1000000
repeat_retailer                   1000000
used_chip                         1000000
used_pin_number                   1000000
online_order                      1000000
dtype: int64

In [7]:
x = df.drop('fraud', axis=1)
y = df['fraud']

In [10]:
# split into train & test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.2, random_state=42)

In [16]:
# Apply SMOTE

smote = SMOTE(random_state=40)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train,y_train)


In [17]:
df.isnull().sum() # if null found then use df.dropna(inplace=True)

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [18]:
x_train_resampled.shape

(1460080, 7)

In [19]:
x_train.shape

(800000, 7)

In [20]:
len(y_train)

800000

In [21]:
len(y_train_resampled)

1460080

In [24]:
y_train_resampled.value_counts()

fraud
0.0    730040
1.0    730040
Name: count, dtype: int64