### VIDEO 9: HANDLING AN IMBALANCED DATASET

In [2]:
#Importing libraries
import pandas as pd
import numpy as np

# Load the dataset from a CSV file located relative to the notebook's working
# directory (the file name suggests it has already been preprocessed).
data = pd.read_csv(r"Synergix_data_preprocessed_new.csv")

# Build the 'Good_By_Bad_Rating' feature: good votes divided by bad votes.
# The rating counts are still read by column position.
# NOTE(review): assumes positions 4/5 are the 1- and 2-star counts and 7/8 the
# 4- and 5-star counts -- TODO confirm against the CSV header and switch to
# column names.
bad_votes = data.iloc[:, 4] + data.iloc[:, 5]
good_votes = data.iloc[:, 7] + data.iloc[:, 8]

# With no bad votes the ratio is undefined. Masking the denominator yields NaN
# for those rows, so no magic sentinel value is needed. No int() casts are
# applied: for whole-number counts they change nothing.
no_bad_votes = bad_votes == 0
rating_ratio = (good_votes / bad_votes.where(~no_bad_votes)).astype(float)

# If all the ratings are zero then the overall rating ratio is also zero.
rating_ratio = rating_ratio.mask(no_bad_votes & (good_votes == 0), 0.0)

# Good votes but no bad votes (zero denominator): use the largest ratio
# observed in the rest of the data. Series.max() skips the NaN placeholders.
# NOTE(review): this maximum is taken over the whole dataset, i.e. before any
# train/test split.
max_rating = rating_ratio.max()
rating_ratio = rating_ratio.mask(no_bad_votes & (good_votes != 0), max_rating)

# Kept as a plain list under its original name for any later cell that reads it.
Rating_ratio = rating_ratio.tolist()

# Add the column 'Good_By_Bad_Rating' to the dataframe.
data['Good_By_Bad_Rating'] = rating_ratio

# The individual star-count columns are no longer needed once the ratio exists.
data = data.drop(columns=['1_Star_Rating', '2_Star_Rating', '3_Star_Rating', '4_Star_Rating', '5_Star_Rating'])

In [3]:
from sklearn.preprocessing import LabelEncoder
# Replace the 'Segment' labels with integer codes.
# The encoder instance is not kept, so the code -> label mapping is not
# available to later cells.
data['Segment'] = LabelEncoder().fit_transform(data['Segment'])

In [4]:
# Remove the raw sales figure first.
# NOTE(review): presumably the 'Units_sold>1000' target is derived from it, so
# keeping it as a feature would leak the answer -- confirm.
data = data.drop(columns=['Units_sold'])

# Target vector and feature matrix.
y = data['Units_sold>1000']
X = data.drop(columns=['Units_sold>1000'])

In [5]:
# Importing the train-test split from scikit-learn
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; this split is NOT stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=6,
)

In [6]:
# Class proportions in the training part of the non-stratified split.
y_train.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.598137
0,0.401863


In [7]:
# Class proportions in the test part of the non-stratified split.
y_test.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.614261
0,0.385739


In [8]:
# Same 70/30 split and seed as before, but stratified on y.
X_train_st, X_test_st, y_train_st, y_test_st = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=6,
)

In [9]:
# Class proportions in the training part of the stratified split.
y_train_st.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.60294
0,0.39706


In [10]:
# Class proportions in the test part of the stratified split.
y_test_st.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.603056
0,0.396944


In [11]:
from sklearn.tree import DecisionTreeClassifier
# Depth- and leaf-size-limited decision tree. class_weight='balanced' asks
# scikit-learn to re-weight the classes instead of resampling the data.
DT_model = DecisionTreeClassifier(
    max_depth=11,
    min_samples_leaf=6,
    class_weight='balanced',
    random_state=42,
)

In [12]:
# Train the class-weighted tree on the stratified training split
DT_model.fit(X_train_st, y_train_st)

In [13]:
from sklearn.metrics import f1_score

# Predict with the fitted tree on both parts of the stratified split.
y_test_pred = DT_model.predict(X_test_st)
y_train_pred = DT_model.predict(X_train_st)

# Report F1 on the train and test data.
# NOTE(review): f1_score is called with its defaults, which (per the
# scikit-learn docs) score label 1 -- the ~60% majority class in the splits
# above. Consider pos_label=0 or average='macro' if the minority class is the
# one of interest.
print(
    'Train score: ',
    f1_score(y_train_st, y_train_pred),
)
print(
    'Test score: ',
    f1_score(y_test_st, y_test_pred),
)

Train score:  0.8870789957134109
Test score:  0.8261986301369862


---

### Undersampling

In [None]:
#pip install --upgrade scikit-learn imbalanced-learn

In [14]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [15]:
from imblearn.under_sampling import RandomUnderSampler

In [16]:
# Peek at the training features as a raw NumPy array.
X_train.to_numpy()

array([[3.17500000e+03, 9.37300843e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 8.23076923e+00],
       [3.26600000e+03, 1.64978139e+01, 1.00000000e+00, ...,
        9.99049800e+06, 6.00000000e+00, 4.50000000e+01],
       [4.90200000e+03, 2.02381616e+01, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.39716981e+01],
       ...,
       [5.18700000e+03, 1.96472727e+01, 2.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.85118110e+01],
       [3.16500000e+03, 1.98898734e+01, 1.00000000e+00, ...,
        7.53972300e+06, 2.00000000e+00, 1.57222222e+01],
       [3.18500000e+03, 1.68407692e+01, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.81569767e+01]])

In [20]:
# Randomly undersample the training data so both classes end up equally frequent.
# NOTE(review): this resamples the non-stratified split (X_train / y_train),
# whereas the class-weighted tree above was fitted and scored on the stratified
# split, so the two sets of scores are not measured on the same test rows.
sampler = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = sampler.fit_resample(X_train, y_train)

In [21]:
# Class balance after undersampling, shown via the cell's last expression like
# the other value_counts cells in this notebook (no print wrapper).
y_train_rus.value_counts(normalize=True)

Units_sold>1000
0    0.5
1    0.5
Name: proportion, dtype: float64


In [22]:
# Same tree configuration, this time without class weights, fitted on the
# undersampled training data. Rebinding DT_model discards the model fitted
# earlier under that name.
DT_model = DecisionTreeClassifier(
    max_depth=11,
    min_samples_leaf=6,
    random_state=42,
)
DT_model.fit(X_train_rus, y_train_rus)

# Predict on the untouched (non-resampled, non-stratified) test split and on
# the resampled training rows.
y_pred = DT_model.predict(X_test)
y_train_pred = DT_model.predict(X_train_rus)

# F1 on the resampled training data and on the test data.
print(
    'Train F1 Score: ',
    f1_score(y_train_rus, y_train_pred),
)
print(
    'Test F1 Score: ',
    f1_score(y_test, y_pred),
)

Train F1 Score:  0.8667085539897674
Test F1 Score:  0.8229858504187121


### Random oversampling

In [23]:
from imblearn.over_sampling import RandomOverSampler

In [24]:
# Randomly oversample the training data so both classes end up equally frequent.
# Only the (non-stratified) training split is resampled; X_test is untouched.
sampler = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = sampler.fit_resample(X_train, y_train)

In [25]:
# Class balance after random oversampling.
y_train_ros.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.5
0,0.5


In [26]:
# Same tree configuration, without class weights, fitted on the randomly
# oversampled training data. Rebinding DT_model discards the model fitted
# earlier under that name.
DT_model = DecisionTreeClassifier(max_depth=11, min_samples_leaf=6, random_state=42)

DT_model.fit(X_train_ros, y_train_ros)

# Predict once on the resampled training rows and once on the untouched test
# split; there is no need to compute the same predictions a second time.
y_train_pred = DT_model.predict(X_train_ros)
y_pred = DT_model.predict(X_test)

# Printing the F1 score of the train and test data
print('Train F1 Score: ', f1_score(y_train_ros, y_train_pred))
print('Test F1 Score: ', f1_score(y_test, y_pred))

Train F1 Score:  0.8634057971014493
Test F1 Score:  0.829598308668076


### SMOTE (Synthetic Minority Over-sampling Technique)

In [27]:
from imblearn.over_sampling import SMOTE

In [29]:
# SMOTE resampler with a fixed seed so the resampled data is reproducible.
smote = SMOTE(random_state=42)

In [30]:
# Resample only the (non-stratified) training split; the test split is left as is.
X_train_smt, y_train_smt = smote.fit_resample(X_train, y_train)

In [31]:
# Class balance after SMOTE.
y_train_smt.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Units_sold>1000,Unnamed: 1_level_1
1,0.5
0,0.5


In [32]:
# Same tree configuration, without class weights, fitted on the SMOTE-resampled
# training data. Rebinding DT_model discards the model fitted earlier under
# that name.
DT_model = DecisionTreeClassifier(
    max_depth=11,
    min_samples_leaf=6,
    random_state=42,
)
DT_model.fit(X_train_smt, y_train_smt)

# Predict on the untouched test split and on the resampled training rows.
y_pred = DT_model.predict(X_test)
y_train_pred = DT_model.predict(X_train_smt)

# Report F1 on both.
print(
    'Training F1 score: ',
    f1_score(y_train_smt, y_train_pred),
)
print(
    'Testing F1 score: ',
    f1_score(y_test, y_pred),
)

Training F1 score:  0.8627522489666909
Testing F1 score:  0.8321230243485691
