# **Data Balancing**

In [201]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv("C:\\Users\\varun\\Downloads\\car_evaluation.csv")
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5,6,med,med,good
1724,low,low,5,6,med,high,vgood
1725,low,low,5,6,big,low,unacc
1726,low,low,5,6,big,med,good


In [178]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   int64 
 3   persons   1728 non-null   int64 
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   outcome   1728 non-null   object
dtypes: int64(2), object(5)
memory usage: 94.6+ KB


In [179]:
df.shape

(1728, 7)

In [180]:
df['outcome'].value_counts() #minority classes--> good and vgood

outcome
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [181]:
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
outcome     0
dtype: int64

In [182]:
print(df.dtypes)


buying      object
maint       object
doors        int64
persons      int64
lug_boot    object
safety      object
outcome     object
dtype: object


# X and Y declaration

In [183]:
x=df.iloc[ :, :-1]
y=df['outcome']

# Label Encoding

In [184]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
categorical_columns = x.select_dtypes(include=['object'])
for col in categorical_columns:
    x[col]=le.fit_transform(x[col])



In [185]:
x

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2
...,...,...,...,...,...,...
1723,1,1,5,6,1,2
1724,1,1,5,6,1,0
1725,1,1,5,6,0,1
1726,1,1,5,6,0,2


# Train/Test Split

In [186]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=0)

In [187]:
x_train.shape,y_train.value_counts()

((1209, 6),
 outcome
 unacc    847
 acc      269
 vgood     49
 good      44
 Name: count, dtype: int64)

# Model Building

In [188]:

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train,y_train)

In [189]:
y_pred = model.predict(x_test)
y_pred_train  = model.predict(x_train)

In [190]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))
accuracy_score(y_train,y_pred_train)

0.8959537572254336


0.9536807278742763

In [191]:
print(classification_report(y_test,y_pred))
pd.crosstab(y_test,y_pred)

              precision    recall  f1-score   support

         acc       0.80      0.73      0.76       115
        good       0.70      0.28      0.40        25
       unacc       0.93      0.99      0.96       363
       vgood       0.93      0.88      0.90        16

    accuracy                           0.90       519
   macro avg       0.84      0.72      0.76       519
weighted avg       0.89      0.90      0.89       519



col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,84,3,27,1
good,16,7,2,0
unacc,3,0,360,0
vgood,2,0,0,14


# Predictive analysis

In [192]:
print(model.predict([[1,1,5,6,0,2]]))
print(model.predict([[1,3,3,1,4,0]]))
print(model.predict([[1,2,1,2,2,1]]))

['good']
['unacc']
['unacc']




# Balancing Technique

In [193]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_train_smote,y_train_smote = smote.fit_resample(x_train,y_train)
y_train.value_counts()

outcome
unacc    847
acc      269
vgood     49
good      44
Name: count, dtype: int64

In [194]:
y_train_smote.value_counts()

outcome
unacc    847
vgood    847
acc      847
good     847
Name: count, dtype: int64

# Model building with balanced data

In [195]:
# model building after balancing
model1 = KNeighborsClassifier()
model1.fit(x_train_smote,y_train_smote)
y_pred1 = model1.predict(x_test)
y_pred1_train = model1.predict(x_train_smote)

In [196]:
accuracy_score(y_test,y_pred1)

0.8304431599229287

In [197]:
accuracy_score(y_train_smote,y_pred1_train)

0.9613341204250295

In [198]:
pd.crosstab(y_test,y_pred1)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,89,16,7,3
good,8,17,0,0
unacc,44,5,313,1
vgood,2,2,0,12


In [199]:
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

         acc       0.62      0.77      0.69       115
        good       0.42      0.68      0.52        25
       unacc       0.98      0.86      0.92       363
       vgood       0.75      0.75      0.75        16

    accuracy                           0.83       519
   macro avg       0.69      0.77      0.72       519
weighted avg       0.87      0.83      0.84       519



In [200]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         acc       0.80      0.73      0.76       115
        good       0.70      0.28      0.40        25
       unacc       0.93      0.99      0.96       363
       vgood       0.93      0.88      0.90        16

    accuracy                           0.90       519
   macro avg       0.84      0.72      0.76       519
weighted avg       0.89      0.90      0.89       519

