In [27]:
import pandas as pd

df = pd.read_excel("Raisin_Dataset.xlsx")
df.head()

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.04,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.68413,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.19077,290.827533,0.564011,81463,0.792772,1073.251,Kecimen


In [28]:
X = df.drop("Class",axis=1)
y = df["Class"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2,random_state=10)

# Scale the Data

In [64]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = StandardScaler()
scaler.fit(X)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [66]:
X_train_scaled

array([[ 0.98369528,  0.41344215,  1.83005975, ...,  1.06982849,
        -1.39598956,  1.40080884],
       [ 1.42589144,  0.33065747,  2.8586639 , ...,  1.35328748,
         0.94973473,  0.93602325],
       [ 0.10158614,  0.02447351,  0.3162051 , ...,  0.05629692,
        -0.12319515, -0.00300658],
       ...,
       [ 1.94040593,  1.41267887,  1.94025974, ...,  1.87217696,
         0.09689362,  1.66678384],
       [-0.85910087, -0.68228209, -1.1222849 , ..., -0.86451533,
        -0.81612799, -0.8599019 ],
       [-0.22886166, -0.30895625,  0.04957521, ..., -0.25064868,
        -0.39412057, -0.26752331]])

# Model with rbf kernel with no scaling

In [30]:
from sklearn.svm import SVC

model = SVC(kernel = "rbf")
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [32]:
model.n_iter_

array([229], dtype=int32)

In [33]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       Besni       0.86      0.75      0.80        83
     Kecimen       0.81      0.90      0.85        97

    accuracy                           0.83       180
   macro avg       0.83      0.82      0.82       180
weighted avg       0.83      0.83      0.83       180



# Model with rbf kernel with scaling

In [72]:
from sklearn.svm import SVC

model = SVC(kernel = "rbf")
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

In [74]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)                                             # 83 to 88

              precision    recall  f1-score   support

       Besni       0.91      0.83      0.87        83
     Kecimen       0.87      0.93      0.90        97

    accuracy                           0.88       180
   macro avg       0.89      0.88      0.88       180
weighted avg       0.88      0.88      0.88       180



In [76]:
model.n_iter_

array([419], dtype=int32)

# Model with Linear Kernel with no scaling

##takes more iterations and training time but overall accuracy improved

In [35]:
model = SVC(kernel = "linear")
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)

In [37]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       Besni       0.90      0.87      0.88        83
     Kecimen       0.89      0.92      0.90        97

    accuracy                           0.89       180
   macro avg       0.90      0.89      0.89       180
weighted avg       0.89      0.89      0.89       180



In [38]:
model.n_iter_

array([60953539], dtype=int32)

In [50]:
df.shape

(900, 8)

In [52]:
X_train.shape

(720, 7)

In [None]:
# for 720 records it is taking more training time, for accuracy from 83 to 89

# Model with Linear Kernel with scaling

In [82]:
from sklearn.svm import SVC

model = SVC(kernel = "linear")
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

report = classification_report(y_test, y_pred)
print(report) 

              precision    recall  f1-score   support

       Besni       0.90      0.84      0.87        83
     Kecimen       0.87      0.92      0.89        97

    accuracy                           0.88       180
   macro avg       0.88      0.88      0.88       180
weighted avg       0.88      0.88      0.88       180



In [84]:
model.n_iter_

array([2164], dtype=int32)

In [None]:
# no of iterations drastically dropped many thousand times with the same performance and training time reduced so much

# Scaling

Bringing the column values in standardized scale (0 to 1 commonly)
it will help model to perform better

## Min-Max Scaling

calculate min and max value of the column (x_max, x_min)

for every value

x_scaled = (xi - x_min) / (x_max - x_min)

# Standard Scaler / Z-score Normalization

for every value

x_scaled = (xi - x_mean) / std_dev


# Data Pipeline using sklearn

In [96]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('scale',StandardScaler()),
    ('svc',SVC(kernel='rbf'))
])

In [98]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

model.n_iter_

              precision    recall  f1-score   support

       Besni       0.91      0.83      0.87        83
     Kecimen       0.87      0.93      0.90        97

    accuracy                           0.88       180
   macro avg       0.89      0.88      0.88       180
weighted avg       0.88      0.88      0.88       180



array([2164], dtype=int32)

In [None]:
# you can change parameters, you have the option in sklearn.pipeline