In [1]:
#Importing Dependencies

import pandas as pd    #for dataframes, reading csv files
import numpy as np     #for arrays
from sklearn.model_selection import train_test_split #splitting data
from sklearn.linear_model import LogisticRegression #model
from sklearn.metrics import confusion_matrix   #Provides depth of understanding

In [2]:
# Load the raisin dataset from the Kaggle input directory into a DataFrame.
DATA_PATH = '/kaggle/input/raisin-binary-classification/Raisin_Dataset.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the loaded data. A bare expression on the last line of a cell is
# rendered as a table; pandas shows only the first and last rows.
df   # display the DataFrame

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.040,Kecimen
1,75166,406.690687,243.032436,0.801805,78789,0.684130,1121.786,Kecimen
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,Kecimen
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,Kecimen
4,79408,352.190770,290.827533,0.564011,81463,0.792772,1073.251,Kecimen
...,...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072,Besni
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252,Besni
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828,Besni
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548,Besni


In [4]:
# Structural summary of the frame: row count, per-column non-null counts,
# dtypes and memory usage. Used here to check for missing values.
df.info()   # prints the summary; returns None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             900 non-null    int64  
 1   MajorAxisLength  900 non-null    float64
 2   MinorAxisLength  900 non-null    float64
 3   Eccentricity     900 non-null    float64
 4   ConvexArea       900 non-null    int64  
 5   Extent           900 non-null    float64
 6   Perimeter        900 non-null    float64
 7   Class            900 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 56.4+ KB


Key Observations:
1) No null values.
2) The `Class` column (Kecimen, Besni) is the target; we will assign 1 to Kecimen and 0 to Besni.
3) We could do further data analysis and drop some features, but the dataset is small enough that this is unlikely to improve prediction performance, so we will keep things very simple.

In [5]:
# Encode the target column as integers: Kecimen -> 1, Besni -> 0.
label_change = {'Kecimen': 1, 'Besni': 0}

# Series.map is used instead of Series.replace: replacing strings with ints on
# an object column triggers the pandas "silent downcasting" FutureWarning.
# The guard makes the cell safe to re-run: once the column holds only the
# encoded values, mapping again would turn every label into NaN.
if not df['Class'].isin(list(label_change.values())).all():
    df['Class'] = df['Class'].map(label_change)

# map() yields NaN for any label missing from the dictionary; fail loudly
# rather than passing NaN targets to the model.
assert df['Class'].isin(list(label_change.values())).all(), "unexpected label in 'Class' column"
df

  df['Class'] = df['Class'].replace(label_change)


Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.040,1
1,75166,406.690687,243.032436,0.801805,78789,0.684130,1121.786,1
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,1
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,1
4,79408,352.190770,290.827533,0.564011,81463,0.792772,1073.251,1
...,...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072,0
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252,0
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828,0
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548,0


In [6]:
# Build the model inputs and outputs.
# Feature matrix: every column of df except the target 'Class'.
X = df.drop('Class', axis='columns')
X

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.040
1,75166,406.690687,243.032436,0.801805,78789,0.684130,1121.786
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162
4,79408,352.190770,290.827533,0.564011,81463,0.792772,1073.251
...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548


In [7]:
# Target vector: the 'Class' column of df, selected as a Series.
y = df.loc[:, 'Class']
y

0      1
1      1
2      1
3      1
4      1
      ..
895    0
896    0
897    0
898    0
899    0
Name: Class, Length: 900, dtype: int64

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
split_kwargs = dict(test_size=0.2, random_state=10)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [9]:
# Fit a logistic-regression classifier with scikit-learn's default settings.
# The features are deliberately left unscaled to keep the walkthrough simple.
# NOTE(review): the raw features differ by orders of magnitude (Area ~1e5 vs.
# Extent < 1), which may keep the default solver from fully converging --
# TODO confirm, and consider scaling if a ConvergenceWarning appears.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [10]:
# Evaluate on the held-out test set; score() is called on the fitted
# classifier and its result is reported as a percentage.
accuracy = logreg.score(X=X_test, y=y_test)
report = f"The accuracy for this logistic regression model = {100*accuracy:0.2f} % "
print(report)

The accuracy for this logistic regression model = 87.22 % 


In [11]:
# Confusion matrix on the test set: rows are actual classes, columns are
# predicted classes.
y_pred = logreg.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# One print call, newline-separated: heading first, then the matrix.
print("Confusion Matrix", cm, sep="\n")

Confusion Matrix
[[68 15]
 [ 8 89]]


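The confusion matrix has 4 elements. The rows represent actual classes and the columns represent predicted classes. Cell (0,0) holds the true negatives (actual label = 0, predicted label = 0): 68 Besni raisins correctly classified. Cell (0,1) holds the false positives (actual = 0, predicted = 1): 15. Cell (1,0) holds the false negatives (actual = 1, predicted = 0): 8. Cell (1,1) holds the true positives (actual = 1, predicted = 1): 89 Kecimen raisins correctly classified. The diagonal gives the correct predictions: (68 + 89) / 180 = 87.22 %, matching the accuracy reported above.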