## Logistic Regression: Classification Problem

### Data Gathering

In [3]:
from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
path = r"https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/iris.csv"
import pandas as pd
df = pd.read_csv(path)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [7]:
df.shape


(150, 5)

In [8]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [9]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [10]:
# check for missing data
df.isna().sum()


sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [11]:
# check for duplicated rows
df.duplicated().sum()

np.int64(1)

In [12]:
df = df.drop_duplicates()

### Separate X and Y features
Y : Species X : all remaining features, sepal length/width, petal length/width

In [13]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [14]:
X = df.drop(columns=['species'])
Y = df[['species']]

In [15]:
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [16]:
Y.head()

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


## Data Preprocessing and Data Cleaning

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [18]:
pre = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
).set_output(transform='pandas')

In [19]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.898033,1.012401,-1.333255,-1.308624
1,-1.139562,-0.137353,-1.333255,-1.308624
2,-1.381091,0.322549,-1.390014,-1.308624
3,-1.501855,0.092598,-1.276496,-1.308624
4,-1.018798,1.242352,-1.333255,-1.308624


## Train Test Split
    rate at which you can divide:

    100%

    60 % training 40% testing

    70% training 30% testing

    80% training 20% testing

In [20]:
from sklearn.model_selection import train_test_split

random_state = this is generally used for reproducing model results

In [21]:
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.7,test_size=0.3,random_state=21)

In [22]:
xtrain.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
36,-0.414976,1.012401,-1.390014,-1.308624
37,-1.139562,1.242352,-1.333255,-1.440192
13,-1.864149,-0.137353,-1.503531,-1.440192
68,0.430375,-1.97696,0.426261,0.40177
11,-1.260327,0.78245,-1.219738,-1.308624


In [23]:
xtrain.index

Index([ 36,  37,  13,  68,  11,  52,  49,  65, 117,  55,
       ...
       122,  61, 110,  72,  98, 120, 112,  48,   4,  56],
      dtype='int64', length=104)

In [24]:
xtest.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
92,-0.052682,-1.057157,0.142468,0.007064
44,-0.898033,1.702254,-1.049462,-1.045486
7,-1.018798,0.78245,-1.276496,-1.308624
21,-0.898033,1.472303,-1.276496,-1.045486
95,-0.173447,-0.137353,0.255985,0.007064


In [25]:
ytrain.head()

Unnamed: 0,species
36,setosa
37,setosa
13,setosa
68,versicolor
11,setosa


In [26]:
ytest.head()

Unnamed: 0,species
92,versicolor
44,setosa
7,setosa
21,setosa
95,versicolor


## Build the model

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
model = LogisticRegression()
model.fit(xtrain,ytrain)

In [29]:
model.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [30]:
ypred_train = model.predict(xtrain)

In [31]:
yprob_train = model.predict_proba(xtrain)

In [32]:
yprob_train[0]

array([9.67343450e-01, 3.26556334e-02, 9.16479348e-07])

In [33]:
model.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [34]:
ypred_train[:5]

array(['setosa', 'setosa', 'setosa', 'versicolor', 'setosa'], dtype=object)

In [35]:
ytrain.head()

Unnamed: 0,species
36,setosa
37,setosa
13,setosa
68,versicolor
11,setosa


## Model Evaluation

In [36]:
model.score(xtrain,ytrain)

0.9711538461538461

In [37]:
model.score(xtest,ytest)

0.9111111111111111