In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.datasets import load_digits
digits = load_digits()

In [2]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(digits.data,digits.target,test_size=0.2)

**Logistic Regression**

In [3]:
lr = LogisticRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9638888888888889

In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

lr = LogisticRegression(max_iter=1000)
lr.fit(x_train_scaled, y_train)
accuracy = lr.score(x_test_scaled, y_test)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9638888888888889


**SVM**

In [5]:
svm = SVC()
svm.fit(x_train,y_train)
svm.score(x_test,y_test)

0.9833333333333333

**Random Forest**

In [6]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)

0.9611111111111111

#KFold cross validation
**Basic example**

In [7]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
kf

KFold(n_splits=4, random_state=None, shuffle=False)

**kf.split will return an iterator and iterator will return train_index and test_index**

In [8]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9,10]):
  print(train_index, test_index)

[3 4 5 6 7 8 9] [0 1 2]
[0 1 2 6 7 8 9] [3 4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


**Above we can see that kf.split divided this [1,2,3,4,5,6,7,8,9,10] into 4 folds**
1.   **First Iteration (Fold 1):**

    *  Training indices: [3, 4, 5, 6, 7, 8, 9]
    *  Testing indices: [0, 1, 2]

2.   **Second Iteration (Fold 2):**

    *  Training indices: [0, 1, 2, 6, 7, 8, 9]
    *  Testing indices: [3, 4, 5]

3.   **Third Iteration (Fold 3):**

    *  Training indices: [0, 1, 2, 3, 4, 5, 8, 9]
    *  Testing indices: [6, 7]

4.   **Fourth Iteration (Fold 4):**

    *  Training indices: [0, 1, 2, 3, 4, 5, 6, 7]
    *  Testing indices: [8, 9]

**Now instead of writing all these codes individually. So we can make a function and call the model according to our requirement instead of writing whole code again and again**

In [9]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [10]:
get_score(LogisticRegression(),x_train, x_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9638888888888889

In [11]:
get_score(SVC(),x_train, x_test, y_train, y_test)

0.9833333333333333

In [12]:
get_score(RandomForestClassifier(),x_train, x_test, y_train, y_test)

0.9638888888888889

**KFold simply divides the dataset into k folds without considering the class distribution, while StratifiedKFold ensures that each fold maintains the same class distribution as the original dataset, which can be crucial for certain machine learning tasks, especially in classification with imbalanced data.**

In [13]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [14]:
from sklearn.model_selection import StratifiedKFold
folds=StratifiedKFold(n_splits=4)

In [15]:
scores_l=[]
scores_svm=[]
scores_rf=[]
for train_index,test_index in kf.split(digits.data):
  x_train,x_test,y_train,y_test=digits.data[train_index],digits.data[test_index],\
                                digits.target[train_index],digits.target[test_index]

  scores_l.append(get_score(LogisticRegression(max_iter=1000,solver='liblinear',multi_class='ovr'),x_train,x_test,y_train,y_test))
  scores_svm.append(get_score(SVC(gamma='auto',C=3.0, kernel='rbf'),x_train,x_test,y_train,y_test))
  scores_rf.append(get_score(RandomForestClassifier(n_estimators=40),x_train,x_test,y_train,y_test))

In [16]:
scores_l

[0.9266666666666666,
 0.9265033407572383,
 0.9576837416481069,
 0.9020044543429844]

In [17]:
scores_svm

[0.4777777777777778,
 0.42538975501113585,
 0.45657015590200445,
 0.5434298440979956]

In [18]:
scores_rf

[0.9577777777777777,
 0.9198218262806236,
 0.9576837416481069,
 0.9198218262806236]

In [19]:
# Calculate average scores for each model
avg_score_l = np.mean(scores_l)
avg_score_svm = np.mean(scores_svm)
avg_score_rf = np.mean(scores_rf)

# Print average scores
print("Average Score - Logistic Regression:", avg_score_l)
print("Average Score - SVM:", avg_score_svm)
print("Average Score - Random Forest:", avg_score_rf)

# Identify the best model
best_model = max([(avg_score_l, 'Logistic Regression'),
                  (avg_score_svm, 'SVM'),
                  (avg_score_rf, 'Random Forest')])

print("Best Performing Model:", best_model[1])

Average Score - Logistic Regression: 0.928214550853749
Average Score - SVM: 0.4757918831972284
Average Score - Random Forest: 0.938776292996783
Best Performing Model: Random Forest


In [20]:
from sklearn.model_selection import cross_val_score

In [24]:
cross_val_score(LogisticRegression(max_iter=5000),digits.data,digits.target)

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])

In [25]:
cross_val_score(RandomForestClassifier(n_estimators=60),digits.data,digits.target)

array([0.92222222, 0.90277778, 0.96100279, 0.97771588, 0.92479109])

In [26]:
cross_val_score(SVC(),digits.data,digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])