In [1]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Toy dataset for the manual k-fold walkthrough:
# five samples (rows), each with two features (columns).
X = np.array(
    [
        [1, 2],
        [3, 4],
        [5, 6],
        [7, 8],
        [9, 10],
    ]
)
# One binary label per sample, in the same row order as X.
y = np.array([0, 1, 0, 1, 0])

In [3]:
# Number of folds the dataset is divided into; each fold serves as the test set once.
k = 3

In [4]:
# Shuffle the dataset
# Generator.permutation(n) returns the integers 0..n-1 in random order.
# The sequence is used to index the rows of the dataset, effectively shuffling the rows;
# the columns inside each row stay in the same order.
# A fixed seed makes the shuffle (and therefore the folds built from it) identical on every run.
rng = np.random.default_rng(42)
indices = rng.permutation(len(X))
indices

array([3, 1, 0, 2, 4])

In [5]:
# Split the shuffled indices into k folds.
# np.array_split allows an uneven division, so 5 indices with k=3 give folds of sizes 2, 2 and 1.
folds = np.array_split(indices,k)

<img src="1.png" alt="Alternative text" />

<img src="2.png" alt="Alternative text" />

In [6]:
# Manual k-fold cross-validation: fold i is held out for testing and the
# remaining k-1 folds are merged into the training set.
# The split variables are overwritten on every pass, so after the loop they
# hold the split of the last fold only.
for i in range(k):
    test_indices = folds[i]
    remaining_folds = folds[:i] + folds[i + 1:k]
    train_indices = np.concatenate(remaining_folds)
    X_test, y_test = X[test_indices], y[test_indices]
    X_train, y_train = X[train_indices], y[train_indices]

In [7]:

# Example of one split taken from the main dataset (the exact rows depend on the shuffle):
# [1, 2], [5, 6], [7, 8], [9, 10] were selected as the training set after the shuffle,
# and [3, 4] was selected as the test set.

In [8]:

# From the labels of the main y, [0, 1, 0, 1, 0], the values [0, 0, 0, 1] were selected as the training labels after the shuffle.
# The label 1 is the expected answer for the test sample.

In [9]:
# ********************************************************************

In [10]:


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits


In [11]:
# Load scikit-learn's bundled handwritten-digits dataset (features in .data, class labels in .target).
digits = load_digits()

In [12]:
# Hold out 30% of the samples for testing.
# random_state fixes the shuffle so the split, and the scores computed from it, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3, random_state=42)


# Logistic Regression

In [13]:
# Logistic Regression
# solver: selects the optimization algorithm used to fit the model.
# 'liblinear' is a good choice for small datasets; it solves binary problems, so multiclass data is handled one-vs-rest.
# 'sag' and 'saga' are faster on large datasets; 'newton-cg' and 'lbfgs' are the other general-purpose solvers.
# multi_class='ovr' (one-vs-rest) fits one binary classifier per class, which is how the model is applied to the digit classes.
# Each binary logistic regression finds the coefficients of a linear equation that predicts the probability of a binary outcome (0 or 1).
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument -- confirm against the installed version.

lr = LogisticRegression(solver='liblinear',multi_class='ovr')
lr.fit(X_train, y_train)


In [14]:
# Mean accuracy of the logistic regression model on the held-out test split.
lr_score = lr.score(X_test, y_test)
lr_score

0.9648148148148148

# SVC

In [15]:
# Fit a support vector classifier (default RBF kernel) on the training split.
# NOTE(review): gamma='auto' sets gamma = 1 / n_features and ignores the scale of the features;
# the pixel values are not standardized here, which is the likely reason for the low test accuracy
# reported below -- confirm by trying gamma='scale' or by scaling the features first.
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)


In [16]:
# Mean accuracy of the SVM model on the held-out test split.
svm_score = svm.score(X_test, y_test)
svm_score


0.3888888888888889

# Random Forest

In [17]:
# Random forest with 40 trees.
# random_state fixes the bootstrap sampling and feature selection so the fitted model,
# and the score computed from it, are reproducible.
rf = RandomForestClassifier(n_estimators=40, random_state=42)
rf.fit(X_train, y_train)


In [18]:
# Mean accuracy of the random forest model on the held-out test split.
rf_score = rf.score(X_test, y_test)
rf_score

0.9796296296296296

In [19]:
# Test accuracies of the three models side by side:
# Logistic Regression and Random Forest do considerably better than SVM on this split.
lr_score, svm_score, rf_score

(0.9648148148148148, 0.3888888888888889, 0.9796296296296296)

In [20]:
# Now we use the cross_val_score function to get the scores, passing each algorithm together with the dataset and cv.
# A model is a representation of a system, while cross-validation is a technique for evaluating a model's performance.
# We run cross-validation for each model in turn.
from sklearn.model_selection import cross_val_score

# Set LogisticRegression, CV =3

In [21]:
# 3-fold cross-validation of the one-vs-rest logistic regression on the full digits data;
# cross_val_score returns one score per fold.
score_lr = cross_val_score(
    LogisticRegression(solver='liblinear', multi_class='ovr'),
    digits.data,
    digits.target,
    cv=3,
)


In [22]:
# Score on each of the 3 cross-validation folds for logistic regression.
print(score_lr)

[0.89482471 0.95325543 0.90984975]


In [23]:
# Mean of the per-fold scores, used to compare the models.
print("Avg :",np.average(score_lr))

Avg : 0.9193099610461881


# Set SVM and CV=3

In [24]:
# 3-fold cross-validation of the SVM (gamma='auto') on the full digits data;
# cross_val_score returns one score per fold.
score_svm = cross_val_score(
    SVC(gamma='auto'),
    digits.data,
    digits.target,
    cv=3,
)

In [25]:
# Score on each of the 3 cross-validation folds for the SVM.
print(score_svm)


[0.38063439 0.41068447 0.51252087]


In [26]:
# Mean of the per-fold scores, used to compare the models.
print('Avg :',np.average(score_svm))

Avg : 0.4346132442960489


# Set Random Forest and CV=3

In [27]:
# 3-fold cross-validation of a 40-tree random forest on the full digits data.
# random_state is fixed so the per-fold scores are reproducible from run to run.
score_rf = cross_val_score(
    RandomForestClassifier(n_estimators=40, random_state=42),
    digits.data,
    digits.target,
    cv=3,
)


In [28]:
# Score on each of the 3 cross-validation folds for the random forest.
print(score_rf)


[0.93322204 0.95158598 0.93322204]


In [29]:
# Mean of the per-fold scores, used to compare the models.
print('Avg :',np.average(score_rf))

Avg : 0.9393433500278242


<img src="12.jpg" alt="Alternative text" />


In [30]:
# Based on the above table, we will go with Random Forest for this dataset in production.
# However, we have to monitor the model's performance for data drift, and as the business case changes
# we have to revisit the model and redeploy it.

# Parameter Tuning Using K-Fold

In [31]:
# Let us consider the RandomForestClassifier for this analysis.
# n_estimators is the parameter being tuned, evaluated with CV = 10 folds (a commonly used value).
# random_state is fixed so that differences between the runs come from n_estimators, not from random seeding.

scores1 = cross_val_score(RandomForestClassifier(n_estimators=5, random_state=42), digits.data, digits.target, cv=10)
print('Avg Score for Estimators=5 and CV=10 :',np.average(scores1))

Avg Score for Estimators=5 and CV=10 : 0.8859155803848541


In [32]:
# 10-fold CV with 20 trees; random_state is fixed so the comparison across n_estimators is reproducible.
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20, random_state=42), digits.data, digits.target, cv=10)
print('Avg Score for Estimators=20 and CV=10 :',np.average(scores2))

Avg Score for Estimators=20 and CV=10 : 0.9404469273743017


In [33]:
# 10-fold CV with 30 trees; random_state is fixed so the comparison across n_estimators is reproducible.
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30, random_state=42), digits.data, digits.target, cv=10)
print('Avg Score for Estimators=30 and CV=10 :',np.average(scores3))

Avg Score for Estimators=30 and CV=10 : 0.9415580384854128


In [34]:
# 10-fold CV with 40 trees; random_state is fixed so the comparison across n_estimators is reproducible.
scores4 = cross_val_score(RandomForestClassifier(n_estimators=40, random_state=42), digits.data, digits.target, cv=10)
print('Avg Score for Estimators=40 and CV=10 :',np.average(scores4))

Avg Score for Estimators=40 and CV=10 : 0.9437771570453133


In [35]:
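# Based on the above observation, we will go with Estimators=30: in the run recorded above its average score
# (0.9416) is within about 0.002 of Estimators=40 (0.9438), a gap small enough to be run-to-run noise,
# and it needs fewer trees.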

In [36]:
# To calculate the error of each fold in cross-validation, you can use the following steps:
# 1 - Split the data into k folds.
# 2 - For each fold, train the model on the remaining k-1 folds and test it on the current fold.
# 3 - Calculate the error for each fold.
# 4 - Repeat steps 2-3 for all k folds.

In [37]:
# Sample dataset: five samples with two features each ...
X = np.array(
    [
        [1, 2],
        [3, 4],
        [5, 6],
        [7, 8],
        [9, 10],
    ]
)
# ... and a binary target variable, one value per sample.
y = np.array([0, 1, 0, 1, 0])

In [38]:
# Number of folds
k = 3
# Create a KFold object; its split() yields k (train, test) index pairs
kf = KFold(n_splits=k)

In [39]:
# Ordinary least-squares regression, fitted to the 0/1 target so that each fold can be scored with MSE.
model = LinearRegression()

In [52]:
# Evaluate the model fold by fold: fit on the training part of each split,
# predict the held-out part, and record that fold's mean squared error.
mse_scores = []

for train_indices, test_indices in kf.split(X):
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    # fit() returns the fitted estimator, so the prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))

# Report the error of every fold, numbering the folds from 1.
for fold_number, mse in enumerate(mse_scores, start=1):
    print("Fold", fold_number, "MSE:", mse)

Fold 1 MSE: 0.51
Fold 2 MSE: 0.505
Fold 3 MSE: 0.57
Fold 4 MSE: 0.515
Fold 5 MSE: 0.485


<img src="4.png" alt="Alternative text" />


In [42]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [44]:
# Create a sample dataset
# A seeded generator keeps the synthetic data, and the fold scores computed from it, reproducible.
rng = np.random.default_rng(42)
X = rng.random((1000, 10)) # 1000 samples with 10 features, uniform in [0, 1)
y = rng.integers(0, 2, size=1000) # random binary target (0 or 1), generated independently of X

In [47]:
len(X[0]) # length of the first row = number of features per sample (10)

10

In [48]:
# Define the number of folds
k = 5
# Create a KFold object; its split() yields k (train, test) index pairs
kf = KFold(n_splits=k)

In [51]:
# Create a KNN model that classifies each sample by its 3 nearest neighbours
model = KNeighborsClassifier(n_neighbors=3)

# Cross-validate the model on the folds produced by kf.
# cross_val_score returns one score per fold; for a classifier the default score is accuracy (higher is better), not an error.
scores = cross_val_score(model, X, y, cv=kf)

# Print the score for each fold
for i, score in enumerate(scores):
    print("Fold", i+1, "score:", score) # the labels are random, so a score near 0.5 (chance level) is expected


Fold 1 score: 0.49
Fold 2 score: 0.495
Fold 3 score: 0.43
Fold 4 score: 0.485
Fold 5 score: 0.515
