# Image Classification using Logistic Regression and LDA

Welcome to this notebook, which explores image classification using the CIFAR-10 dataset. The primary goal is to demonstrate the use of Linear Discriminant Analysis (LDA) as a feature reduction technique and Logistic Regression as the classification model.

The notebook presents an overview of the image classification process with LDA and Logistic Regression. Let's dive in!

## Model Results

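- **Training Scores** (accuracy on each of the 5 cross-validation folds): [0.540125, 0.536, 0.528375, 0.52925, 0.545375]
- **Predictions** (predicted class labels, truncated; these are labels, not scores): [3, 1, 5, ..., 8, 8, 8]
- **Precision** (macro-averaged): 0.534187034361574
- **Recall** (macro-averaged): 0.5357915163306523
- **F1 Score** (macro-averaged): 0.5346558723184491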

In [None]:
import tensorflow as tf
import numpy as np

# Fetch CIFAR-10 through Keras. load_data() hands back the train and test
# splits as two (images, labels) pairs, unpacked here into four arrays.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Report the shape of every array so the split sizes and the image
# dimensions are visible before any preprocessing.
for array_name, array in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{array_name} shape:", array.shape)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the training images for final evaluation. A stratified
# split keeps the class proportions of the training and holdout parts close
# to those of the original dataset.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1 yields exactly one (train, holdout) index pair, so take it
# directly instead of looping over a single-element generator.
train_index, holdout_index = next(stratified_split.split(x_train, y_train))
X_train, X_holdout = x_train[train_index], x_train[holdout_index]
Y_train, Y_holdout = y_train[train_index], y_train[holdout_index]

# Flatten each image into a single feature row with one vectorized reshape
# (same row-major element order as calling image.flatten() per image, but
# without a Python-level loop), and turn the labels into 1-D vectors.
X_train_flattened = X_train.reshape(len(X_train), -1)
X_holdout_flattened = X_holdout.reshape(len(X_holdout), -1)
Y_train_flat = Y_train.ravel()
Y_holdout_flat = Y_holdout.ravel()

# Normalise the data: the division builds new float arrays, so the image
# arrays above are left untouched.
# NOTE(review): dividing by 255 assumes 8-bit pixel values in [0, 255] - confirm.
X_train_flattened = X_train_flattened / 255
X_holdout_flattened = X_holdout_flattened / 255

# Shape of the normalised data
print('X_train shape:', X_train_flattened.shape)
print('Y_train shape:', Y_train_flat.shape)
print('X_holdout shape:', X_holdout_flattened.shape)
print('Y_holdout shape:', Y_holdout_flat.shape)

In [None]:
# Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import recall_score

# Build an LDA estimator with its default settings and fit it on the
# flattened, normalised training images and their class labels.
lda = LinearDiscriminantAnalysis()

# The fitted estimator is reused by later cells, both as a classifier
# (predict) and as a feature reducer (transform).
lda.fit(X=X_train_flattened, y=Y_train_flat)

In [None]:
# Evaluation
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict

# Shuffled 5-fold splitter with a fixed seed; the same object is passed to
# both cross-validation calls below.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of LDA on each fold of the training set, plus summary statistics.
training_scores = cross_val_score(
    estimator=lda,
    X=X_train_flattened,
    y=Y_train_flat,
    scoring='accuracy',
    cv=cv,
)
mean_score, std_score = training_scores.mean(), training_scores.std()

# Out-of-fold predictions for every training sample, scored with
# macro-averaged recall.
training_predictions = cross_val_predict(
    estimator=lda,
    X=X_train_flattened,
    y=Y_train_flat,
    cv=cv,
    method='predict',
)
training_recall = recall_score(
    y_true=Y_train_flat, y_pred=training_predictions, average='macro'
)

# Score the already-fitted LDA model on the holdout set.
holdout_predictions = lda.predict(X_holdout_flattened)
holdout_recall = recall_score(
    y_true=Y_holdout_flat, y_pred=holdout_predictions, average='macro'
)

# Print the results, one labelled line per metric.
for label, value in (
    ('LDA training scores:', training_scores),
    ('LDA Training Recall:', training_recall),
    ('LDA Holdout Recall:', holdout_recall),
    ('STD', std_score),
):
    print(label, value)

In [None]:
# Use the fitted LDA to transform the data for feature reduction.

# transform() projects each flattened training image with the LDA model
# that was fitted earlier in the notebook.
X_train_reduced = lda.transform(X_train_flattened)

# Show the projected values first, then the shape of the reduced matrix.
for reduced_view in (X_train_reduced, X_train_reduced.shape):
    print(reduced_view)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Work on copies of the LDA-reduced features and the flat training labels.
# NOTE(review): this rebinds the notebook-level names X_train and y_train,
# which earlier cells used for the unflattened training images and the
# originally loaded label array - re-running those cells afterwards will see
# these replacements instead.
X_train, y_train = X_train_reduced.copy(), Y_train_flat.copy()

# Logistic regression classifier, capped at 100 solver iterations.
lrModel = LogisticRegression(max_iter=100)

# Fit on the entire (reduced) training set.
lrModel.fit(X=X_train, y=y_train)

In [None]:
# Evaluation

from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score

# Shuffled 5-fold splitter with a fixed seed; the same object is passed to
# both cross-validation calls below.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracy of the logistic regression on the LDA-reduced training set.
# NOTE(review): X_train was produced by an LDA that was fitted on the whole
# training set (labels included), so these folds are not independent of the
# LDA fit and the cross-validation scores may be optimistic. Wrapping LDA and
# the classifier in one pipeline and cross-validating on the flattened images
# would remove that dependence.
training_scores = cross_val_score(lrModel, X_train, y_train, cv=cv, scoring='accuracy')

# Mean and standard deviation of the per-fold accuracies
mean_score = training_scores.mean()
std_score = training_scores.std()

# Out-of-fold predictions for every training sample, scored with
# macro-averaged metrics.
training_predictions = cross_val_predict(lrModel, X_train, y_train, cv=cv, method='predict')
training_recall = recall_score(Y_train_flat, training_predictions, average='macro')
training_precision = precision_score(Y_train_flat, training_predictions, average='macro')
training_f1 = f1_score(Y_train_flat, training_predictions, average='macro')

# Transform the holdout set with the already-fitted LDA, then predict its
# class labels with the fitted logistic regression model.
X_holdout_reduced = lda.transform(X_holdout_flattened)
holdout_predictions = lrModel.predict(X_holdout_reduced)

# Class-membership probabilities, one column per class, kept for inspection.
# The labels are multi-class, so a single cut-off on one class's probability
# column would collapse the predictions to 0/1 and make the metrics below
# meaningless; the holdout metrics therefore score the predicted class labels.
predicted_probabilities = lrModel.predict_proba(X_holdout_reduced)

holdout_recall = recall_score(Y_holdout_flat, holdout_predictions, average='macro')
holdout_precision = precision_score(Y_holdout_flat, holdout_predictions, average='macro')
holdout_f1 = f1_score(Y_holdout_flat, holdout_predictions, average='macro')

print('Training scores:', training_scores)
print('Training Recall:', training_recall)
print('Training Precision:', training_precision)
print('Training F1 Score:', training_f1)
print('Holdout Recall:', holdout_recall)
print('Holdout Precision:', holdout_precision)
print('Holdout F1 Score:', holdout_f1)
print('STD', std_score)