In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [7]:
import tensorflow as tf

# Load the CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# print(y_train.dtype)
# Preprocess the data
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

xtrain1 = x_train.reshape(x_train.shape[0], -1)
xtest1 = x_test.reshape(x_test.shape[0], -1)
ytrain1 = y_train.flatten()
ytest1 = y_test.flatten()


logistic_classifier = LogisticRegression()

print('\nTraining a model with', xtrain1.shape[0], 'examples...')

# training
logistic_classifier.fit(xtrain1, ytrain1)

# computing training accuracy
train_predictions = logistic_classifier.predict(xtrain1)
train_accuracy = accuracy_score(train_predictions, ytrain1)
print('\nTraining accuracy:', format(100 * train_accuracy , '.2f')) 

# computing testing accuracy
test_predictions = logistic_classifier.predict(xtest1)
test_accuracy = accuracy_score(test_predictions, ytest1)

print('\nTesting accuracy:', format(100 * test_accuracy , '.2f'))




Training a model with 50000 examples...


ValueError: Found array with dim 4. LogisticRegression expected <= 2.

In [11]:
def accuracy_vs_reg():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

    x_train = x_train.astype('float32') / 255.0
    x_test = x_test.astype('float32') / 255.0

    xtrain1 = x_train.reshape(x_train.shape[0], -1)
    xtest1 = x_test.reshape(x_test.shape[0], -1)
    ytrain1 = y_train.flatten()
    ytest1 = y_test.flatten()
    
    # log initialization
    train_accuracies = []
    test_accuracies = []
    cs = [0, 0.1, 1, 10, 50]

    ### YOUR CODE STARTS HERE ###
    for c in cs:
        # Specify the logistic classifier model
        if c == 0:
            classifier = LogisticRegression(penalty=None, fit_intercept=True)
        else:
            # Refer https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            # to see which argument can adjust the regularization strength,
            # Additionally, please use 'l1' penalty type, 'liblinear' solver and enable fit_intercept
            classifier = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True, C=c)


        classifier.fit(xtrain1, ytrain1)

        # Training
        train_predictions = classifier.predict(xtrain1)
        train_accuracy = accuracy_score(train_predictions, ytrain1)
        train_accuracies.append(train_accuracy)
        
        
        # Testing: compute and print accuracy and AUC on the test data
        test_predictions = classifier.predict(xtest1)
        test_accuracy = accuracy_score(test_predictions, ytest1)
        test_accuracies.append(test_accuracy)
        

    ### YOUR CODE ENDS HERE ###
    
    fig, axes = plt.subplots()
    axes.semilogx(cs, train_accuracies, color='red', label='training accuracy')
    axes.semilogx(cs, test_accuracies, color='blue', label='testing accuracy')
    
    axes.set_xlabel('regularization strength', fontsize=14)
    axes.set_ylabel('accuracy', fontsize=14)
    
    axes.legend()

    return train_accuracies, test_accuracies


accuracy_vs_reg()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
data_dir = 'cifar-10-batches-py'

def convert_to_numpy(data_dir): 
    """ 
    Returns the following into a numpy array:
    - training images (50_000, 3072)
    - training labels (50_000, )
    - testing images (10_000, 3072)
    - testing labels (10_000) 
    """
    train_files = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5']
    test_file = 'test_batch'

    def load_batch(file_path):
        with open(file_path, 'rb') as file:
            batch = pickle.load(file, encoding='bytes')
            images = batch[b'data']
            labels = batch[b'labels']

            # convert labels to numpy array
            labels = np.array(labels)
            return images, labels

    # load training data
    train_images_list = []
    train_labels_list = []

    for file_name in train_files:
        file_path = os.path.join(data_dir, file_name)
        images, labels = load_batch(file_path)
        train_images_list.append(images)
        train_labels_list.append(labels)

    # concatenate all training data
    train_images = np.concatenate(train_images_list, axis=0)
    train_labels = np.concatenate(train_labels_list, axis=0)

    # load test data
    test_file_path = os.path.join(data_dir, test_file)
    test_images, test_labels = load_batch(test_file_path)

    # convert to numpy arrays
    train_images = np.array(train_images)
    train_labels = np.array(train_labels)
    test_images = np.array(test_images)
    test_labels = np.array(test_labels)

     
    return train_images, train_labels, test_images, test_labels

In [4]:
data_dir = 'cifar-10-batches-py'

train_images, train_labels, test_images, test_labels = convert_to_numpy(data_dir)

logistic_classifier = LogisticRegression()

print('\nTraining a model with', train_images.shape[0], 'examples...')

# training
logistic_classifier.fit(train_images, train_labels)

# computing training accuracy
train_predictions = logistic_classifier.predict(train_images)
train_accuracy = accuracy_score(train_labels, train_predictions)
print('\nTraining accuracy:', format(100 * train_accuracy , '.2f')) 

# computing testing accuracy
test_predictions = logistic_classifier.predict(test_images)
test_accuracy = accuracy_score(test_labels, test_predictions)

print('\nTesting accuracy:', format(100 * test_accuracy , '.2f'))



Training a model with 50000 examples...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Training accuracy: 42.58

Testing accuracy: 40.24


In [6]:
def accuracy_vs_reg():
    data_dir = 'cifar-10-batches-py'
    X_train, y_train, X_test, y_test = convert_to_numpy(data_dir)
   
    # log initialization
    train_accuracies = []
    test_accuracies = []
    cs = [0, 0.1, 1, 10, 50]

    ### YOUR CODE STARTS HERE ###
    for c in cs:
        # Specify the logistic classifier model
        if c == 0:
            classifier = LogisticRegression(penalty=None, fit_intercept=True)
        else:
            # Refer https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            # to see which argument can adjust the regularization strength,
            # Additionally, please use 'l1' penalty type, 'liblinear' solver and enable fit_intercept
            classifier = LogisticRegression(penalty='l1', solver='liblinear', fit_intercept=True, C=c)


        classifier.fit(X_train, y_train)

        # Training
        train_predictions = classifier.predict(X_train)
        train_accuracy = accuracy_score(y_train, train_predictions)
        train_accuracies.append(train_accuracy)
        
        
        # Testing: compute and print accuracy and AUC on the test data
        test_predictions = classifier.predict(X_test)
        test_accuracy = accuracy_score(y_test, test_predictions)
        test_accuracies.append(test_accuracy)
        

    ### YOUR CODE ENDS HERE ###
    
    fig, axes = plt.subplots()
    axes.semilogx(cs, train_accuracies, color='red', label='training accuracy')
    axes.semilogx(cs, test_accuracies, color='blue', label='testing accuracy')
    
    axes.set_xlabel('regularization strength', fontsize=14)
    axes.set_ylabel('accuracy', fontsize=14)
    
    axes.legend()

    return train_accuracies, test_accuracies


accuracy_vs_reg()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
