# Ayush Patel
## NUID: 002765119

In [1]:
# Imports
import pandas as pd
import numpy as np
from scipy.stats import binom
import random
import cv2
from sklearn.metrics import euclidean_distances
from tqdm import tqdm

# Problem 1 <br>
<b>Note:</b> <code>cv2</code> is provided by the OpenCV package; install it with <code>!pip install opencv-python</code> before running the imports cell.

In [2]:
def _blur_flat_images(flat_images):
    """Blur every flattened 28x28 image with a 5x5 Gaussian kernel.

    Each row is reshaped to 28x28, blurred, flattened again, and the
    results are collected into a single array.
    """
    blurred_rows = []
    for flat_image in flat_images:
        smoothed = cv2.GaussianBlur(flat_image.reshape(28, 28), (5, 5), 0)
        blurred_rows.append(smoothed.ravel())
    return np.array(blurred_rows)


# Column 0 is used as the label, every remaining column as a pixel value.
train_data = np.loadtxt('trainPB1.txt')
train_labels, train_pixels = train_data[:, 0], train_data[:, 1:]

test_data = np.loadtxt('testPB1.txt')
test_labels, test_pixels = test_data[:, 0], test_data[:, 1:]

# Applying Gaussian Blur (Reason is stated below)
train_pixels = _blur_flat_images(train_pixels)
test_pixels = _blur_flat_images(test_pixels)

# KNN from scratch
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data. All of the work happens in
    ``predict``, which builds the full (test rows x training rows) distance
    matrix and lets the ``k`` closest training rows vote.
    """

    def __init__(self, k):
        """Store the neighbourhood size.

        Args:
            k: number of nearest training samples that vote on each label.

        Raises:
            ValueError: if ``k`` is not a positive integer. With ``k == 0``
                nobody votes, and a negative ``k`` would silently slice
                "all but the last |k|" neighbours instead of failing.
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))
        self.k = k

    def fit(self, X_train, y_train):
        """Remember the training samples and their labels (no computation)."""
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict one label per row of ``X_test``.

        Args:
            X_test: 2-D array of samples, one per row.

        Returns:
            A list with one predicted label per test row. The label is the
            most frequent one among the ``k`` closest training rows; when
            several labels tie, the one whose first vote comes from the
            closest neighbour wins. If ``k`` exceeds the number of training
            rows, every training row votes.
        """
        distances = euclidean_distances(X_test, self.X_train)
        y_pred = []
        for row_distances in distances:
            # argsort is ascending, so the first k indices are the k closest rows.
            nearest = np.argsort(row_distances)[:self.k]
            votes = [self.y_train[j] for j in nearest]
            # max() returns the first maximal element, which gives the
            # nearest-first tie-break described in the docstring.
            y_pred.append(max(votes, key=votes.count))
        return y_pred

    def score(self, X_test, y_test):
        """Return the fraction of rows in ``X_test`` whose prediction equals ``y_test``.

        Args:
            X_test: 2-D array of samples, one per row.
            y_test: true labels; any sequence or array with one entry per row.

        Raises:
            ValueError: if the number of labels differs from the number of
                predictions.
        """
        # Convert both sides to arrays: comparing two plain lists with ``==``
        # yields a single bool, which has no ``.mean()``.
        y_pred = np.asarray(self.predict(X_test))
        y_true = np.asarray(y_test)
        if y_pred.shape != y_true.shape:
            raise ValueError(
                "y_test has shape %s but %s predictions were made"
                % (y_true.shape, y_pred.shape)
            )
        return (y_pred == y_true).mean()

knn = KNN(k = 20)

# Training the classifier (this only stores the data; the work happens in predict)
knn.fit(train_pixels, train_labels)

# Predicting labels for the test data
test_pred = knn.predict(test_pixels)

# Evaluate training performance
train_acc = (knn.score(train_pixels, train_labels))*100
print("Training Accuracy:", train_acc)

# Evaluate testing performance.
# Reuse test_pred instead of calling knn.score(test_pixels, ...): score() would
# run the whole prediction (full test-by-train distance matrix) a second time.
test_acc = np.mean(np.asarray(test_pred) == test_labels) * 100
print("Testing Accuracy:", test_acc)

Training Accuracy: 81.8875
Testing Accuracy: 76.4


<b> Reason: </b> <br>
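In this code, the Gaussian blur function from the OpenCV library (<code>cv2.GaussianBlur</code>) is applied to each 28×28 image in the training and testing datasets, using a kernel size of (5, 5) and a sigma argument of 0, which tells OpenCV to derive the standard deviation from the kernel size. Each blurred image is then flattened back to a 1D vector and used for training and testing the KNN classifier. The motivation for blurring is that it spreads each pixel's intensity over its neighbours, so two images that differ only by a small shift end up closer in Euclidean distance, which is the metric this KNN uses to find neighbours.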

# Problem 2

from tqdm import tqdm

In [3]:
# Load the data: one session per line, whitespace-separated integer grades.
grade_rows = []
# The context manager closes the file even if a line fails to parse.
with open('studentgrades_pb2.txt') as grades_file:
    for line in grades_file:
        # split() with no argument tolerates repeated/trailing spaces and the newline.
        fields = line.split()
        if fields:  # skip blank lines instead of failing on int('')
            grade_rows.append([int(x) for x in fields])
data = np.array(grade_rows)

In [4]:
def EM(data, num_sessions = 50, num_students = 20, num_quizzes = 3, num_iterations = 1000):
    """Fit a mixture of binomials to per-session solved/not-solved grades with EM.

    Model: every session uses exactly one of ``num_quizzes`` quizzes, chosen
    with probability ``quiz_probabilities[k]``. Under quiz ``k`` each student
    solves the problem independently with probability
    ``question_probabilities[k]``.

    Parameters
    ----------
    data : 2-D indexable, accessed as ``data[session][student]``
        An entry equal to 1 means the student solved the problem; any other
        value counts as not solved.
    num_sessions : int
        Number of leading rows of ``data`` to use.
    num_students : int
        Number of leading entries of each row to use.
    num_quizzes : int
        Number of mixture components.
    num_iterations : int
        Number of E-step/M-step rounds. There is no convergence test; all
        rounds are always run.

    Returns
    -------
    (question_probabilities, quiz_probabilities) : tuple of two lists of float
        Per-quiz probability of solving the problem, and per-quiz selection
        probability. The order of the quizzes is arbitrary and depends on the
        random start.

    Raises
    ------
    ValueError
        If ``num_sessions``, ``num_students`` or ``num_quizzes`` is below 1.
    IndexError
        Raised by the indexing itself if ``data`` is smaller than
        ``num_sessions`` x ``num_students``.

    Notes
    -----
    The start values come from the global ``random`` module, so call
    ``random.seed(...)`` beforehand for reproducible results.
    """
    if num_sessions < 1 or num_students < 1 or num_quizzes < 1:
        raise ValueError("num_sessions, num_students and num_quizzes must all be at least 1")

    # Under this model a session's likelihood depends on the data only through
    # how many students solved the problem, and that count never changes
    # between iterations, so compute it once.
    solved_counts = np.array(
        [
            sum(1 for student_index in range(num_students)
                if data[session_index][student_index] == 1)
            for session_index in range(num_sessions)
        ],
        dtype=float,
    )
    missed_counts = num_students - solved_counts

    # Giving equal probability for each quiz
    quiz_probabilities = np.full(num_quizzes, 1 / num_quizzes)
    # Random start for the solve probabilities, scaled to sum to 1. This is
    # only a starting point: later updates are independent per quiz.
    initial_draws = [random.random() for _ in range(num_quizzes)]
    question_probabilities = np.array(initial_draws) / sum(initial_draws)

    # Keeps log(p) and log(1 - p) finite if an estimate reaches exactly 0 or 1.
    edge = 1e-12

    for _ in range(num_iterations):
        # E-step: posterior probability of each quiz for each session.
        # Work with log-probabilities: the raw product of num_students factors
        # underflows to 0.0 for large sessions, which would make the
        # normalisation divide by zero.
        safe_p = np.clip(question_probabilities, edge, 1 - edge)
        with np.errstate(divide='ignore'):
            # A quiz with probability exactly 0 gets log-prior -inf, i.e. weight 0.
            log_prior = np.log(quiz_probabilities)
        log_joint = (
            log_prior
            + np.outer(solved_counts, np.log(safe_p))
            + np.outer(missed_counts, np.log(1.0 - safe_p))
        )
        # Subtract each row's maximum before exponentiating so the largest
        # term becomes exp(0) = 1 and the row sum is never zero.
        log_joint -= log_joint.max(axis=1, keepdims=True)
        responsibilities = np.exp(log_joint)
        responsibilities /= responsibilities.sum(axis=1, keepdims=True)

        # M-step: responsibility-weighted re-estimates.
        quiz_mass = responsibilities.sum(axis=0)
        quiz_probabilities = quiz_mass / num_sessions
        # A quiz that no session is assigned to keeps its previous estimate
        # (its update would be 0/0).
        active = quiz_mass > 0
        question_probabilities[active] = (
            (solved_counts @ responsibilities)[active] / (quiz_mass[active] * num_students)
        )

    return question_probabilities.tolist(), quiz_probabilities.tolist()

In [5]:
# Running the EM Algorithm
# EM starts from random.random() draws; fix the seed so that a
# Restart & Run All reproduces the same estimates in the same quiz order.
random.seed(42)
question_probabilities, quiz_probabilities = EM(data)

In [6]:
# Output: one line per estimated parameter vector
for caption, estimate in (
    ("Probability of solving probelm", question_probabilities),
    ("Quiz selection probabilities: ", quiz_probabilities),
):
    print(caption, estimate)

Probability of solving probelm [0.6100378318845449, 0.9317285332315947, 0.23691866848935975]
Quiz selection probabilities:  [0.5146283397819821, 0.17855767772923006, 0.30681398248878794]


<b>Explanation: </b> <br>
The function takes in several parameters including the data about student responses, the number of quiz sessions, students, and quizzes, and the number of iterations to run the algorithm. The function then initializes the quiz probabilities as equal for all quizzes and the question probabilities as random values that sum to 1.

The EM algorithm is then performed. In the E-step, the algorithm computes, for every session, the posterior probability that each quiz was the one given in that session, based on the students' observed responses and the current quiz and question probabilities. In the M-step, the algorithm updates the quiz and question probabilities, using the posterior probabilities from the E-step as weights.

The function repeats the E-step and M-step for the number of iterations specified and returns the final quiz and question probabilities that were estimated by the algorithm. These probabilities can be used to gain insights into which questions are more difficult or easier for students, and to evaluate the effectiveness of the quiz overall.