
# **CS 4361/5361 Machine Learning**

**Exam 1, Part 1**


In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix,mean_squared_error,mean_absolute_error
from sklearn.model_selection import train_test_split
import time
from google.colab import files
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeRegressor

## **Question 1**

Write a function that removes from x_train and x_test all attributes that have the same value for all examples in x_train (for example the pixels that are black in all MNIST images).

In [2]:
# Keep only every SUBSAMPLE_STEP-th example so the experiments below run quickly.
SUBSAMPLE_STEP = 5

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Subsample BEFORE scaling: dividing by 255 creates a float64 copy, so doing it
# on the reduced arrays avoids a large temporary for examples that are discarded.
x_train, y_train = x_train[::SUBSAMPLE_STEP], y_train[::SUBSAMPLE_STEP]
x_test, y_test = x_test[::SUBSAMPLE_STEP], y_test[::SUBSAMPLE_STEP]

# Scale by 1/255, store as float32, and flatten each image into one row of attributes.
x_train = np.float32(x_train/255).reshape(x_train.shape[0],-1)
x_test = np.float32(x_test/255).reshape(x_test.shape[0],-1)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
def remove_constant_attributes(x_train, x_test):
  """Drop every attribute (column) that is constant across x_train.

  The columns to keep are decided from x_train only; the same columns are then
  selected from x_test so both arrays stay aligned.

  Parameters:
    x_train: 2-D NumPy array, one row per example and one column per attribute.
    x_test: 2-D NumPy array with the same number of columns as x_train.

  Returns:
    (x_train, x_test) restricted to the non-constant columns of x_train.
    If x_train has no rows, every column counts as constant and is removed.
  """
  # Compare each row with the first one instead of testing variance > 0:
  # for a constant non-zero float column, rounding in the computed mean can
  # yield a tiny positive variance, so the column would wrongly be kept.
  # Slicing with [:1] (rather than [0]) also keeps this valid for zero rows.
  # Note: NaN != NaN, so a column containing NaN is treated as non-constant.
  select = (x_train != x_train[:1]).any(axis=0)
  return x_train[:,select], x_test[:,select]

In [4]:
x_train_new, x_test_new = remove_constant_attributes(x_train, x_test)

# Show the shapes before and after the reduction, one per line.
for attrs in (x_train, x_test, x_train_new, x_test_new):
  print(attrs.shape)

(12000, 784)
(2000, 784)
(12000, 688)
(2000, 688)


## **Question 2** 


Write a program to compare the performance of random forest and multilayer perceptron on the MNIST dataset using both the original dataset and a new version of it with the constant attributes removed. 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def evaluate_model(model, x_train, y_train, x_test, y_test):
  """Train a freshly built model and return its accuracy on the test data.

  Parameters:
    model: a callable taking no arguments (e.g. an estimator class) that
      returns an object with fit(x, y) and predict(x) methods.
    x_train, y_train: attributes and labels used to fit the model.
    x_test, y_test: attributes and labels used to score the fitted model.

  Returns:
    The value of accuracy_score(y_test, predictions).
  """
  estimator = model()  # default hyper-parameters: the callable gets no arguments
  estimator.fit(x_train, y_train)
  predictions = estimator.predict(x_test)
  return accuracy_score(y_test, predictions)

# Estimator classes (not instances): evaluate_model() instantiates each one
# itself, so every evaluation starts from an untrained model.
models = [RandomForestClassifier, MLPClassifier]
model_names = ['Random Forest', 'Multilayer Perceptron']

# (description, training attributes, test attributes) for each dataset variant.
# The labels y_train / y_test are shared by both variants.
datasets = [
  ('original', x_train, x_test),
  ('reduced', x_train_new, x_test_new),
]

for description, train_attrs, test_attrs in datasets:
  # One accuracy per model, in the same order as `models`.
  acc_list = []
  for model_cls, name in zip(models, model_names):
    print('Evaluating',name,'using',description,'data set')
    acc_list.append(evaluate_model(model_cls, train_attrs, y_train, test_attrs, y_test))
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))

  # Report the most accurate model for this dataset variant.
  best = np.argmax(acc_list)
  print('The best model is',model_names[best])
  print('Accuracy = {:6.4f}'.format(acc_list[best]))

Evaluating Random Forest using original data set
Accuracy = 0.9465
Evaluating Multilayer Perceptron using original data set
Accuracy = 0.9550
The best model is Multilayer Perceptron
Accuracy = 0.9550
Evaluating Random Forest using reduced data set
Accuracy = 0.9455
Evaluating Multilayer Perceptron using reduced data set
Accuracy = 0.9555
The best model is Multilayer Perceptron
Accuracy = 0.9555


**Explanation of results:**

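Since the removed attributes are constant across the training set, they carry no information for the models, so the accuracies are essentially unchanged; the small differences observed above are consistent with run-to-run randomness, since neither model is seeded. Running times should be slightly lower because there are fewer attributes to process. 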

## **Question 3.** 

Write a program that determines the best attribute to use in a random forest regressor to predict running times from the GPU running times dataset.

Your program should build and evaluate 14 models, each using a single attribute and then determine which attribute yields the best results.

In [None]:
# Colab-only step (files comes from google.colab): prompts for a file upload.
# Presumably used to provide gpu_running_time.csv, which the next cell reads.
uploaded = files.upload()

In [None]:
# Load the GPU running-time dataset; the path is relative to the working directory.
df = pd.read_csv('gpu_running_time.csv')

In [None]:
# Number of leading columns used as input attributes; the remaining columns
# are averaged into the regression target.
N_ATTRIBUTES = 14

data = df.to_numpy()
X = data[:,:N_ATTRIBUTES]
# Target = row-wise mean of all columns after the attributes.
y = np.mean(data[:,N_ATTRIBUTES:],axis=1)
# Keep only the attribute names so that feature_names[i] describes X[:, i]
# and the target columns are not listed as features.
feature_names = df.columns[:N_ATTRIBUTES]
# NOTE: this rebinds y_train / y_test, which held the MNIST labels earlier in
# the notebook; re-run the MNIST loading cell before re-running those cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4361)

In [None]:
# Mean absolute error of a random forest trained on each attribute alone;
# mae[i] corresponds to column i of X_train / X_test.
mae = []
for i in range(X_train.shape[1]):
  # Fixed seed (same value as the train/test split) so the ranking of
  # attributes is reproducible from run to run.
  model = RandomForestRegressor(random_state=4361)
  # The slice i:i+1 keeps the data 2-D (n_samples, 1) for fit/predict.
  model.fit(X_train[:,i:i+1], y_train)
  pred = model.predict(X_test[:,i:i+1])
  # Documented argument order is (y_true, y_pred).
  err = mean_absolute_error(y_test, pred)
  print('MAE using attribute {} = {:5.2f}'.format(i,err))
  mae.append(err)

# The attribute with the lowest error is the best single predictor.
print('The best attribute is ',np.argmin(mae))