In [151]:
from math import *
import numpy as np
import matplotlib.pyplot as plt

### Generating Data

In [154]:
def generateData(means, cov, size):

    x1 = np.random.multivariate_normal(means[0], cov[0], size//3) 
    x2 = np.random.multivariate_normal(means[1], cov[1], size//3+1) 
    x3 = np.random.multivariate_normal(means[2], cov[2], size//3)
    
    y1 = 0 * np.ones(size//3, dtype=int)
    y2 = 1 * np.ones(size//3 + 1, dtype=int)
    y3 = 2 * np.ones(size//3, dtype=int)
    
    x_train = np.vstack([x1, x2, x3])
    y_train = np.hstack([y1, y2, y3])
    
    x1 = np.random.multivariate_normal(means[0], cov[0], size//3) 
    x2 = np.random.multivariate_normal(means[1], cov[1], size//3+1) 
    x3 = np.random.multivariate_normal(means[2], cov[2], size//3) 
    
    y1 = 0 * np.ones(size//3, dtype=int)
    y2 = 1 * np.ones(size//3 + 1, dtype=int)
    y3 = 2 * np.ones(size//3, dtype=int)
    
    x_test = np.vstack([x1, x2, x3])
    y_test = np.hstack([y1, y2, y3])
    
    return x_train, y_train, x_test, y_test
    

### Calculate Error

In [157]:
def calculateError(y_test, predictions):
    sampleSize = np.size(y_test)
    errors = np.size(np.where(y_test != predictions))
    return (errors/sampleSize) * 100

In [159]:
means = [
    [0, 0, 0],
    [1, 2, 2],
    [3, 3, 4]
    ]
cov = [
    [
        [0.8, 0.2, 0.1],
        [0.2, 0.8, 0.2],
        [0.1, 0.2, 0.8]
    ],
    [
        [0.6, 0.01, 0.01],
        [0.01, 0.8, 0.01],
        [0.01, 0.01, 0.6]
    ],
    [
        [0.6, 0.1, 0.1],
        [0.1, 0.6, 0.1],
        [0.1, 0.1, 0.6]
    ],
]
size = 1000

### Euclidean Classifier

In [162]:
def euclideanClassifier(x_train, y_train, x_test, y_test):

    x1 = x_train[y_train == 0]
    x2 = x_train[y_train == 1]
    x3 = x_train[y_train == 2]
    means = np.array([np.mean(x1, 0), np.mean(x2, 0), np.mean(x3, 0)])


    dist1 = np.sqrt(np.sum(np.square(np.repeat([means[0]], repeats=np.shape(x_test)[0], axis=0))))
    dist2 = np.sqrt(np.sum(np.square(np.repeat([means[1]], repeats=np.shape(x_test)[0], axis=0))))
    dist3 = np.sqrt(np.sum(np.square(np.repeat([means[2]], repeats=np.shape(x_test)[0], axis=0))))
    euclideanDistances = np.array([dist1, dist2, dist3])
    predictions = np.where( euclideanDistances == np.min(euclideanDistances, 0))[0]
    
    return predictions

x_train, y_train, x_test, y_test = generateData(means, cov, size)

predictions = euclideanClassifier(x_train, y_train, x_test, y_test)
error_e = calculateError(y_test, predictions)
accuracy = 100 - error_e  

print(f'Euclidean Accuracy: {accuracy}%')


Euclidean Accuracy: 33.3%


### Mahalanobis Classifier

In [174]:
def mahalanobisClassifier(cov, x_train, y_train, x_test, y_test):
    
    x1 = x_train[y_train == 0]
    x2 = x_train[y_train == 1]
    x3 = x_train[y_train == 2]
    means = np.array([np.mean(x1, 0), np.mean(x2, 0), np.mean(x3, 0)])

    mahalanobisProbabilities = [[] for i in range(3)]
        
    for row in x_test:
        for i in range(3):
            xm = np.matrix(row - means[i])
            md = float(xm * np.linalg.inv(cov[i]) * xm.T)
            mahalanobisProbabilities[i].append(md)
            
    mahalanobisDistances = np.array(mahalanobisProbabilities)
    predictions = np.where( mahalanobisDistances == np.min(mahalanobisDistances, 0))[0]
    
    return predictions
    
x_train, y_train, x_test, y_test = generateData(means, cov, size)

predictions = mahalanobisClassifier(cov, x_train, y_train, x_test, y_test)

error_m = calculateError(y_test, predictions)
accuracy = 100 - error_m  
print(f'Mahalanobis Accuracy: {accuracy}%')


Mahalanobis Accuracy: 98.3%


### Bayesian Classifier

In [168]:
from numpy.linalg import det

def bayesianClassifier(cov, x_train, y_train, x_test, y_test):
    
    x1 = x_train[y_train == 0]
    x2 = x_train[y_train == 1]
    x3 = x_train[y_train == 2]
    means = np.array([np.mean(x1, 0), np.mean(x2, 0), np.mean(x3, 0)])

    bayesianProbabilities = [[] for i in range(3)]

    for row in x_test:
        for i in range(3):
            xm = np.matrix(row-means[i])
            md = -1/2 * float(xm * np.linalg.inv(cov[i]) * xm.T)
            D =  np.power(2*np.pi, means[i].shape[0]/2) * np.power(det(cov[i]),1/2)
            P = 1/D * np.exp(md)
            bayesianProbabilities[i].append(P)
            
    bayesianProbabilities = np.array(bayesianProbabilities)
    predictions = np.where( bayesianProbabilities == np.max(bayesianProbabilities, 0))[0]
    
    return predictions
    
x_train, y_train, x_test, y_test = generateData(means, cov, size)
predictions = bayesianClassifier(cov, x_train, y_train, x_test, y_test)

error_b = calculateError(y_test, predictions)
accuracy = 100 - error_b
print(f'Bayesian Accuracy: {accuracy}%')


Bayesian Accuracy: 98.0%


### Error Probability

In [178]:
print(f'Error in Euclidean Classifier: {error_e}%')
print(f'Error in Mahalanobis Classifier: {error_m}%')
print(f'Error in Bayesian Classifier: {error_b}%')

Error in Euclidean Classifier: 66.7%
Error in Mahalanobis Classifier: 1.7000000000000002%
Error in Bayesian Classifier: 2.0%


### Considering the California Housing dataset, design a linear regression model considering each feature with non zero values, and report the best feature and model accordng to the R2 metric.

In [210]:
import sklearn.datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd

### Importing DataSets

In [212]:
caldata = sklearn.datasets.fetch_california_housing()
print(caldata.data.shape, caldata.target.shape)
print(caldata.feature_names)

(20640, 8) (20640,)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [221]:
X_california = pd.DataFrame(caldata.data, columns=caldata.feature_names)
y_california = caldata.target


features = X_california.columns
r2_scores = []
adjusted_r2_scores = []
ssr_list = []
sse_list = []
sst_list = []
p=8

#samples
n = len(y_california)

for feature in features:
    
    X_feature = X_california[[feature]]
    
    model = LinearRegression()
    model.fit(X_feature, y_california)
    
    y_pred = model.predict(X_feature)
    
    #SST, SSR, SSE
    y_mean = np.mean(y_california)
    sst = np.sum((y_california - y_mean) ** 2)  
    ssr = np.sum((y_pred - y_mean) ** 2)  
    sse = np.sum((y_california - y_pred) ** 2)  
    
  
    r2 = ssr/sst
    adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))
    
  
    r2_scores.append(r2)
    adjusted_r2_scores.append(adjusted_r2)
    ssr_list.append(ssr)
    sse_list.append(sse)
    sst_list.append(sst)
    

In [223]:
results = pd.DataFrame({
    'Feature': features,
    'R^2': r2_scores,
    'Adjusted R^2': adjusted_r2_scores,
    'SSR': ssr_list,
    'SSE': sse_list,
    'SST': sst_list
})


In [231]:
print("\n Feature:")
print(results)


 Feature:
      Feature       R^2  Adjusted R^2           SSR           SSE  \
0      MedInc  0.473447      0.473243  13011.851252  14471.346942   
1    HouseAge  0.011156      0.010773    306.610949  27176.587245   
2    AveRooms  0.023088      0.022709    634.539851  26848.658343   
3   AveBedrms  0.002181      0.001794     59.939149  27423.259045   
4  Population  0.000608      0.000220     16.698975  27466.499219   
5    AveOccup  0.000563      0.000176     15.485814  27467.712380   
6    Latitude  0.020782      0.020402    571.160921  26912.037273   
7   Longitude  0.002113      0.001726     58.070066  27425.128128   

            SST  
0  27483.198194  
1  27483.198194  
2  27483.198194  
3  27483.198194  
4  27483.198194  
5  27483.198194  
6  27483.198194  
7  27483.198194  


In [229]:

best_feature = results.sort_values(by='R^2', ascending=False).iloc[0]

print("\nBest Feature:")
print(best_feature)



Best Feature:
Feature               MedInc
R^2                 0.473447
Adjusted R^2        0.473243
SSR             13011.851252
SSE             14471.346942
SST             27483.198194
Name: 0, dtype: object
