In [1]:
import numpy as np

In [2]:
def ProbabilityToClassLabel(dataset):
    '''
    This function determines class label from probability score
    '''
    cleaned_data1 = dataset.copy()
    #Here we are considering class label is +ve if probability score is greater than or equal to 0.5; otherwise 
    #class label is 0
    for i in range(0, len(cleaned_data1)):
        if cleaned_data1[i][1] >= 0.5:
            cleaned_data1[i][1] = 1
        else:
            cleaned_data1[i][1] = 0
    return cleaned_data1

In [3]:
def Confusion_Matrix(data):
    '''
    This function computes the four elements of the confusion matrix for binary classification
    '''
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for i in range(0, len(data)):
        if data[i][0] == 1 and data[i][1] == 1:
            TP += 1
        elif data[i][0] == 0 and data[i][1] == 1:
            FP += 1
        elif data[i][0] == 1 and data[i][1] == 0:
            FN += 1
        else:
            TN += 1
    return TP, FN, FP, TN

In [4]:
def Recall(tp, fn):
    '''
    This function computes the recall given the values of True Positive(TP) and False Negative(FN)
    '''
    return tp / (tp + fn)

In [5]:
def Precision(tp, fp):
    '''
    This function computes the precision given the values of True Positive(TP) and Fasle Positive(FP)
    '''
    return tp / (tp + fp)

In [6]:
def F1_Score(precision, recall):
    '''
    This function computes the F1_score given the values of precision and recall
    '''
    return 2 * precision * recall / (precision + recall)

In [7]:
def Accuracy(tp, fn, fp, tn):
    '''
    This function computes the accuracy given the values of True Positive(TP), Fasle Negative(FN), False Positive(FP), and True Negative(TN)
    '''
    return (tp + tn) / (tp + fn + fp + tn)

In [8]:
def ClassLabelAccordingToThreshold(probability_score, thresholds):
    '''
    This function computes the class labels from the threshold values
    '''
    predicted_class_label = []
    for thresh in thresholds: #To iterate through each unique threshold value
        lst = []
        for val in probability_score:
            if val >= thresh:
                lst.append(1)
            else:
                lst.append(0)
        predicted_class_label.append(lst) #At the end of the inner loop the list of class labels corressponding to
        #a perticular threshold is appended to the predicted_class_label which is a list of lists                
    return predicted_class_label

In [9]:
def Compute_AUC(dataset, predicted_class_label):
    '''
    This function computes the AUC value
    '''
    actual_class_label = dataset[:, 0]  #Getting actual class label of the data points
    tpr_lst = [] 
    fpr_lst = []
    for i in range(0, len(predicted_class_label)):
        TP, FN, FP, TN = Confusion_Matrix(np.c_[actual_class_label, np.array(predicted_class_label[i])])
        TPR = TP / (TP + FN)
        FPR = FP / (TN + FP)
        tpr_lst.append(TPR)
        fpr_lst.append(FPR)
    AUC = np.trapz(np.array(tpr_lst), np.array(fpr_lst))
    return AUC

A.Performance metrics for the given data 5_a.csv

In [10]:
#Loading data into a numpy array 
data_a = np.genfromtxt('5_a.csv', delimiter = ',')

In [11]:
print(data_a)

[[       nan        nan]
 [1.         0.63738662]
 [1.         0.63516504]
 ...
 [1.         0.77772367]
 [1.         0.84603622]
 [1.         0.67950667]]


In [12]:
#Deleting the first element which has 'nan' value from the  array  'data_a'
cleaned_data_a = np.delete(data_a, 0, 0)
print(cleaned_data_a)

[[1.         0.63738662]
 [1.         0.63516504]
 [1.         0.76658559]
 ...
 [1.         0.77772367]
 [1.         0.84603622]
 [1.         0.67950667]]


In [13]:
#Storing probability scores into 'proba_a'
proba_a = cleaned_data_a[:, 1]
print(proba_a)

[0.63738662 0.63516504 0.76658559 ... 0.77772367 0.84603622 0.67950667]


In [14]:
predicted_class_label_from_prob_a = ProbabilityToClassLabel(cleaned_data_a)
print(predicted_class_label_from_prob_a)

[[1. 1.]
 [1. 1.]
 [1. 1.]
 ...
 [1. 1.]
 [1. 1.]
 [1. 1.]]


In [15]:
TP, FN, FP, TN = Confusion_Matrix(predicted_class_label_from_prob_a)

In [16]:
print(TP)

10000


In [17]:
print(FN)

0


In [18]:
print(FP)

100


In [19]:
print(TN)

0


In [20]:
recall_a = Recall(TP, FN)

In [21]:
precision_a = Precision(TP, FP)

In [22]:
F1_score_a = F1_Score(precision_a, recall_a)

In [23]:
print('The F1 score for the dataset 5_a.csv is ', F1_score_a)

The F1 score for the dataset 5_a.csv is  0.9950248756218906


In [24]:
accuracy_a = Accuracy(TP, FN, FP, TN)
print('The accuracy for the dataset 5_a.csv is ', accuracy_a)

The accuracy for the dataset 5_a.csv is  0.9900990099009901


In [25]:
thresholds_a = sorted(list(set(proba_a)), reverse = True) #Computing the threshold values

In [28]:
predicted_class_label_from_threshold_a = ClassLabelAccordingToThreshold(proba_a, thresholds_a)

In [29]:
AUC_a = Compute_AUC(cleaned_data_a, predicted_class_label_from_threshold_a)

In [30]:
print('The AUC value of the dataset 5_a.csv is ', AUC_a)

The AUC value of the dataset 5_a.csv is  0.48829900000000004


B.Performance metrics for the given data 5_b.csv

In [26]:
data_b = np.genfromtxt('5_b.csv', delimiter = ',')
print(data_b)

[[       nan        nan]
 [0.         0.28103453]
 [0.         0.46515177]
 ...
 [0.         0.49933109]
 [0.         0.15761569]
 [0.         0.2966183 ]]


In [27]:
#Deleting the first element which has 'nan' value from the  array  'data_a'
cleaned_data_b = np.delete(data_b, 0, 0)
print(cleaned_data_b)

[[0.         0.28103453]
 [0.         0.46515177]
 [0.         0.35279298]
 ...
 [0.         0.49933109]
 [0.         0.15761569]
 [0.         0.2966183 ]]


In [28]:
proba_b = cleaned_data_b[:, 1]  #Storing probability scores into proba_b
print(proba_b)

[0.28103453 0.46515177 0.35279298 ... 0.49933109 0.15761569 0.2966183 ]


In [29]:
predicted_class_label_from_prob_b = ProbabilityToClassLabel(cleaned_data_b)
print(predicted_class_label_from_prob_b)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [30]:
TP, FP, FN, TN = Confusion_Matrix(predicted_class_label_from_prob_b)

In [31]:
print(TP)

55


In [32]:
print(FP)

45


In [33]:
print(FN)

239


In [34]:
print(TN)

9761


In [35]:
recall_b = Recall(TP, FN)

In [36]:
precision_b = Precision(TP, FP)

In [37]:
F1_score_b = F1_Score(precision_b, recall_b)

In [38]:
print('The F1_score for the dataset 5_b.csv is ', F1_score_b)

The F1_score for the dataset 5_b.csv is  0.2791878172588833


In [39]:
accuracy_b = Accuracy(TP, FN, FP, TN)

In [40]:
print('The accuracy for the dataset 5_b.csv is ', accuracy_b)

The accuracy for the dataset 5_b.csv is  0.9718811881188119


In [41]:
thresholds_b = sorted(list(set(proba_b)), reverse = True)  #Computing the thresholds values

In [42]:
predicted_class_label_from_thresholds_b = ClassLabelAccordingToThreshold(proba_b, thresholds_b)

In [43]:
AUC_b = Compute_AUC(cleaned_data_b, predicted_class_label_from_thresholds_b)

In [44]:
print('The AUC value for the dataset 5_b is ', AUC_b)

The AUC value for the dataset 5_b is  0.9377570000000001


C.The code for computing the best threshold of probability which gives lowest values of metric A for the given data 5_c.csv

𝐴 = 500 × number of false negative + 100 × numebr of false positive

In [45]:
data_c = np.genfromtxt('5_c.csv', delimiter = ',')
print(data_c)

[[       nan        nan]
 [0.         0.45852068]
 [0.         0.50503693]
 ...
 [1.         0.65916054]
 [1.         0.45626546]
 [1.         0.65916054]]


In [64]:
#Deleting the first element which has 'nan' value from the  array  'data_c'
cleaned_data_c = np.delete(data_c, 0, 0)
print(cleaned_data_c)

[[0.         0.45852068]
 [0.         0.50503693]
 [0.         0.41865174]
 ...
 [1.         0.65916054]
 [1.         0.45626546]
 [1.         0.65916054]]


In [68]:
proba_c = cleaned_data_c[:, 1]  #Storing probability scores into proba_c

In [60]:
thresholds_c = sorted(list(set(proba_c)), reverse = True) #Computing the thresholds values

In [61]:
predicted_class_label_from_thresholds_c = ClassLabelAccordingToThreshold(proba_c, thresholds_c)

In [70]:
A = []
for i in range(0, len(predicted_class_label_from_thresholds_c)):
    TP, FN, FP, TN = Confusion_Matrix(np.c_[cleaned_data_c[:, 0], np.array(predicted_class_label_from_thresholds_c[i])])
    A.append(500 * FN + 100 * FP)

In [71]:
sorted_A = sorted(A) #Sorting A into non decreasing order
best_threshold = thresholds_c[A.index(sorted_A[0])]

In [72]:
print('The best threshold of the metric A is ', best_threshold)

The best threshold of the metric A is  0.2300390278970873


D.The code for computing performance metrics(for regression) for the given data 5_d.csv

In [77]:
def MeanSquaredError(data):
    '''
    This function computes the mean squared error
    '''
    sum_squared_error = 0
    for i in range(0, len(data)):
        sum_squared_error += (data[i][0] - data[i][1]) ** 2 #sum_squared_error accumulates the sum of squared errors
    return sum_squared_error / len(data)

In [78]:
def MeanAbsolutePercentageError(data):
    '''
    This function computes the mean absolute percentage error
    '''
    sum_absolute_error = 0
    sum_actual_value = 0
    for i in range(0, len(data)):
        sum_absolute_error += abs(data[i][0] - data[i][1]) #sum_absolute_error accumulates the sum of absolute values of the errors 
        sum_actual_value += data[i][0]    #sum_actual_value accumulates the sum of the actual values
    return sum_absolute_error / sum_actual_value * 100

In [85]:
def ComputeRSquared(data):
    '''
    This function computes the R squared
    '''
    actual_sum = 0
    for i in range(0, len(data)):
        actual_sum += data[i][0]
    actual_mean = actual_sum / len(data)  #actual_mean contains the mean of the actual values
    ss_total = 0
    ss_res = 0
    for i in range(0, len(data)):
        ss_total += (data[i][0] - actual_mean) ** 2 #ss_total accumulates the sum of squared errors using simple mean model
        ss_res += (data[i][0] - data[i][1]) ** 2 #ss_res accumulates the sum of squared errors using the model used for regression
    return 1 - ss_res / ss_total

In [86]:
#Loading the dataset 5_d.csv into the numpy array data_d
data_d = np.genfromtxt('5_d.csv', delimiter = ',')
print(data_d)

[[ nan  nan]
 [101. 100.]
 [120. 100.]
 ...
 [106.  93.]
 [105. 101.]
 [ 81. 104.]]


In [87]:
#Deleting the first element which has 'nan' value from the  array  'data_a'
cleaned_data_d = np.delete(data_d, 0, 0)
print(cleaned_data_d)

[[101. 100.]
 [120. 100.]
 [131. 113.]
 ...
 [106.  93.]
 [105. 101.]
 [ 81. 104.]]


In [88]:
mse = MeanSquaredError(cleaned_data_d)
print('The mean squared error is ',mse)

The mean squared error is  177.16569974554707


In [89]:
mape = MeanAbsolutePercentageError(cleaned_data_d)
print('The mean absolute percentage error is ',mape)

The mean absolute percentage error is  12.91202994009687


In [90]:
rs = ComputeRSquared(cleaned_data_d)
print('The R squared error is ',rs)

The R squared error is  0.9563582786990964
