# Compute performance metrics for the given Y and Y_score without sklearn


## Compute performance metrics for the given data 5_a.csv

In [1]:
#importing library
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv("5_a.csv")
data.head()

Unnamed: 0,y,proba
0,1.0,0.637387
1,1.0,0.635165
2,1.0,0.766586
3,1.0,0.724564
4,1.0,0.889199


In [3]:

probability=data.loc[:,"proba"]

In [4]:
y_hat=[]
for i in range(len(probability)):
    """
    If the probability values given in data set is less than 0.5
    accept it as 0, if probability value is greater than or equal to 0.5 take it as 1
    """
    if probability[i]<0.5:
        y_hat.append(0)
    else:
        y_hat.append(1)

In [5]:

df = pd.DataFrame(y_hat, columns = ["y_pred"])  #concatenate y_hat(pred values) to dataframe
data_f=pd.concat([data,df], axis=1)
data_f.head()

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [6]:

y_act=data_f['y'].tolist()  #converts dataframe "y" column to list

In [7]:
def measures(y_actual, y_pred):
    """
    [This function is used to compute metrics and confusion matrix]
    args: [actaul vaues, predicted values list]
    returns: [True pos,False pos, True neg, False neg, confusion mat]
    """
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    mat=np.zeros((2,2))
    for i in range(len(y_pred)): 
        '''
        if actual and pred value is 1 then incr count of tp, if pred val is 1 and actual
        is not then incr count of fp, if pred val is 0 and actaul is not incr count of fn
        if actual and pred val is 0 then incr val of tn.
        '''
        if y_actual[i]==y_pred[i]==1:
            TP += 1
        if y_pred[i]==1 and y_actual[i]!=y_pred[i]:
            FP += 1
        if y_actual[i]==y_pred[i]==0:
            TN += 1
        if y_pred[i]==0 and y_actual[i]!=y_pred[i]:
            FN += 1
    
    mat[0]=TP,FP
    mat[1]=FN,TN
    return TP, FP, FN,TN,mat

In [8]:

TP1, FP1, FN1,TN1,MAT1=measures(y_act,y_hat)
print(MAT1)

[[10000.   100.]
 [    0.     0.]]


In [9]:

def TPR(TP,FN):
    return (TP)/(TP+FN)
def FPR(FP,TN):
    return (FP)/(TN+FP+1)
def precision(TP,FP):
    return (TP)/(TP+FP)

In [10]:
TPR1=TPR(TP1,FN1)
FPR1=FPR(FP1,TN1)
pr1=precision(TP1,FP1)
recall1=TPR1 #Recall is same as computing TPR

In [11]:
f1_1=2*((pr1*recall1)/(pr1+recall1)) # #compute f1 score
print(f1_1)

0.9950248756218906


In [12]:
total_no_points=TP1+FP1+FN1+TN1 #total no. of points
correctly_class_points=TP1+TN1  #total no. of correctly classified points
accuracy=correctly_class_points/total_no_points
print(accuracy) #accuracy is computed

0.9900990099009901


In [13]:
data_f_roc=data_f.sort_values(by=['proba'],ascending=False)  #sorting data column in desc order
data_f_roc.head()

Unnamed: 0,y,proba,y_pred
1664,1.0,0.899965,1
2099,1.0,0.899828,1
1028,1.0,0.899825,1
9592,1.0,0.899812,1
8324,1.0,0.899768,1


In [14]:
y_actual=data_f_roc['y'].tolist()  #converting to list
y_prob=((data_f_roc['proba'].tolist()))

In [15]:

def cal_thresh(threshold, actual, predicted):
    """
    [This function is use to calculate treshold, every treshold calculated for proba columns
    Where predicted value is compared with treshold and (y_tilda) is computed with value more
    than treshold are given as 1 and rest as 0]
    args: [ treshold value(descending), actual label, pred label]
    returns: [[tpr,fpr] for each treshold] and false neg, false pos for each treshold
    """
    pred_tilda = np.where(predicted >= threshold,1,0)
    TP,FP,FN,TN,_= measures(actual,pred_tilda)
    tpr=  TPR(TP,FN)
    fpr = FPR(FP,TN)
    return fpr,tpr,FN,FP

In [16]:
def roc_curve(actual, predicted, thresholds):
    """
    [This function is use to determine ROC curve to compute AUC]
    args: [actual label, pred label,treshold]
    returns: [[tpr,fpr] for each treshold and false neg and false pos for each treshold]
    """
    tpr=[]
    fpr=[]
    fp,fn={},{}
    for i in thresholds:
        fpr_at_treshold, tpr_at_treshold,FN,FP= cal_thresh(i, actual, predicted)
        tpr.append(tpr_at_treshold)
        fpr.append(fpr_at_treshold)
        fn[i]=FN
        fp[i]=FP
    return  tpr,fpr,fn,fp

In [17]:
tre=sorted(np.unique(y_prob),reverse=True)  #unique treshold in desc order 
tpr,fpr,_,_=roc_curve(y_actual,y_prob,tre)

In [18]:
auc_score=np.trapz(tpr, fpr)  #using np.trapz to compute auc score
print(auc_score)

0.4834643564356435



## Compute performance metrics for the given data 5_b.csv

steps same as task A

In [19]:
data_b=pd.read_csv("5_b.csv")

In [20]:
data_b.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [21]:

probability_b=data_b.loc[:,"proba"]

In [22]:
print(probability_b)

0        0.281035
1        0.465152
2        0.352793
3        0.157818
4        0.276648
           ...   
10095    0.474401
10096    0.128403
10097    0.499331
10098    0.157616
10099    0.296618
Name: proba, Length: 10100, dtype: float64


In [23]:
y_hat_b=[]
for i in range(len(probability_b)):
    if probability_b[i]<0.5:
        y_hat_b.append(0)
    else:
        y_hat_b.append(1)

In [24]:
df_b = pd.DataFrame(y_hat_b, columns = ["y_pred"])
data_f_b=pd.concat([data_b,df_b], axis=1)
data_f_b.head()

Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


In [25]:

y_act_b=data_f_b['y'].tolist()

In [26]:
TP1, FP1, FN1,TN1,MAT1=measures(y_act_b,y_hat_b)
print(MAT1)

[[  55.  239.]
 [  45. 9761.]]


In [27]:
TPR_b=TPR(TP1,FN1)
FPR_b=FPR(FP1,TN1)
pr_b=precision(TP1,FP1)
recall_b=TPR_b

In [28]:
f1_b=2*((pr_b*recall_b)/(pr_b+recall_b)) #f1 is computed
print(f1_b)

0.2791878172588833


In [29]:
total_no_points_b=TP1+FP1+FN1+TN1
correctly_class_points_b=TP1+TN1
accuracy=correctly_class_points_b/total_no_points_b
print(accuracy) #accuracy is computed

0.9718811881188119


In [30]:
data_f_roc_b=data_f_b.sort_values(by=['proba'],ascending=False)

In [31]:

y_actual_b=data_f_roc_b['y'].tolist()
y_prob_b=((data_f_roc_b['proba'].tolist()))

In [32]:

tre_b=sorted(np.unique(y_prob_b),reverse=True)
tpr_b,fpr_b,_,_=roc_curve(y_actual_b,y_prob_b,tre_b)

In [33]:
auc_score_b=np.trapz(tpr_b, fpr_b)
print(auc_score_b) #auc score is computed

0.9376632336766323



# C. Compute the best threshold of probability which gives lowest values of metric A for the given data 5_c.csv

## task similar to task A and task B

In [34]:
data_c=pd.read_csv("5_c.csv")
data_c.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [35]:
probability_c=data_c.loc[:,"prob"]
data_f_roc_c=data_c.sort_values(by=['prob'],ascending=False)
y_actual_c=data_f_roc_c['y'].tolist()
y_prob_c=((data_f_roc_c['prob'].tolist()))

In [36]:
tre_c=sorted(np.unique(y_prob_c),reverse=True)
_, _,fn,fp=roc_curve(y_actual_c,y_prob_c,tre_c)


In [37]:
fn1=list(fn.values())  #false neg and false pos returned by roc, converting to list
fp1=list(fp.values())


In [38]:
A={}
for i,j in enumerate(tre_c):
    """
    To find the best treshold score should be minimum , soo finding optimal false neg
    and alse pos value to get best treshold
    """
    scr=500*fn1[i]+100*fp1[i]
    A[j]=scr


In [39]:
minimum_A_metric=min(A.values())  #getting minimum value from dict A
for i in A:
    if A[i]==minimum_A_metric:
        best_tresh=(i)  #key for the minimum value, i.e best treshold

In [40]:

print(minimum_A_metric) #smallest value for the best treshol
print(best_tresh) #best treshold computed by false neg and false pos

141000
0.2300390278970873


In [41]:

print(fp[best_tresh],fn[best_tresh]) # when false pos is 1028 & false neg is 78, i got best tresh

1020 78


## Compute performance metrics for the given data 5_d.csv

In [42]:
data_d=pd.read_csv("5_d.csv")

In [43]:
data_d.head()


Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [44]:
y=((data_d['y'].tolist()))
y_pred=((data_d['pred'].tolist()))


In [45]:

sqr_err=0.0
no_val=len(y)
for i in range(len(y)):
    sqr_err+=np.sum((y[i]-y_pred[i])**2)

In [46]:
mean_error=sqr_err/no_val#computing mean sqr error
print(mean_error)

177.16569974554707


In [47]:
a_bar=0.0
for i in  range(len(y)):
    #computing average of actual y label
    a_bar+=np.sum(y[i])
a_bar=a_bar/no_val

In [48]:
ape=0.0
for i in range(len(y)):
    #computing mean absolute % error
    ape+=np.sum(((abs(y[i]-y_pred[i]))/a_bar)*100)
mape=ape/no_val #computing mean abs % err
print(mape)

12.912029940093051


In [49]:
ss_t=0.0
for i in range(len(y)):
     #computing total sum of square
    ss_t+=np.sum((y[i]-a_bar)**2)
print(ss_t)

638161080.035662


In [50]:
r_sqr=1-(sqr_err/ss_t)  #sum of square of residual is same as sum of squared error
print(r_sqr)

0.9563582786990964
