In [None]:
import importlib

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve,auc,roc_auc_score
from sklearn.preprocessing import MaxAbsScaler

import utils

In [None]:
# Re-execute the utils module so that edits made to utils.py after the first
# import are picked up without restarting the kernel.
importlib.reload(utils) 


## This code extracts the feature matrix from the DTI data

In [None]:
    
def get_D_T_in_DTI(G):
    '''
    Collect the drugs and targets that take part in at least one interaction.

    G is the interaction matrix (rows are drugs, columns are targets).
    Returns a pair of sets: the row indices and the column indices at which
    G has a nonzero entry.
    '''
    row_idx, col_idx = G.nonzero()
    return set(row_idx), set(col_idx)

def calculate_score(d,t,D_interact,T_interact,aD_sim,aT_sim,r=0.6):
    '''
    Score the pair (d, t) once for every combination of a drug-similarity
    matrix from aD_sim and a target-similarity matrix from aT_sim.

    For a single (D_sim, T_sim) combination the score is the largest value of
        D_sim[i, d]**r * T_sim[j, t]**(1 - r)
    taken over every i in D_interact and j in T_interact with i != d and
    j != t. The score is 0 when no such (i, j) exists or when none of the
    values is greater than 0.

    Returns a list holding one score per combination, ordered with aD_sim as
    the outer loop and aT_sim as the inner loop.
    '''
    def best_neighbour_score(drug_sim, target_sim):
        # Largest weighted similarity product over the admissible neighbours.
        best = 0
        for i in D_interact:
            for j in T_interact:
                # the query drug and the query target are never used as neighbours
                if i == d or j == t:
                    continue
                candidate = drug_sim[i,d]**r * target_sim[j,t]**(1-r)
                if candidate > best:
                    best = candidate
        return best

    return [best_neighbour_score(D_sim, T_sim)
            for D_sim in aD_sim
            for T_sim in aT_sim]
def get_features(G,aD_sim,aT_sim):
    '''
    Build one feature vector for every (drug, target) cell of the DTI matrix G.

    The drugs and targets that have at least one interaction in G are looked
    up once; calculate_score is then evaluated for every cell (i, j) of G
    against all similarity matrices in aD_sim and aT_sim.

    Returns a list of G.shape[0] * G.shape[1] score lists in row-major order
    (all columns of row 0 first, then row 1, ...).
    '''
    # get the D and Ts in interaction
    interacting_drugs, interacting_targets = get_D_T_in_DTI(G)

    return [
        calculate_score(drug, target, interacting_drugs, interacting_targets, aD_sim, aT_sim)
        for drug in range(G.shape[0])
        for target in range(G.shape[1])
    ]



## The function in the following block drives the classification process

In [None]:
def run_classification(data,labels,test_idx):
    '''
    Fit a class-balanced logistic regression on every sample that is NOT
    listed in test_idx, score the samples that are, and report AUPR / AUROC.

    Parameters
    ----------
    data : array-like with one row of features per sample.
    labels : array-like with one class label per row of data. The positive
        class is expected to be 1 (pos_label=1 is used for the PR curve).
    test_idx : sequence of int row indices held out for testing; all other
        rows are used for training.

    Returns
    -------
    (AUPR, AUROC) : area under the precision-recall curve and area under the
        ROC curve on the held-out rows. Both values are also printed.

    Raises
    ------
    ValueError : if test_idx is empty (there would be nothing to evaluate).
    '''
    data = np.asarray(data)
    labels = np.asarray(labels)
    test_idx = np.asarray(test_idx)
    if test_idx.size == 0:
        raise ValueError("test_idx is empty: there are no samples to evaluate on")

    # Every row that is not a test row is a training row. A boolean mask finds
    # them in a single pass instead of one list-membership test per sample.
    is_train = np.ones(len(data), dtype=bool)
    is_train[test_idx] = False
    train_idx = np.flatnonzero(is_train)

    X_train, X_test = data[train_idx], data[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    print(X_train.shape)
    # scale the data; the scaler is fitted on the training rows only and then
    # re-used unchanged for the test rows
    max_abs_scaler = MaxAbsScaler()
    X_train_scaled = max_abs_scaler.fit_transform(X_train)
    X_test_scaled = max_abs_scaler.transform(X_test)

    clf = LogisticRegression(class_weight='balanced')
    print (X_train_scaled.shape,y_train.shape)
    clf.fit(X_train_scaled, y_train)

    # Column 1 of predict_proba is the probability of clf.classes_[1], which is
    # the positive class when the labels are 0/1.
    scores_testing = clf.predict_proba(X_test_scaled)[:, 1]

    precision_testing, recall_testing, _ = precision_recall_curve(y_test, scores_testing, pos_label=1)

    AUPR = auc(recall_testing, precision_testing)
    AUROC = roc_auc_score(y_test,scores_testing)

    print("AUPR is: {}".format(AUPR))
    print("AUROC is: {}".format(AUROC))

    return AUPR, AUROC


## Loading the DTI matrix and similarity matrices

In [None]:
# Input files: the drug-target interaction (DTI) list and the drug-drug /
# target-target similarity files.
DTI_file = 'data/nr_admat_dgc_mat_2_line.txt'
DD_similarities = 'data/nr_D_similarities.txt'
TT_similarities = 'data/nr_T_similarities.txt'

# Load drug (D) and target (T) metadata from the DTI file.
# NOTE(review): dDs / dTs are passed below wherever a drug / target lookup is
# needed, so they are presumably id -> index maps; confirm in utils.get_D_T_info.
D,T,dDs,dTs,diDs,diTs = utils.get_D_T_info(DTI_file)

# Read the DTIs themselves.
DTI = utils.get_edge_list(DTI_file) # this returns a list of interactions

# Create the adjacency matrix from the interaction list.
DTI_adj = utils.get_adj_matrix_from_relation(DTI,dDs,dTs)
# col is needed later to turn an (i, j) pair into the flat index i*col+j.
row,col = DTI_adj.shape
# One label per matrix cell.
# NOTE(review): the cross-validation cell indexes labels with i*col+j, so
# mat2vec is assumed to flatten row by row; confirm in utils.mat2vec.
labels = utils.mat2vec(DTI_adj)

# Load the drug similarity data (later used as a list of matrices: aDSim[0], aDSim + [...]).
aDSim = utils.get_similarities(DD_similarities,dDs)

# Load the target similarity data (later used as a list of matrices: aTSim[0], aTSim + [...]).
aTSim = utils.get_similarities(TT_similarities,dTs)


In [None]:
# Generate cross validation data for pair mode.
# NOTE(review): the result is indexed with 2 below, which matches the single
# seed passed in seeds=[2]; presumably cv_data is keyed by seed. Confirm in
# utils.cross_validation.
cv_data = utils.cross_validation(DTI_adj,seeds=[2], cv=1, num=10)
for fold in cv_data[2]:
    # fold[0] is the masking matrix, fold[1] the (i, j) test pairs of this fold.
    W = fold[0] # get the masking matrix for DTI
    DTI_train = DTI_adj*W # mask test pairs from DTI
    
    # Add a network-based similarity, derived from the masked training matrix
    # only, using the first drug / target similarity matrix.
    DT_impute_D = utils.impute_zeros(DTI_train,aDSim[0])
    DT_impute_T = utils.impute_zeros(np.transpose(DTI_train),aTSim[0])

    # NOTE(review): "GIP" presumably stands for Gaussian interaction profile;
    # the expected orientation of the first argument for "d" vs "t" should be
    # confirmed in utils.Get_GIP_profile.
    GIP_D = utils.Get_GIP_profile(np.transpose(DT_impute_D),"d")
    GIP_T = utils.Get_GIP_profile(DT_impute_T,"t")
    
    # Append the fold-specific matrices to the loaded similarity lists
    # (list concatenation creates new lists; aDSim / aTSim are not modified).
    Final_sim_D = aDSim + [GIP_D]
    Final_sim_T = aTSim + [GIP_T]
    features = get_features(DTI_train,Final_sim_D,Final_sim_T)
    # Flatten each (i, j) test pair to the row-major index used by
    # get_features, which emits one feature vector per cell, row by row.
    test_idx = [i*col+j for (i,j) in fold[1]]
    run_classification(features,labels,test_idx)

## Exercise
- How can we improve the accuracy further?
- Can we look into other data sets? Try looking into: https://www.cin.ufpe.br/~acan/kronrlsmkl/data.zip 
