In [None]:
import inspect
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV

from sklearn.inspection import permutation_importance

from sklearn.metrics import make_scorer, roc_auc_score
%matplotlib inline

In [None]:
def score_func(y, y_pred):
    """Return the one-vs-rest ROC AUC for a set of predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Scores passed in by the scorer; ``my_scores`` below requests
        ``predict_proba`` output.

    Returns
    -------
    float
        Value of ``roc_auc_score(y, y_pred, multi_class="ovr")``.
    """
    return roc_auc_score(y, y_pred, multi_class="ovr")


# scikit-learn 1.4 added `response_method` and deprecated `needs_proba` /
# `needs_threshold`; 1.6 removed the old flags (they would be forwarded to
# `score_func` as stray keyword arguments). Inspect the signature instead of
# catching an exception, because older releases silently accept unknown keywords.
if "response_method" in inspect.signature(make_scorer).parameters:
    my_scores = make_scorer(score_func, greater_is_better=True, response_method="predict_proba")
else:
    my_scores = make_scorer(score_func, greater_is_better=True, needs_proba=True, needs_threshold=False)

In [None]:
# Display the current working directory (the next cell changes it with a relative path).
os.getcwd()

In [None]:
# Move into the data directory. The path is relative, so running this cell a
# second time would resolve "../../2_Data" from inside 2_Data itself; the guard
# turns such a re-run into a no-op and keeps the cell idempotent.
if os.path.basename(os.getcwd()) != "2_Data":
    os.chdir("../../2_Data")

In [None]:
# Directory holding the two CSV files. The original machine-specific location
# stays the default; set the DECODE_DATA_DIR environment variable to point
# somewhere else without editing the notebook.
DATA_DIR = os.environ.get("DECODE_DATA_DIR", "/Users/zhengyuanrui/Decoding_SALT/Decode_new/2_Data")
if not os.path.isdir(DATA_DIR):
    # An earlier cell changes the working directory to "../../2_Data", so fall
    # back to it when the configured directory does not exist on this machine.
    DATA_DIR = os.getcwd()

df_noref = pd.read_csv(os.path.join(DATA_DIR, "df_no_ref.csv"))
df_selfref = pd.read_csv(os.path.join(DATA_DIR, "df_self_ref.csv"))

In [None]:
# Preview the first rows of df_noref (loaded from df_no_ref.csv).
df_noref.head()

In [None]:
# Preview the first rows of df_selfref (loaded from df_self_ref.csv).
df_selfref.head()

In [None]:
# Draw missingno's matrix plot for df_noref, with column labels shown, to inspect missing values.
msno.matrix(df_noref, labels=True)

In [None]:
# Draw missingno's matrix plot for df_selfref, with column labels shown, to inspect missing values.
msno.matrix(df_selfref, labels=True)

In [None]:
# Names of the feature columns, selected by position:
# columns 8-10 of df_noref and columns 9-11 of df_selfref.
norefcolc = df_noref.columns[8:11]
selfrefcolc = df_selfref.columns[9:12]

# Feature matrices taken from the same positional slices.
X_norefc = df_noref.iloc[:, 8:11].values
X_selfrefc = df_selfref.iloc[:, 9:12].values

# Target vectors come from the "label" column of each table.
y_noref, y_selfref = df_noref["label"].values, df_selfref["label"].values

In [None]:
# Check how many labels were read from df_noref.
y_noref.shape

In [None]:
# Check how many labels were read from df_selfref.
y_selfref.shape

In [None]:
# Subject identifier of every row; passed as the grouping variable to the
# decoding functions further down.
groups_no, groups_self = (frame["Subject"].values for frame in (df_noref, df_selfref))

In [None]:
# Cross-validation splitter that holds out one group per fold; the groups
# themselves are supplied when .split() is called.
logo = LeaveOneGroupOut()

In [None]:
def rf_within_task(X, y, group, source, n_estimators=500, n_repeats=20, random_state=123):
    """Decode labels within one condition using leave-one-group-out random forests.

    For every fold a ``MinMaxScaler`` + ``RandomForestClassifier`` pipeline is
    fitted on all groups but one and evaluated on the held-out group. Two things
    are computed per fold: permutation importance (scored with the module-level
    ``my_scores`` scorer) and the one-vs-rest ROC AUC of the predicted class
    probabilities.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Labels.
    group : array-like of shape (n_samples,)
        Group (subject) identifier of every sample; defines the folds.
    source : str
        Condition name, written to both the ``source`` and ``target`` columns
        because the model is fitted and evaluated on the same condition.
    n_estimators : int, default=500
        Number of trees in the forest.
    n_repeats : int, default=20
        Number of permutations per feature for the permutation importance.
    random_state : int, default=123
        Seed used for the forest and for the permutation importance.

    Returns
    -------
    tuple of (pandas.DataFrame, list of numpy.ndarray)
        The frame has one row per held-out group with the columns ``subID``,
        ``score``, ``source`` and ``target``; the list holds the matching
        ``importances_mean`` array of every fold, in the same order.
    """
    # The folds are integer positions. Coercing to ndarrays guarantees positional
    # indexing even when pandas objects (label-based indexing) are passed in.
    X, y, group = np.asarray(X), np.asarray(y), np.asarray(group)

    feature_importance = []
    # `source` is the condition the model is fitted on, `target` the one it predicts.
    df_result = dict(subID=[], score=[], source=[], target=[])

    # A local splitter avoids depending on a variable defined in another cell.
    for train, test in LeaveOneGroupOut().split(X, y, groups=group):
        # Every test fold contains exactly one group, so the first unique value is its id.
        df_result["subID"].append(np.unique(group[test])[0])

        rf = make_pipeline(
            MinMaxScaler(),
            RandomForestClassifier(
                n_estimators=n_estimators,
                bootstrap=True,
                random_state=random_state,
                class_weight="balanced",
                criterion="entropy",
                max_samples=0.9,
                n_jobs=-1,
            ),
        )
        model = rf.fit(X=X[train], y=y[train])

        im = permutation_importance(
            model,
            X[test],
            y[test],
            scoring=my_scores,
            n_repeats=n_repeats,
            n_jobs=-1,
            random_state=random_state,
        )
        feature_importance.append(im["importances_mean"])

        y_pred = model.predict_proba(X[test])
        df_result["score"].append(roc_auc_score(y[test], y_pred, multi_class="ovr"))
        df_result["source"].append(source)
        df_result["target"].append(source)

    return pd.DataFrame(df_result), feature_importance

In [None]:
def rf_cross_task(X_source, y_source, X_target, y_target, target_group, source_name, target_name,
                  n_estimators=500, n_repeats=20, random_state=123):
    """Fit a random forest on one condition and evaluate it per subject on another.

    A single ``MinMaxScaler`` + ``RandomForestClassifier`` pipeline is fitted on
    the complete source data. For every unique value in ``target_group`` the
    fitted model is then evaluated on that subject's target samples: permutation
    importance (scored with the module-level ``my_scores`` scorer) and the
    one-vs-rest ROC AUC of the predicted class probabilities.

    Parameters
    ----------
    X_source : array-like of shape (n_source_samples, n_features)
        Features the model is fitted on.
    y_source : array-like of shape (n_source_samples,)
        Labels the model is fitted on.
    X_target : array-like of shape (n_target_samples, n_features)
        Features the model is evaluated on.
    y_target : array-like of shape (n_target_samples,)
        Labels the model is evaluated on.
    target_group : array-like of shape (n_target_samples,)
        Subject identifier of every target sample.
    source_name : str
        Name of the fitted condition, written to the ``source`` column.
    target_name : str
        Name of the predicted condition, written to the ``target`` column.
    n_estimators : int, default=500
        Number of trees in the forest.
    n_repeats : int, default=20
        Number of permutations per feature for the permutation importance.
    random_state : int, default=123
        Seed used for the forest and for the permutation importance.

    Returns
    -------
    tuple of (pandas.DataFrame, list of numpy.ndarray)
        The frame has one row per target subject with the columns ``subID``,
        ``score``, ``source`` and ``target``; the list holds the matching
        ``importances_mean`` array of every subject, in the same order.
    """
    # Coerce to ndarrays so the boolean subject mask below is element-wise and
    # positional for any array-like input (a plain list would compare as a whole).
    X_source, y_source = np.asarray(X_source), np.asarray(y_source)
    X_target, y_target = np.asarray(X_target), np.asarray(y_target)
    target_group = np.asarray(target_group)

    # `source` is the condition the model is fitted on, `target` the one it predicts.
    df_result = dict(subID=[], score=[], source=[], target=[])
    feature_importance = []

    rf = make_pipeline(
        MinMaxScaler(),
        RandomForestClassifier(
            n_estimators=n_estimators,
            bootstrap=True,
            random_state=random_state,
            class_weight="balanced",
            criterion="entropy",
            max_samples=0.9,
            n_jobs=-1,
        ),
    )
    # The model is fitted once on the whole source condition and reused for every subject.
    model = rf.fit(X=X_source, y=y_source)

    for sub in np.unique(target_group):
        idx_sub = target_group == sub
        feature_sub = X_target[idx_sub]
        label_sub = y_target[idx_sub]

        im = permutation_importance(
            model,
            feature_sub,
            label_sub,
            scoring=my_scores,
            n_repeats=n_repeats,
            n_jobs=-1,
            random_state=random_state,
        )
        feature_importance.append(im["importances_mean"])

        y_pred = model.predict_proba(feature_sub)
        score = roc_auc_score(label_sub, y_pred, multi_class="ovr")

        df_result["subID"].append(sub)
        df_result["score"].append(score)
        df_result["source"].append(source_name)
        df_result["target"].append(target_name)

    return pd.DataFrame(df_result), feature_importance

In [None]:
score_noc_rf,  im_noc_rf= rf_within_task(X = X_norefc, y = y_noref, group = groups_no, source="No_Ref")
score_selfc_rf,  im_selfc_rf= rf_within_task(X = X_selfrefc, y = y_selfref, group = groups_self, source="Self_Ref")

In [None]:
# Cross-condition decoding, direction 1: fit on No_Ref, evaluate on Self_Ref.
df_cross1c_rf, im_cross1c_rf = rf_cross_task(
    X_source=X_norefc,
    y_source=y_noref,
    X_target=X_selfrefc,
    y_target=y_selfref,
    target_group=groups_self,
    source_name="No_Ref",
    target_name="Self_Ref",
)
# Cross-condition decoding, direction 2: fit on Self_Ref, evaluate on No_Ref.
df_cross2c_rf, im_cross2c_rf = rf_cross_task(
    X_source=X_selfrefc,
    y_source=y_selfref,
    X_target=X_norefc,
    y_target=y_noref,
    target_group=groups_no,
    source_name="Self_Ref",
    target_name="No_Ref",
)