In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV

from sklearn.inspection import permutation_importance

from sklearn.metrics import make_scorer, roc_auc_score

from sklearn.compose import ColumnTransformer

import os
%matplotlib inline

In [2]:
def score_func(y, y_pred):
    """Return the one-vs-rest ROC AUC of ``y_pred`` against the labels ``y``.

    Thin wrapper around ``roc_auc_score`` that fixes ``multi_class="ovr"`` so
    it can be handed to ``make_scorer``.
    """
    return roc_auc_score(y, y_pred, multi_class="ovr")

# Scorer built from score_func that is fed predicted probabilities; it is the
# `scoring` argument of the permutation_importance calls further down.
# NOTE(review): `needs_proba` / `needs_threshold` are deprecated in recent
# scikit-learn releases in favour of `response_method` — confirm against the
# installed version before upgrading.
my_scores = make_scorer(
    score_func=score_func,
    greater_is_better=True,
    needs_proba=True,
    needs_threshold=False,
)

In [3]:
# Show the working directory before the relative os.chdir in the next cell.
os.getcwd()

'/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/1_Script/2_model/5_Valence012_pre'

In [4]:
# Move into the result directory so that the relative file names passed to
# to_csv / np.save later in the notebook are written there.
# The move is guarded: if the working directory is already the result folder
# (e.g. this cell is re-run), climbing three more levels would fail or end up
# in the wrong place, so nothing is done in that case.
_result_subdir = os.path.join("3_Result", "Valence012_pre", "1_Logistic", "1_Current")
if not os.path.normpath(os.getcwd()).endswith(_result_subdir):
    os.chdir(os.path.join("..", "..", "..", _result_subdir))

In [5]:
# Confirm the working directory after os.chdir.
os.getcwd()

'/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/3_Result/Valence012_pre/1_Logistic/1_Current'

In [6]:
# Directory holding the input CSV files. The default is the original location;
# set the DECODE_SALT_DATA_DIR environment variable to run the notebook on a
# machine where the data lives elsewhere.
DATA_DIR = os.environ.get(
    "DECODE_SALT_DATA_DIR",
    "/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/2_Data",
)

df_noref = pd.read_csv(os.path.join(DATA_DIR, "df_no_ref012.csv"))
df_selfref = pd.read_csv(os.path.join(DATA_DIR, "df_self_ref012.csv"))

In [7]:
# Preview df_noref; the column layout matters because features are later
# selected by position (iloc[:, 8:11]).
df_noref.head()

Unnamed: 0,Subject,ExpNo,BlockNo,TrialNo,TrialNo1b,TrialNo2b,TrialNo3b,TrialNo4b,ACC,RT,...,Valence2b,ACC3b,RT3b,ismatch3b,Valence3b,ACC4b,RT4b,ismatch4b,Valence4b,label
0,1001,Exp1a,1,16,15,14,13,12,1,1065,...,1,1,1049,1,0,1,865,0,0,2
1,1001,Exp1a,1,36,35,34,33,32,1,929,...,2,1,913,0,0,1,633,1,0,2
2,1001,Exp1a,1,49,48,47,46,45,0,880,...,0,1,865,0,2,1,592,1,0,1
3,1001,Exp1a,1,50,49,48,47,46,0,888,...,0,1,648,1,0,1,865,0,2,2
4,1001,Exp1a,1,51,50,49,48,47,0,777,...,1,1,776,0,0,1,648,1,0,0


In [8]:
# Preview df_selfref; it has an extra "Identity" column, which is why its
# features are later taken from iloc[:, 9:12] instead of 8:11.
df_selfref.head()

Unnamed: 0,Subject,ExpNo,BlockNo,TrialNo,TrialNo1b,TrialNo2b,TrialNo3b,TrialNo4b,Identity,ACC,...,Valence2b,ACC3b,RT3b,ismatch3b,Valence3b,ACC4b,RT4b,ismatch4b,Valence4b,label
0,3010,Exp3a,1,5,4,3,2,1,Self,1,...,2,0,660,1,2,1,822,1,1,1
1,3010,Exp3a,1,6,5,4,3,2,Other,1,...,1,0,608,1,2,0,660,1,2,2
2,3010,Exp3a,1,7,6,5,4,3,Self,1,...,1,1,747,1,1,0,608,1,2,0
3,3010,Exp3a,1,8,7,6,5,4,Other,0,...,2,1,657,0,1,1,747,1,1,0
4,3010,Exp3a,1,9,8,7,6,5,Self,1,...,0,1,631,0,2,1,657,0,1,0


In [9]:
# Current-trial predictors, selected by column position. Judging from the
# displayed heads these are ACC, RT and a third column (called "ismatch" in
# the name lists below) — TODO confirm against the full column list.
# The offset differs by one because df_selfref carries an "Identity" column.
X_norefc = df_noref.iloc[:, 8:11].to_numpy()
X_selfrefc = df_selfref.iloc[:, 9:12].to_numpy()

# Targets: the "label" column of each frame.
y_noref = df_noref["label"]
y_selfref = df_selfref["label"]

# Names used to label the importance tables. They start with "RT" although it
# is the second raw column: the ColumnTransformer in the decoding functions
# emits its explicitly transformed column (index 1) before the passthrough
# remainder, so the fitted feature order is [col 1, col 0, col 2].
norefcolc = ["RT", "ACC", "ismatch"]
selfrefcolc = list(norefcolc)

In [10]:
# Inspect the raw feature matrix taken from df_noref (columns in file order).
X_norefc

array([[   1, 1065,    0],
       [   1,  929,    0],
       [   0,  880,    1],
       ...,
       [   1,  753,    0],
       [   1,  665,    1],
       [   1,  955,    1]])

In [11]:
# Inspect the raw feature matrix taken from df_selfref (columns in file order).
X_selfrefc

array([[  1, 657,   0],
       [  1, 631,   0],
       [  1, 678,   0],
       ...,
       [  1, 922,   1],
       [  1, 802,   0],
       [  0, 730,   0]])

In [12]:
# Subject ID of every trial; used as the grouping variable for
# leave-one-subject-out splitting and for per-subject scoring.
groups_no = df_noref["Subject"].to_numpy()
groups_self = df_selfref["Subject"].to_numpy()

In [13]:
# Leave-one-group-out splitter; lr_within_task reads this module-level object
# and passes the subject IDs as groups.
logo = LeaveOneGroupOut()

In [14]:
def lr_within_task(X, y, group, source):
    """Leave-one-subject-out logistic-regression decoding within one condition.

    For each distinct value in ``group`` a fresh pipeline is fitted on all
    other groups and evaluated on the held-out group.

    Parameters
    ----------
    X : ndarray of shape (n_trials, n_features)
        Raw feature matrix, indexed by position. Column 1 is standardised by
        the ColumnTransformer and placed first; the remaining columns are
        passed through after it. Coefficients and importances therefore refer
        to the order [column 1, column 0, column 2, ...].
    y : array-like of shape (n_trials,)
        Class labels, aligned with the rows of ``X`` by position.
    group : array-like of shape (n_trials,)
        Group (subject) identifier of every trial.
    source : str
        Condition name; written to both the ``source`` and ``target`` columns
        because the model is fitted and tested on the same condition.

    Returns
    -------
    scores : pandas.DataFrame
        One row per held-out group with columns ``subID``, ``score``
        (one-vs-rest ROC AUC), ``source`` and ``target``.
    feature_importance : list of ndarray
        Mean permutation importance per feature for each held-out group.
    feature_coef : list of ndarray
        ``coef_`` of the fitted logistic regression for each fold.
    """
    # The split yields *positions*. Indexing a pandas Series with an integer
    # array is label-based, which only coincides with positions for a default
    # RangeIndex, so work on plain arrays instead.
    y = np.asarray(y)
    group = np.asarray(group)

    # source = condition the model is fitted on, target = condition predicted
    df_result = dict(subID=[], score=[], source=[], target=[])
    feature_importance = []
    feature_coef = []

    # Local splitter so the function does not rely on a global defined in
    # another cell.
    splitter = LeaveOneGroupOut()
    ct = ColumnTransformer(transformers=[("rt", StandardScaler(), [1])],
                           remainder='passthrough')

    for train, test in splitter.split(X, y, groups=group):
        test_sub = np.unique(group[test])[0]

        logi = make_pipeline(
            MinMaxScaler(),
            LogisticRegressionCV(Cs=np.logspace(-6, 3, 7), cv=5, class_weight='balanced',
                                 random_state=123, max_iter=5000, multi_class="ovr"))

        # Scaling parameters are learned on the training subjects only.
        X_train = ct.fit_transform(X[train])
        X_test = ct.transform(X[test])
        y_train, y_test = y[train], y[test]

        logi.fit(X=X_train, y=y_train)
        feature_coef.append(logi.steps[-1][-1].coef_)

        im = permutation_importance(logi, X_test, y_test, scoring=my_scores,
                                    n_repeats=20, n_jobs=-1, random_state=123)
        feature_importance.append(im['importances_mean'])

        y_pred = logi.predict_proba(X_test)
        score = roc_auc_score(y_test, y_pred, multi_class='ovr')

        df_result["subID"].append(test_sub)
        df_result['score'].append(score)
        df_result['source'].append(source)
        df_result['target'].append(source)

    return pd.DataFrame(df_result), feature_importance, feature_coef


In [15]:
def lr_cross_task(X_source, y_source, X_target, y_target, target_group, source_name, target_name):
    """Fit on one condition and score every subject of another condition.

    A single pipeline is fitted on all source trials; it is then evaluated
    separately on the trials of each distinct value in ``target_group``.

    Parameters
    ----------
    X_source : ndarray of shape (n_source_trials, n_features)
        Raw training features. Column 1 is standardised by the
        ColumnTransformer and placed first, the other columns follow, so
        coefficients and importances refer to [column 1, column 0, column 2, ...].
    y_source : array-like of shape (n_source_trials,)
        Training labels.
    X_target : ndarray of shape (n_target_trials, n_features)
        Raw test features, same column layout as ``X_source``.
    y_target : array-like of shape (n_target_trials,)
        Test labels, aligned with the rows of ``X_target`` by position.
    target_group : array-like of shape (n_target_trials,)
        Group (subject) identifier of every target trial.
    source_name, target_name : str
        Condition names written to the ``source`` / ``target`` columns.

    Returns
    -------
    scores : pandas.DataFrame
        One row per target group with columns ``subID``, ``score``
        (one-vs-rest ROC AUC), ``source`` and ``target``.
    feature_importance : list of ndarray
        Mean permutation importance per feature for each target group.
    feature_coef : list of ndarray
        The fitted ``coef_`` repeated once per target group (the model is
        fitted only once, so all entries are the same array).
    """
    # Plain arrays keep the boolean mask and the labels aligned by position
    # even if the caller passes pandas objects with a non-default index.
    y_target = np.asarray(y_target)
    target_group = np.asarray(target_group)

    # source = condition the model is fitted on, target = condition predicted
    df_result = dict(subID=[], score=[], source=[], target=[])
    feature_importance = []
    feature_coef = []

    ct = ColumnTransformer(transformers=[("rt", StandardScaler(), [1])],
                           remainder='passthrough')
    logi = make_pipeline(
        MinMaxScaler(),
        LogisticRegressionCV(Cs=np.logspace(-6, 3, 7), cv=5, class_weight='balanced',
                             random_state=123, max_iter=5000, multi_class="ovr"))

    # Scaling parameters come from the source condition only.
    X_train = ct.fit_transform(X_source)
    logi.fit(X=X_train, y=y_source)

    # The model does not change inside the loop; look the coefficients up once.
    coef = logi.steps[-1][-1].coef_

    for sub in np.unique(target_group):
        idx_sub = target_group == sub
        feature_test = ct.transform(X_target[idx_sub])
        label_sub = y_target[idx_sub]

        im = permutation_importance(logi, feature_test, label_sub, scoring=my_scores,
                                    n_repeats=20, n_jobs=-1, random_state=123)
        feature_importance.append(im['importances_mean'])
        feature_coef.append(coef)

        y_pred = logi.predict_proba(feature_test)
        score = roc_auc_score(label_sub, y_pred, multi_class="ovr")

        df_result['subID'].append(sub)
        df_result["score"].append(score)
        df_result["source"].append(source_name)
        df_result["target"].append(target_name)

    return pd.DataFrame(df_result), feature_importance, feature_coef

In [16]:
# Within-condition decoding, run separately on each data set.
score_noc, im_noc, coef_noc = lr_within_task(
    X=X_norefc,
    y=y_noref,
    group=groups_no,
    source="No_Ref",
)
score_selfc, im_selfc, coef_selfc = lr_within_task(
    X=X_selfrefc,
    y=y_selfref,
    group=groups_self,
    source="Self_Ref",
)

In [17]:
# Fit on the No_Ref data, evaluate on each Self_Ref subject.
df_cross1c, im_cross1c, coef_cross1c = lr_cross_task(
    X_source=X_norefc,
    y_source=y_noref,
    X_target=X_selfrefc,
    y_target=y_selfref,
    target_group=groups_self,
    source_name="No_Ref",
    target_name="Self_Ref",
)
# Fit on the Self_Ref data, evaluate on each No_Ref subject.
df_cross2c, im_cross2c, coef_cross2c = lr_cross_task(
    X_source=X_selfrefc,
    y_source=y_selfref,
    X_target=X_norefc,
    y_target=y_noref,
    target_group=groups_no,
    source_name="Self_Ref",
    target_name="No_Ref",
)

In [18]:
# Stack the two within-condition score tables; each keeps its own row index,
# so index values repeat across the two halves.
df_score = pd.concat([score_noc, score_selfc], axis=0, ignore_index=False)

In [19]:
# Written relative to the current working directory; the row index is
# included as the first, unnamed column.
df_score.to_csv("LR_withinscore_current.csv", index=True)

In [20]:
# Stack the two cross-condition score tables; each keeps its own row index,
# so index values repeat across the two halves.
df_cross_score = pd.concat([df_cross1c, df_cross2c], axis=0, ignore_index=False)

In [21]:
# Written relative to the current working directory; the row index is
# included as the first, unnamed column.
df_cross_score.to_csv("LR_crossscore_current.csv", index=True)

In [22]:
# One row per held-out subject, one column per feature. `subj_idx` is a
# running 1-based counter in the order the importances were collected, not the
# original subject ID.
df_im_no = pd.DataFrame(im_noc, columns=norefcolc).assign(
    subj_idx=np.arange(1, len(im_noc) + 1),
    source="No_Ref",
    target="No_Ref",
)

df_im_self = pd.DataFrame(im_selfc, columns=selfrefcolc).assign(
    subj_idx=np.arange(1, len(im_selfc) + 1),
    source="Self_Ref",
    target="Self_Ref",
)

In [23]:
# Reshape to long format (one row per subject x feature) and stack both
# conditions.
im_no_long = df_im_no.melt(id_vars=["subj_idx", "source", "target"])
im_self_long = df_im_self.melt(id_vars=["subj_idx", "source", "target"])
im_within = pd.concat([im_no_long, im_self_long], axis=0)
im_within

Unnamed: 0,subj_idx,source,target,variable,value
0,1,No_Ref,No_Ref,RT,0.093288
1,2,No_Ref,No_Ref,RT,0.058117
2,3,No_Ref,No_Ref,RT,0.051267
3,4,No_Ref,No_Ref,RT,0.023523
4,5,No_Ref,No_Ref,RT,0.111493
...,...,...,...,...,...
580,191,Self_Ref,Self_Ref,ismatch,-0.000354
581,192,Self_Ref,Self_Ref,ismatch,0.007385
582,193,Self_Ref,Self_Ref,ismatch,0.000115
583,194,Self_Ref,Self_Ref,ismatch,0.007685


In [24]:
# Written relative to the current working directory, row index included.
im_within.to_csv("importance_within_current.csv", index=True)

In [25]:
# One row per target subject, one column per feature. `subj_idx` is a running
# 1-based counter in the order the importances were collected, not the
# original subject ID.
df_im_cross1 = pd.DataFrame(im_cross1c, columns=selfrefcolc).assign(
    subj_idx=np.arange(1, len(im_cross1c) + 1),
    source="No_Ref",
    target="Self_Ref",
)

df_im_cross2 = pd.DataFrame(im_cross2c, columns=norefcolc).assign(
    subj_idx=np.arange(1, len(im_cross2c) + 1),
    source="Self_Ref",
    target="No_Ref",
)

In [26]:
# Reshape to long format (one row per subject x feature) and stack both
# transfer directions.
im_cross1_long = df_im_cross1.melt(id_vars=["subj_idx", "source", "target"])
im_cross2_long = df_im_cross2.melt(id_vars=["subj_idx", "source", "target"])
im_cross = pd.concat([im_cross1_long, im_cross2_long], axis=0)
im_cross

Unnamed: 0,subj_idx,source,target,variable,value
0,1,No_Ref,Self_Ref,RT,0.040489
1,2,No_Ref,Self_Ref,RT,0.026083
2,3,No_Ref,Self_Ref,RT,0.041534
3,4,No_Ref,Self_Ref,RT,0.012493
4,5,No_Ref,Self_Ref,RT,0.042985
...,...,...,...,...,...
487,160,Self_Ref,No_Ref,ismatch,0.006197
488,161,Self_Ref,No_Ref,ismatch,0.011965
489,162,Self_Ref,No_Ref,ismatch,0.004133
490,163,Self_Ref,No_Ref,ismatch,-0.001995


In [27]:
# Written relative to the current working directory, row index included.
im_cross.to_csv("importance_cross_current.csv", index=True)

In [28]:
# Persist the collected coefficient lists, one .npy file per analysis, in the
# current working directory.
for _fname, _coefs in (
    ('coef_no_current.npy', coef_noc),
    ('coef_self_current.npy', coef_selfc),
    ('coef_cross1_current.npy', coef_cross1c),
    ('coef_cross2_current.npy', coef_cross2c),
):
    np.save(_fname, _coefs)

In [29]:
# Show the directory the relative output files above were written to.
os.getcwd()

'/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/3_Result/Valence012_pre/1_Logistic/1_Current'