In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import missingno as msno

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV

from sklearn.compose import ColumnTransformer

from sklearn.inspection import permutation_importance

from sklearn.metrics import make_scorer, roc_auc_score
import os
%matplotlib inline

In [2]:
def score_func(y, y_pred):
    score = roc_auc_score(y, y_pred, multi_class="ovr")
    return score

my_scores = make_scorer(score_func=score_func, greater_is_better=True, needs_proba=True, needs_threshold=False)

In [3]:
os.getcwd()

'/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/1_Script/2_model/5_Valence012_pre'

In [4]:
os.chdir("../../../3_Result/Valence012_pre/1_Logistic/2_Trials-Back")

In [5]:
os.getcwd()

'/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/3_Result/Valence012_pre/1_Logistic/2_Trials-Back'

In [6]:
df_noref = pd.read_csv("/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/2_Data/df_no_ref012.csv")
df_selfref = pd.read_csv("/Users/zhengyuanrui/Decoding_SALT/Decode_SALT/2_Data/df_self_ref012.csv")

In [7]:
df_noref.head()

Unnamed: 0,Subject,ExpNo,BlockNo,TrialNo,TrialNo1b,TrialNo2b,TrialNo3b,TrialNo4b,ACC,RT,...,Valence2b,ACC3b,RT3b,ismatch3b,Valence3b,ACC4b,RT4b,ismatch4b,Valence4b,label
0,1001,Exp1a,1,16,15,14,13,12,1,1065,...,1,1,1049,1,0,1,865,0,0,2
1,1001,Exp1a,1,36,35,34,33,32,1,929,...,2,1,913,0,0,1,633,1,0,2
2,1001,Exp1a,1,49,48,47,46,45,0,880,...,0,1,865,0,2,1,592,1,0,1
3,1001,Exp1a,1,50,49,48,47,46,0,888,...,0,1,648,1,0,1,865,0,2,2
4,1001,Exp1a,1,51,50,49,48,47,0,777,...,1,1,776,0,0,1,648,1,0,0


In [8]:
df_selfref.head()

Unnamed: 0,Subject,ExpNo,BlockNo,TrialNo,TrialNo1b,TrialNo2b,TrialNo3b,TrialNo4b,Identity,ACC,...,Valence2b,ACC3b,RT3b,ismatch3b,Valence3b,ACC4b,RT4b,ismatch4b,Valence4b,label
0,3010,Exp3a,1,5,4,3,2,1,Self,1,...,2,0,660,1,2,1,822,1,1,1
1,3010,Exp3a,1,6,5,4,3,2,Other,1,...,1,0,608,1,2,0,660,1,2,2
2,3010,Exp3a,1,7,6,5,4,3,Self,1,...,1,1,747,1,1,0,608,1,2,0
3,3010,Exp3a,1,8,7,6,5,4,Other,0,...,2,1,657,0,1,1,747,1,1,0
4,3010,Exp3a,1,9,8,7,6,5,Self,1,...,0,1,631,0,2,1,657,0,1,0


In [9]:
X_norefb = df_noref.iloc[:, 11:-1].values
X_selfrefb = df_selfref.iloc[:, 12:-1].values


y_noref = df_noref["label"].values
y_selfref = df_selfref["label"].values

norefcolb = ["RT1b", "RT2b", "RT3b", "RT4b", "ACC1b", "ismatch1b", "Valence1b", "ACC2b", "ismatch2b", "Valence2b", "ACC3b", "ismatch3b", "Valence3b", "ACC4b", "ismatch4b", "Valence4b"]
selfrefcolb = ["RT1b", "RT2b", "RT3b", "RT4b", "ACC1b", "ismatch1b", "Valence1b", "ACC2b", "ismatch2b", "Valence2b", "ACC3b", "ismatch3b", "Valence3b", "ACC4b", "ismatch4b", "Valence4b"]

In [10]:
X_selfrefb.shape

(108263, 16)

In [11]:
y_noref.shape

(92824,)

In [12]:
y_selfref.shape

(108263,)

In [13]:
groups_no = df_noref["Subject"].values
groups_self = df_selfref["Subject"].values

In [14]:
logo = LeaveOneGroupOut()

In [15]:
def lr_within_task(X, y, group, source):
    df_result = dict(subID=[], score=[], source=[], target=[])# source拟合的，target预测的condition
    feature_importance = []
    feature_coef = []
    ct = ColumnTransformer(transformers=[("rt", StandardScaler(), [1])],
                       remainder='passthrough')
    for train, test in logo.split(X, y, groups=group):
        test_sub = np.unique(group[test])[0]
        df_result["subID"].append(test_sub)
        
        logi = make_pipeline(
            MinMaxScaler(), 
            LogisticRegressionCV(Cs = np.logspace(-6, 3, 7), cv = 5, class_weight='balanced', 
                                 random_state=123, max_iter=5000, multi_class="ovr"))
        
        X_train = ct.fit_transform(X[train])
        X_test = ct.transform(X[test])

        logi.fit(X=X_train, y=y[train])
        feature_coef.append(logi.steps[-1][-1].coef_)
        im = permutation_importance(logi, X_test, y[test], scoring=my_scores, n_repeats=20, n_jobs=-1, random_state=123)
        feature_importance.append(im['importances_mean'])
        y_pred = logi.predict_proba(X_test)
        score = roc_auc_score(y[test], y_pred, multi_class='ovr')

        df_result['score'].append(score)
        df_result['source'].append(source)
        df_result['target'].append(source)

    return pd.DataFrame(df_result), feature_importance, feature_coef


In [16]:
def lr_cross_task(X_source, y_source, X_target, y_target, target_group, source_name, target_name):
    df_result = dict(subID=[], score=[], source=[], target=[])# source拟合的，target预测的condition
    feature_importance = []
    feature_coef = []
    ct = ColumnTransformer(transformers=[("rt", StandardScaler(), [1])],
                       remainder='passthrough')
    logi = make_pipeline(
            MinMaxScaler(), 
            LogisticRegressionCV(Cs = np.logspace(-6, 3, 7), cv = 5, class_weight='balanced', 
                                 random_state=123, max_iter=5000, multi_class="ovr"))

    X_train = ct.fit_transform(X_source)

    logi.fit(X=X_train, y=y_source)


    for sub in np.unique(target_group):
        idx_sub = target_group == sub
        feature_sub = X_target[idx_sub]
        label_sub = y_target[idx_sub]

        feature_test = ct.transform(feature_sub)

        im = permutation_importance(logi, feature_test, label_sub, scoring=my_scores, n_repeats=20, n_jobs=-1, random_state=123)
        feature_importance.append(im['importances_mean'])
        feature_coef.append(logi.steps[-1][-1].coef_)
        y_pred = logi.predict_proba(feature_test)
        score = roc_auc_score(label_sub, y_pred, multi_class="ovr")

        df_result['subID'].append(sub)
        df_result["score"].append(score)
        df_result["source"].append(source_name)
        df_result["target"].append(target_name)


    return pd.DataFrame(df_result), feature_importance, feature_coef

In [17]:
score_no, im_no, coef_no = lr_within_task(X = X_norefb, y = y_noref, group = groups_no, source="No_Ref")
score_self, im_self, coef_self = lr_within_task(X = X_selfrefb, y = y_selfref, group = groups_self, source="Self_Ref")

In [18]:
df_im_no = pd.DataFrame(np.array(im_no), columns=norefcolb)
df_im_no

Unnamed: 0,RT1b,RT2b,RT3b,RT4b,ACC1b,ismatch1b,Valence1b,ACC2b,ismatch2b,Valence2b,ACC3b,ismatch3b,Valence3b,ACC4b,ismatch4b,Valence4b
0,-0.001778,0.011782,0.000417,0.051604,-0.005562,0.000848,0.002658,0.052138,0.000565,0.000380,0.001543,0.027658,-0.006515,-0.001192,0.002427,0.006434
1,0.000264,0.012529,-0.001356,0.025751,0.004142,-0.000165,-0.003110,-0.015767,0.004991,0.000239,0.003152,0.034631,0.003607,-0.000258,-0.000541,0.012826
2,-0.000280,-0.000370,0.002025,0.048197,0.000819,-0.000104,0.002667,0.038542,0.002201,0.000168,-0.001670,0.016159,0.000051,0.000195,-0.003184,0.023382
3,0.000274,0.003664,-0.000570,0.015464,0.005247,-0.000504,-0.000702,0.046440,-0.004033,0.000843,-0.000925,0.029704,-0.003841,0.000074,-0.001684,0.024408
4,0.000423,-0.000218,0.000970,0.016339,0.006180,-0.000063,-0.002132,0.024257,0.002440,0.000132,0.004817,-0.003063,0.003305,0.000711,-0.000224,-0.003293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,-0.000167,0.001676,0.000405,0.013717,-0.000809,-0.000228,-0.003100,0.021917,-0.001624,-0.000322,0.000741,0.015133,0.002072,0.000156,-0.003569,0.014111
160,0.000763,0.003170,-0.000104,0.019412,0.000420,0.000342,0.001472,0.028685,-0.000012,-0.000048,0.000279,0.011992,-0.001564,-0.000261,0.001594,0.017969
161,0.000241,0.001362,0.001244,0.002525,0.002427,-0.000046,-0.003519,0.012670,0.000325,0.000057,0.001339,0.020607,-0.000868,0.000354,0.000461,0.016459
162,0.000071,-0.000767,0.000005,0.015272,0.000143,0.000747,-0.000033,0.010727,-0.000178,-0.000221,0.000365,0.006650,0.000039,-0.000351,-0.000617,0.016823


In [19]:
#No ref to self ref trial back
df_cross1, im_cross1, coef_cross1 = lr_cross_task(X_source=X_norefb, y_source=y_noref, X_target=X_selfrefb, y_target=y_selfref, target_group=groups_self, source_name="No_Ref", target_name="Self_Ref")
#self to no ref trial back
df_cross2, im_cross2, coef_cross2 = lr_cross_task(X_source=X_selfrefb, y_source=y_selfref, X_target=X_norefb, y_target=y_noref, target_group=groups_no, source_name="Self_Ref", target_name="No_Ref")

In [20]:
df_score = pd.concat([score_no, score_self])

In [21]:
df_score.to_csv("LR_withinscore.csv")

In [22]:
df_cross_score = pd.concat([df_cross1, df_cross2])

In [23]:
df_cross_score.to_csv("LR_crossscore.csv")

In [24]:
df_im_no = pd.DataFrame(im_no, columns=norefcolb)
df_im_no["subj_idx"] = np.arange(1, len(im_no)+1)
df_im_no["source"] = "No_Ref"
df_im_no["target"] = "No_Ref"

df_im_self = pd.DataFrame(im_self, columns=selfrefcolb)
df_im_self["subj_idx"] = np.arange(1, len(im_self)+1)
df_im_self["source"] = "Self_Ref"
df_im_self["target"] = "Self_Ref"

In [25]:
im_no_long = pd.melt(df_im_no, id_vars=["subj_idx", "source", "target"])
im_self_long = pd.melt(df_im_self, id_vars=["subj_idx", "source", "target"])
im_within = pd.concat([im_no_long, im_self_long])
im_within

Unnamed: 0,subj_idx,source,target,variable,value
0,1,No_Ref,No_Ref,RT1b,-0.001778
1,2,No_Ref,No_Ref,RT1b,0.000264
2,3,No_Ref,No_Ref,RT1b,-0.000280
3,4,No_Ref,No_Ref,RT1b,0.000274
4,5,No_Ref,No_Ref,RT1b,0.000423
...,...,...,...,...,...
3115,191,Self_Ref,Self_Ref,Valence4b,0.107269
3116,192,Self_Ref,Self_Ref,Valence4b,0.127222
3117,193,Self_Ref,Self_Ref,Valence4b,0.114245
3118,194,Self_Ref,Self_Ref,Valence4b,0.116064


In [26]:
im_within.to_csv("importance_within.csv")

In [27]:
df_im_cross1 = pd.DataFrame(im_cross1, columns=selfrefcolb)
df_im_cross1["subj_idx"] = np.arange(1, len(im_cross1)+1)
df_im_cross1["source"] = "No_Ref"
df_im_cross1["target"] = "Self_Ref"

df_im_cross2 = pd.DataFrame(im_cross2, columns=norefcolb)
df_im_cross2["subj_idx"] = np.arange(1, len(im_cross2)+1)
df_im_cross2["source"] = "Self_Ref"
df_im_cross2["target"] = "No_Ref"

In [28]:
im_cross1_long = pd.melt(df_im_cross1, id_vars=["subj_idx", "source", "target"])
im_cross2_long = pd.melt(df_im_cross2, id_vars=["subj_idx", "source", "target"])
im_cross = pd.concat([im_cross1_long, im_cross2_long])
im_cross

Unnamed: 0,subj_idx,source,target,variable,value
0,1,No_Ref,Self_Ref,RT1b,-0.000967
1,2,No_Ref,Self_Ref,RT1b,-0.000179
2,3,No_Ref,Self_Ref,RT1b,0.000184
3,4,No_Ref,Self_Ref,RT1b,-0.001013
4,5,No_Ref,Self_Ref,RT1b,0.000190
...,...,...,...,...,...
2619,160,Self_Ref,No_Ref,Valence4b,0.012581
2620,161,Self_Ref,No_Ref,Valence4b,0.015394
2621,162,Self_Ref,No_Ref,Valence4b,0.021446
2622,163,Self_Ref,No_Ref,Valence4b,0.019197


In [29]:
im_cross.to_csv("importance_cross.csv")

In [30]:
np.save('coef_no.npy', coef_no)
np.save('coef_self.npy', coef_self)
np.save('coef_cross1.npy', coef_cross1)
np.save('coef_cross2.npy', coef_cross2)