In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from autosklearn.classification import AutoSklearnClassifier
import scipy.stats
import pandas as pd
import numpy as np
import util.Data
import util.Plotly as plots
import util.colors.Colors as colors

In [2]:
#dataset
csv_filename = r'../repr/data/final_coordinates-conversion-v2.csv'
df_data = util.Data.import_data(csv_filename)

In [3]:
df_data.shape

(656, 26)

In [4]:
df_data.dtypes

importance        object
author            object
title             object
table_name        object
keywords          object
Left/Right        object
Name              object
Broadman Area     object
t                float64
X(R)               int64
Y(A)               int64
Z(S)               int64
z-score          float64
NV               float64
p value          float64
cluster size     float64
index_col          int64
MNIX               int64
MNIY               int64
MNIZ               int64
BA                object
TALX               int64
TALY               int64
TALZ               int64
BA_name           object
BA_no            float64
dtype: object

## Dataframe valid

In [5]:
mask_outside_all = df_data['BA_name'].str.contains('Outside defined BAs')
df_data_valid = df_data[~mask_outside_all]

In [6]:
df_data_valid.shape

(483, 26)

In [7]:
set_keywords_selection = set(
    ["reward magnitude", "anticipation phase", "outcome phase", "decision phase", "working memory", "viewing",
     "monetary reward", "high gain vs high loss", "taste reward", "social reward", "monetary and taste reward", "food ",
     "high calorie", "low calorie", "positive reward", "negative reward", "context-dependent reward", "risk choice",
     "safe choice"])

In [8]:
df = df_data_valid
counter = {}
for val in set_keywords_selection:
    counter[val] = df['keywords'][df['keywords'].str.contains(val)].count()
counter = {k: v for k, v in sorted(counter.items(), key=lambda item: item[1])}
counter

{'working memory': 0,
 'safe choice': 0,
 'taste reward': 0,
 'monetary and taste reward': 0,
 'high calorie': 5,
 'risk choice': 14,
 'high gain vs high loss': 14,
 'low calorie': 18,
 'decision phase': 18,
 'food ': 18,
 'negative reward': 23,
 'viewing': 24,
 'context-dependent reward': 27,
 'anticipation phase': 76,
 'social reward': 84,
 'positive reward': 97,
 'reward magnitude': 146,
 'monetary reward': 277,
 'outcome phase': 378}

In [9]:
# refine keywords
min_val = 20
set_keywords = [key for key in set_keywords_selection if counter[key] >= min_val]
set_keywords

['positive reward',
 'reward magnitude',
 'monetary reward',
 'outcome phase',
 'anticipation phase',
 'negative reward',
 'context-dependent reward',
 'viewing',
 'social reward']

In [10]:
# significance
def significance(row, df=10):
    if not (pd.isna(row['p value'])):
        return abs(row['p value'])
    elif not (pd.isna(row['z-score'])):
        return scipy.stats.norm.sf(row['z-score'])
    elif not (pd.isna(row['t'])):
        return scipy.stats.t.sf(row['t'], df)
    return 0

In [11]:
df_data_valid['significance'] = df_data_valid.apply(significance, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_valid['significance'] = df_data_valid.apply(significance, axis=1)


In [12]:
# Create y
y = []
x = []
for _, row in df_data_valid.iterrows():
    x.append(np.array([row['MNIX'], row['MNIY'], row['MNIZ']]))
    y.append(
        # [round(1 * (1 - row['significance']), 4) if key in row['keywords'] else 0 for key in set_keywords_selection])
        np.array([1 if key in row['keywords'] else 0 for key in set_keywords]))
y = np.array(y)
x = np.array(x)

In [13]:
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23)

In [14]:
x

array([[-28, -24, -14],
       [-21,  21, -25],
       [  4, -56,  35],
       ...,
       [-59,  -3, -18],
       [ 38,  39, -17],
       [ 56,  -6, -17]])

In [15]:
# Auto-Sklearn Initialization

# time_left_for_this_task : Time limit in seconds to find the optimal configuration
# per_run_time_limit : Time limit in seconds for each model
# ensemble_size: Number of models added to the Ensemble model
# initial_configurations_via_metalearning: "k" configurations to start the Bayesian Optimization
model = AutoSklearnClassifier(time_left_for_this_task=300,
                              per_run_time_limit=30,
                              ensemble_size=1,
                              initial_configurations_via_metalearning=0)
# Init training
model.fit(x_train, y_train)



In [16]:
print(model.sprint_statistics())

auto-sklearn results:
  Dataset name: 2f881543-d296-11ec-8c79-fdce89fc766c
  Metric: f1_macro
  Best validation score: 0.412912
  Number of target algorithm runs: 161
  Number of successful target algorithm runs: 150
  Number of crashed target algorithm runs: 11
  Number of target algorithms that exceeded the time limit: 0
  Number of target algorithms that exceeded the memory limit: 0



In [17]:
print(model.leaderboard())

          rank  ensemble_weight          type      cost  duration
model_id                                                         
160          1              1.0  bernoulli_nb  0.587088  0.433002


In [18]:
print(model.show_models())

{160: {'model_id': 160, 'rank': 1, 'cost': 0.587088389492443, 'ensemble_weight': 1.0, 'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f4b1785cd00>, 'balancing': Balancing(random_state=1, strategy='weighting'), 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f4b1785c610>, 'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f4b17e67340>, 'sklearn_classifier': OneVsRestClassifier(estimator=BernoulliNB(alpha=0.021354685179664025), n_jobs=1)}}


In [19]:
import sklearn.metrics

In [27]:
test_predictions = model.predict(x_test)
print("mse:", sklearn.metrics.mean_squared_error(y_test, test_predictions))
print("acc:", sklearn.metrics.accuracy_score(y_test, test_predictions))

mse: 0.2952815829528158
acc: 0.0273972602739726


In [21]:
test_predictions

array([[1, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 1, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1],
       [0, 1, 1, 0, 1, 0, 0, 0, 1],
       [1, 1, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 1,

In [22]:
y_test

array([[0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 0, 0,

## Multioutput regression

In [34]:
# Create y
y = []
x = []
for _, row in df_data_valid.iterrows():
    x.append(np.array([row['MNIX'], row['MNIY'], row['MNIZ']]))
    y.append(
        [round(1 * (1 - row['significance']), 4) if key in row['keywords'] else 0 for key in set_keywords])
        # np.array([1 if key in row['keywords'] else 0 for key in set_keywords]))
y = np.array(y)
x = np.array(x)

In [35]:
# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=23)

In [36]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
y_predict = MultiOutputRegressor(Ridge(random_state=0)).fit(x_train, y_train).predict(x_test)

In [37]:
y_predict

array([[ 2.17513601e-01,  3.74473954e-01,  4.61725805e-01,
         7.37511149e-01,  1.66453072e-01,  6.52173372e-02,
         2.80306265e-02,  7.61497693e-02,  2.28019043e-01],
       [ 1.39455529e-01,  4.13547154e-01,  3.80080192e-01,
         7.20114340e-01,  1.74409876e-01,  5.73148648e-02,
         2.91465486e-03,  6.90681073e-02,  2.47060649e-01],
       [ 1.08773242e-01,  5.46052179e-01,  2.85072991e-01,
         7.24494847e-01,  2.74803732e-01,  6.49144344e-04,
        -3.84716315e-02,  1.06118048e-01,  2.65330308e-01],
       [ 1.77762453e-01,  4.20589173e-01,  4.08570186e-01,
         7.31916873e-01,  1.93518989e-01,  4.80501110e-02,
         8.74075282e-03,  8.11714349e-02,  2.38367373e-01],
       [ 2.64422488e-01,  2.97416636e-01,  6.94607844e-01,
         7.89567583e-01,  1.94372587e-01,  2.28227852e-02,
         9.18090880e-02,  4.44293367e-02,  1.02468601e-01],
       [ 1.92441415e-01,  3.94143086e-01,  4.59223208e-01,
         7.40907615e-01,  1.88632360e-01,  4.669693

In [38]:
y_test

array([[0.    , 0.    , 0.9998, 0.9998, 0.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 1.    , 0.    , 1.    , 0.    , 0.    , 0.    , 0.    ,
        1.    ],
       [0.    , 0.    , 0.    , 0.995 , 0.    , 0.    , 0.    , 0.995 ,
        0.    ],
       [0.    , 0.    , 0.    , 1.    , 0.    , 0.    , 0.    , 0.    ,
        1.    ],
       [0.    , 0.9995, 0.    , 0.9995, 0.9995, 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.    , 0.95  , 0.95  , 0.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.    , 0.    , 0.9995, 0.    , 0.    , 0.    , 0.    ,
        0.9995],
       [0.    , 0.    , 1.    , 1.    , 0.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.    , 1.    , 1.    , 0.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.    , 0.9999, 0.9999, 0.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.    , 1.    , 0.    , 1.    , 0.    , 0.    , 0.    ,
        0.    ],
       [0.    , 0.   

In [33]:
print("mse:", sklearn.metrics.mean_squared_error(y_test, y_predict))

mse: 0.06577708043429517
