# Test data for possible bad inputs

In [1]:
import pandas as pd
import numpy as np
import importlib
import util.Data as data_util
import util.Plotly as plots
import json

In [2]:
# Re-import util.Data so that edits made to the module on disk are picked up
# without restarting the kernel; the reloaded module object is the cell output.
importlib.reload(data_util)

<module 'util.Data' from 'D:\\scul\\uaic-ub\\licenta\\code\\util\\Data.py'>

# Data

In [3]:
# Location of the coordinates CSV, relative to the notebook's working directory.
# The path contains no backslashes, so a plain string literal is sufficient.
csv_filename = '../data_db/final_coordinates-no_conversion.csv'

In [4]:
# Load the table through the project helper and verify its width before
# running any of the checks below.
df_data = data_util.import_data(csv_filename)

_no_columns = 19  # number of columns the loaded table is expected to have
_found_columns = df_data.shape[1]
assert _found_columns == _no_columns, "The number of columns is not good"

# Show (rows, columns) of the loaded table.
print(df_data.shape)

(1827, 19)


In [5]:
# Quick overview of the dataset. The row count is reported as the number of
# peaks (the label previously read "peeks", a typo).
# NOTE: unique() keeps a missing value as its own entry, so a null title or
# author would be counted once in the totals below.
print("Number of peaks: ", len(df_data))
print("Number of papers: ", len(df_data['title'].unique()))
print("Number of main authors: ", len(df_data['author'].unique()))

Number of peeks:  1827
Number of papers:  61
Number of main authors:  54


In [6]:
# Display the dtype of every column of the loaded table.
df_data.dtypes

importance        string
author            string
title             string
table_name        string
contrast          string
keywords          string
Name              string
Left/Right        string
Broadman Area     string
X(R)               int64
Y(A)               int64
Z(S)               int64
z-score          float64
subjects           int64
cluster size     float64
p value          float64
t                float64
NV               float64
index_col          int64
dtype: object

# Data Check

## Hemisphere

In [7]:
# Hemisphere labels that are accepted; values are compared exactly as stored.
ls_left_right = ["left", "right", "r", "r.", "l", "l/r", "l.", "middle", "m.", "left/right", "bilateral"]

# Step 1: drop the rows where the hemisphere is missing.
mask_nan = df_data['Left/Right'].isna()
df_left_right = df_data.loc[~mask_nan]

# Step 2: drop the rows carrying an accepted label; whatever is left over
# holds a value that is not in the list above.
mask_ba = df_left_right['Left/Right'].isin(ls_left_right)
df_left_right = df_left_right.loc[~mask_ba]

print(df_left_right.shape)
assert len(df_left_right) == 0, "Hemisphere data may contain unwanted values"
# To inspect the raw labels when the assertion fails:
# df_data['Left/Right'].unique()

(0, 19)


## Check columns without null values

In [8]:
# Print the names of all columns in the loaded table.
print(df_data.columns)

Index(['importance', 'author', 'title', 'table_name', 'contrast', 'keywords',
       'Name', 'Left/Right', 'Broadman Area', 'X(R)', 'Y(A)', 'Z(S)',
       'z-score', 'subjects', 'cluster size', 'p value', 't', 'NV',
       'index_col'],
      dtype='object')


In [9]:
# Columns in which a missing value is not acceptable.
ls_col_nulls = [
    'author', 'title', 'table_name', 'contrast', 'keywords',
    'X(R)', 'Y(A)', 'Z(S)', 'subjects',
]

# Stop at the first listed column that contains at least one null.
for col_name in ls_col_nulls:
    has_nulls = df_data[col_name].isna().any()
    assert not has_nulls, f'Column: "{col_name}" contains null values'

## MNI and Talairach Spaces
Count the rows whose keywords mention MNI and the rows whose keywords mention Talairach; the two counts together should equal the total number of rows.

In [10]:
# Count the rows whose keywords mention each coordinate space
# (case-sensitive substring match).
mni_spaces_entries = df_data['keywords'].str.contains('MNI').sum()
tal_spaces_entries = df_data['keywords'].str.contains('talairach').sum()

# Totals used in the report below.
total_no_rows = df_data.shape[0]
points_sum = mni_spaces_entries + tal_spaces_entries

print("MNI space entries", mni_spaces_entries)
print("Talairach space entries", tal_spaces_entries)
print(f"Total number of rows:\t\t\t{total_no_rows} \n"
      f"sum of points with space specified:\t{points_sum}"
      f"\nDifference:\t\t\t\t{total_no_rows - points_sum}")

MNI space entries 1062
Talairach space entries 765
Total number of rows:			1827 
sum of points with space specified:	1827
Difference:				0


In [11]:
# Debug helper (disabled): uncomment to collect the rows whose keywords
# mention neither 'MNI' nor 'talairach'.
# mask_spaces = df_data['keywords'].str.contains('MNI|talairach')
# df_tmp_spaces = df_data[~mask_spaces]

In [12]:
# Every row must mention exactly one coordinate space. Comparing only the two
# totals with the row count is not sufficient: a row tagged with both spaces
# and a row tagged with neither would cancel each other out and the check
# would pass. The test is therefore made per row.
# na=False makes a missing keyword count as "space not specified".
_mask_mni = df_data['keywords'].str.contains('MNI', na=False)
_mask_tal = df_data['keywords'].str.contains('talairach', na=False)
_mask_exactly_one_space = _mask_mni ^ _mask_tal  # True only when exactly one matches
assert _mask_exactly_one_space.all(),\
"There are some entries without the space specified or with multiple spaces specified"

## Other checks



### Cluster size and Number of Voxels

In [13]:
def _fractional_mask(values):
    """Return a boolean mask marking entries that are present but not whole numbers.

    Parameters
    ----------
    values : pd.Series
        Numeric column to inspect; missing entries are allowed.

    Returns
    -------
    pd.Series
        True where the entry is not missing and its remainder modulo 1 is
        not zero; False otherwise (missing entries are always False).
    """
    # `x % 1` is 0 for whole numbers (negative ones included) and NaN for an
    # infinite value, so infinities are flagged here; a per-element int()
    # conversion would raise OverflowError on them instead.
    return values.notna() & values.mod(1).ne(0)


# Rows whose cluster size has a fractional part are treated as suspicious.
mask_cluster_size_float = _fractional_mask(df_data['cluster size'])
df_tmp_cluster = df_data[mask_cluster_size_float]
assert df_tmp_cluster.shape[0] == 0, "There may be a problem in cluster size"

# Same rule for the number of voxels.
mask_nv_float = _fractional_mask(df_data['NV'])
df_tmp_nv = df_data[mask_nv_float]
assert df_tmp_nv.shape[0] == 0, "There are fractional number of voxels"

### Peak points coordinates

In [14]:
#TODO assert in intervals for x,y and z

### Contrast

In [15]:
# Tally the values of the 'contrast' column with the project helper and
# pretty-print the tally as JSON.
# NOTE(review): judging by the saved output, order=1 appears to sort by
# count - confirm against util.Data.df_counter.
counter_contrast = data_util.df_counter(df_data, 'contrast', order=1)
# default=str turns any value json cannot encode natively into its string form.
contrast_report = json.dumps(counter_contrast, indent=2, default=str)
print(contrast_report)

{
  ">nan_values<": 0,
  "punishment > no reward": "2",
  "win after losing streak > w1": "2",
  "loss after winning streak > l1": "2",
  "penalty during run of failure": "2",
  "social reward > monetary reward": "3",
  "precue > retrocue": "3",
  "ar switch > stay and sr switch > stay": "4",
  "all female faces > all male faces": "4",
  "w4 > w1": "4",
  "lose modulation": "4",
  "negative correlation with reward level": "4",
  "movement > no movement and reward > no reward": "4",
  "sub-id > baseline": "4",
  "decision-making > control task": "5",
  "risky gambles > safe gambles": "5",
  "beautiful male > average male": "5",
  "l4 > l1": "5",
  "all w > all l": "5",
  "win modulation": "5",
  "disadvantageous > advantageous": "5",
  "reward during run of success": "5",
  "lose > win": "5",
  "punishment > reward": "5",
  "monetary reward magnitude": "5",
  "beautiful faces > average faces": "6",
  "reward during run of success and penalty during run of failure": "6",
  "monetary rewa

### Report titles

In [16]:
# Tally the values of the 'title' column with the project helper and
# pretty-print the tally as JSON.
# NOTE(review): judging by the saved output, order=0 appears to sort by
# title - confirm against util.Data.df_counter.
counter_title = data_util.df_counter(df_data, 'title', order=0)
# default=str turns any value json cannot encode natively into its string form.
title_report = json.dumps(counter_title, indent=2, default=str)
print(title_report)

{
  ">nan_values<": 0,
  "A Rapid fMRI Task Battery for Mapping of Visual, Motor, Cognitive and Emotional Function": "13",
  "A functional neuroimaging study of motivation and executive function": "41",
  "A region of mesial prefrontal cortex tracks monetarily rewarding outcomes - characterization with rapid event-related fMRI": "21",
  "Abnormal neural activity in partially remitted late-onset depression An fMRI study of one-back working memory task": "58",
  "Activity in human reward-sensitive brain areas is strongly context dependent": "10",
  "Alterations in the functional neural circuitry supporting flexible choice behavior in autism spectrum disorders": "27",
  "An fMRI study of working memory for schematic facial expressions": "24",
  "Anterior cingulate cortex differently modulates fronto-parietal functional connectivity between resting-state and working memory tasks": "26",
  "Anticipation of Increasing Monetary Reward Selectively Recruits Nucleus Accumbens": "24",
  "Anticipa