# MUW Neuroscience Seminar WS 23/24
## Group Work: CalMS21 Dataset

## Preprocessing Goal

The originally nested data of the Mouse Social Behaviour Challenge is flattened into a single table with one row per video frame, keeping only the keypoints of the mouse body parts and the behaviour annotations.

In [1]:
import numpy as np
import pandas as pd

In [2]:
def load_task1_data(data_path):
  """
  Read one task 1 .npy file and return its sequences plus the label vocabulary.

  Args:
      data_path: Path to a .npy file holding a pickled dictionary whose
          'annotator-id_0' entry maps sequence ids to sequence dictionaries.

  Returns:
      A (dataset, vocabulary) tuple: the 'annotator-id_0' dictionary, and the
      'vocab' entry from the metadata of its first sequence. The vocabulary
      tells you how to map behavior names to class ids; it is the same for
      all sequences in this dataset, so any sequence key would do.
  """
  # NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
  annotator_data = np.load(data_path, allow_pickle=True).item()['annotator-id_0']
  # Get any sequence key.
  first_sequence_id = list(annotator_data.keys())[0]
  return annotator_data, annotator_data[first_sequence_id]['metadata']['vocab']


# Check where you created the files in the loading notebook: both paths
# below must point at the .npy files written there.
training_data, vocab = load_task1_data(
    'data/calms21_task1_train.npy'
)
# Only the vocabulary returned with the training split is kept; the one
# returned with the test split is discarded.
test_data, _ = load_task1_data(
    'data/calms21_task1_test.npy'
)

In [3]:
# Quick overview of what was loaded: the first three sequence ids, the
# behaviour vocabulary, and how many sequences each split contains.
print("Sample dataset keys: ", list(training_data)[:3])
print("Vocabulary: ", vocab)

print("Number of train Sequences: ", len(training_data))
print("Number of test Sequences: ", len(test_data))

Sample dataset keys:  ['task1/train/mouse001_task1_annotator1', 'task1/train/mouse002_task1_annotator1', 'task1/train/mouse003_task1_annotator1']
Vocabulary:  {'attack': 0, 'investigation': 1, 'mount': 2, 'other': 3}
Number of train Sequences:  70
Number of test Sequences:  19


In [4]:
# Inspect the first training sequence as a representative example.
sequence_names = list(training_data)
sample_sequence_key = sequence_names[0]
single_sequence = training_data[sample_sequence_key]

# What the sequence is called and which entries it carries.
print("Name of our sample sequence: ", sample_sequence_key)
print("Sequence keys: ", single_sequence.keys())
print("Sequence metadata: ", single_sequence['metadata'])

# The number of annotations is reported as the number of frames; the
# keypoints array is reported by its shape.
print(f"Number of Frames in Sequence \"{sample_sequence_key}\": ", len(single_sequence['annotations']))
print(f"Keypoints data shape of Sequence \"{sample_sequence_key}\": ", single_sequence['keypoints'].shape)

Name of our sample sequence:  task1/train/mouse001_task1_annotator1
Sequence keys:  dict_keys(['keypoints', 'scores', 'annotations', 'metadata'])
Sequence metadata:  {'annotator-id': 0, 'vocab': {'attack': 0, 'investigation': 1, 'mount': 2, 'other': 3}}
Number of Frames in Sequence "task1/train/mouse001_task1_annotator1":  21364
Keypoints data shape of Sequence "task1/train/mouse001_task1_annotator1":  (21364, 2, 2, 7)


In [5]:
# simplify data in a dataframe: build one flat record (dict) per video frame

sequence_names = list(training_data.keys())

data = []

#columns
mice = ['m1', 'm2']
coordinates = ['x', 'y']
bodyparts = ['nose', 'left_ear', 'right_ear', 'neck', 'left_hip', 'right_hip', 'tail_base']

# One column per mouse + coordinate + bodypart, e.g. 'm1_x_nose'. The names are
# built once (instead of once per value) in the order in which a flattened
# frame lists its values: mouse first, then coordinate, then bodypart.
keypoint_columns = [
    mouse + '_' + coordinate + '_' + bodypart
    for mouse in mice
    for coordinate in coordinates
    for bodypart in bodyparts
]
expected_frame_shape = (len(mice), len(coordinates), len(bodyparts))

print('We have ', len(sequence_names), ' sequences')
for sequence in sequence_names:

    keypoints = np.asarray(training_data[sequence]['keypoints'])
    annotations = training_data[sequence]['annotations']

    # Fail loudly if a sequence does not have the (mouse, coordinate, bodypart)
    # layout that the column names above assume; otherwise values would end up
    # under the wrong column names or be dropped silently.
    if keypoints.size and keypoints.shape[1:] != expected_frame_shape:
        raise ValueError(
            f"Sequence {sequence!r} has keypoints of shape {keypoints.shape}, "
            f"expected (n_frames, {', '.join(map(str, expected_frame_shape))})"
        )

    for f, frame in enumerate(keypoints):

        # Named `frame_id` rather than `id` so that the builtin id() is not
        # overwritten in the notebook namespace for all later cells.
        # The id is the sequence name directly followed by the frame number.
        frame_id = sequence + str(f)
        tabdata = {'sequence': sequence, 'frame': f, 'id': frame_id, 'label': annotations[f]}

        # ravel() lists the frame's values in row-major order, which is the
        # same mouse -> coordinate -> bodypart order as keypoint_columns.
        tabdata.update(zip(keypoint_columns, frame.ravel()))

        data.append(tabdata)

print('We have ', len(data), ' frames in total in the dataset')

We have  70  sequences
We have  507738  frames in total in the dataset


In [6]:
# Turn the list of per-frame records into a single table (one row per frame,
# one column per record key) and preview its first five rows.
df = pd.DataFrame(data=data)
df.head(5)

Unnamed: 0,sequence,frame,id,label,m1_x_nose,m1_x_left_ear,m1_x_right_ear,m1_x_neck,m1_x_left_hip,m1_x_right_hip,...,m2_x_left_hip,m2_x_right_hip,m2_x_tail_base,m2_y_nose,m2_y_left_ear,m2_y_right_ear,m2_y_neck,m2_y_left_hip,m2_y_right_hip,m2_y_tail_base
0,task1/train/mouse001_task1_annotator1,0,task1/train/mouse001_task1_annotator10,3,831.659204,805.659204,775.659204,780.659204,711.659204,711.659204,...,796.915924,840.915924,766.915924,253.216902,195.216902,193.216902,179.216902,152.216902,102.216902,97.216902
1,task1/train/mouse001_task1_annotator1,1,task1/train/mouse001_task1_annotator11,1,833.050439,809.050439,778.050439,783.050439,723.050439,717.050439,...,799.907019,846.907019,766.907019,259.539977,204.539977,201.539977,188.539977,153.539977,105.539977,98.539977
2,task1/train/mouse001_task1_annotator1,2,task1/train/mouse001_task1_annotator12,1,838.718976,816.718976,776.718976,787.718976,730.718976,713.718976,...,800.195703,860.195703,777.195703,256.902935,208.902935,205.902935,193.902935,150.902935,112.902935,99.902935
3,task1/train/mouse001_task1_annotator1,3,task1/train/mouse001_task1_annotator13,1,826.757507,815.757507,774.757507,785.757507,743.757507,711.757507,...,794.788861,856.788861,786.788861,263.420539,206.420539,206.420539,193.420539,147.420539,113.420539,97.420539
4,task1/train/mouse001_task1_annotator1,4,task1/train/mouse001_task1_annotator14,1,822.045709,812.045709,768.045709,779.045709,749.045709,709.045709,...,789.578644,862.578644,793.578644,263.366469,202.366469,201.366469,190.366469,143.366469,120.366469,95.366469


In [7]:
# Number of frames per behaviour label. The counts differ strongly between
# the labels, i.e. this is an imbalanced classification problem.
df.groupby('label')['label'].count()

label
0     14039
1    146615
2     28615
3    318469
Name: label, dtype: int64

In [8]:
# adding a column for binary classification for attack:
# 1 for frames labelled as attack, 0 for every other label.
# The class id is looked up in the vocabulary instead of hard-coding 0, so the
# column stays correct if the behaviour-to-id mapping ever differs.
df['attack'] = np.where(df['label'] == vocab['attack'], 1, 0)
df.groupby(['attack'])['attack'].count()

attack
0    493699
1     14039
Name: attack, dtype: int64

In [9]:
# Show only the frames flagged as attack.
df.loc[df['attack'] == 1]

Unnamed: 0,sequence,frame,id,label,m1_x_nose,m1_x_left_ear,m1_x_right_ear,m1_x_neck,m1_x_left_hip,m1_x_right_hip,...,m2_x_right_hip,m2_x_tail_base,m2_y_nose,m2_y_left_ear,m2_y_right_ear,m2_y_neck,m2_y_left_hip,m2_y_right_hip,m2_y_tail_base,attack
23470,task1/train/mouse002_task1_annotator1,2106,task1/train/mouse002_task1_annotator12106,0,200.810637,236.810637,170.810637,198.810637,227.810637,146.810637,...,295.559488,226.559488,196.530399,133.530399,159.530399,136.530399,109.530399,119.530399,96.530399,1
23471,task1/train/mouse002_task1_annotator1,2107,task1/train/mouse002_task1_annotator12107,0,217.102132,224.102132,166.102132,191.102132,227.102132,153.102132,...,268.606415,293.606415,171.211275,102.211275,136.211275,113.211275,94.211275,156.211275,117.211275,1
23472,task1/train/mouse002_task1_annotator1,2108,task1/train/mouse002_task1_annotator12108,0,188.905800,227.905800,155.905800,187.905800,230.905800,154.905800,...,267.856203,289.856203,158.902380,86.902380,127.902380,98.902380,88.902380,147.902380,119.902380,1
23473,task1/train/mouse002_task1_annotator1,2109,task1/train/mouse002_task1_annotator12109,0,215.931004,218.931004,146.931004,176.931004,231.931004,156.931004,...,261.040175,282.040175,148.514660,75.514660,119.514660,92.514660,91.514660,153.514660,120.514660,1
23474,task1/train/mouse002_task1_annotator1,2110,task1/train/mouse002_task1_annotator12110,0,220.757887,214.757887,143.757887,170.757887,220.757887,146.757887,...,265.017714,253.017714,161.940282,71.940282,122.940282,92.940282,106.940282,151.940282,171.940282,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507386,task1/train/mouse070_task1_annotator1,12185,task1/train/mouse070_task1_annotator112185,0,758.709680,709.709680,698.709680,680.709680,603.709680,590.709680,...,860.900513,820.900513,225.038497,278.038497,210.038497,249.038497,316.038497,236.038497,279.038497,1
507387,task1/train/mouse070_task1_annotator1,12186,task1/train/mouse070_task1_annotator112186,0,820.085913,777.085913,772.085913,759.085913,636.085913,635.085913,...,887.764648,855.764648,183.435973,258.435973,197.435973,236.435973,302.435973,224.435973,252.435973,1
507388,task1/train/mouse070_task1_annotator1,12187,task1/train/mouse070_task1_annotator112187,0,903.216522,835.216522,820.216522,807.216522,720.216522,712.216522,...,891.783875,858.783875,145.317623,224.317623,172.317623,213.317623,291.317623,212.317623,250.317623,1
507389,task1/train/mouse070_task1_annotator1,12188,task1/train/mouse070_task1_annotator112188,0,902.815503,848.815503,836.815503,826.815503,757.815503,766.815503,...,881.801709,870.801709,118.164476,205.164476,161.164476,196.164476,267.164476,214.164476,253.164476,1


In [10]:
# Save the table as a CSV file in the current working directory so that it
# can be reused elsewhere. The row index is written too (pandas default).
df.to_csv(path_or_buf='calms21_tab_data.csv')

## Potential enhancements
- Results from causal-inference experiments that show how behaviour relates to the distance between the mice could be included.
- Distances between the different body-part keypoints could be added as features.
- TODO: collect further ideas.