In [1]:
# Successfully using this file requires the existence of "data/data.csv".
# This can be generated with the "load_Allen_Visual_Behavior_from_pre_processed_file.ipynb" notebook.

import pandas as pd
from helper_functions import create_neural_traces_per_label

In [2]:
# Input CSV and the columns that are handed to the pre-processing helper.
data_csv_path = "data/data.csv"
columns_of_interest = [
    "stimulus_presentations_id",
    "cell_specimen_id",
    "trace",
    "trace_timestamps",
    "image_name",
    "image_index",
    "mouse_id",
    "animal_in_image",
    "close_proximity",
]

# Project helper imported from helper_functions. Presumably it writes
# "data/neural_activity_per_trial.csv", which is read back below -- TODO confirm.
create_neural_traces_per_label(data_csv_path, columns_of_interest)

# Load the per-trial table and show its (rows, columns) shape as a sanity check.
data = pd.read_csv("data/neural_activity_per_trial.csv")
print(data.shape)

Excluded 88 because of duplicate cell_specimen_ids.
The combinations of ['stimulus_presentations_id', 'mouse_id'] is unique. To do so, 465 out of 9209 trials have been ignored.
(8744, 9)


In [4]:
# Report how many mice the dataset contains and how many trials exist per mouse.
i_mice = data["mouse_id"].unique()
print(f"Number of mice: {len(i_mice)}\n")

# One row of `data` is one trial, so the group size is the trial count.
trials_per_mouse = data.groupby("mouse_id").size()

# reset_index(name=...) already yields the columns ['mouse_id', 'n_trials'];
# the mice with the most trials are listed first.
trials_per_mouse_data = (
    trials_per_mouse
    .reset_index(name="n_trials")
    .sort_values(by="n_trials", ascending=False)
)

trials_table = trials_per_mouse_data.to_string(index=False)
print(f"Trials per mouse, ordered by trial count:\n\n{trials_table}")

Number of mice: 13

Trials per mouse, ordered by trial count:

 mouse_id  n_trials
   476970      1160
   492395      1151
   467953       830
   486737       745
   467954       710
   477202       710
   489066       679
   485688       604
   480753       505
   470784       498
   467951       448
   489056       378
   498972       326


The 13 mice have between 326 and 1160 trials each, which is enough to meaningfully use classifiers.

The next question, however, is whether all trials contain the same neurons or different ones. If the recorded neurons differ from trial to trial, the analysis will be more difficult.

In [6]:
# Print how often each neuron appears in the trials per mouse.
#
# Results kept at module level:
#   neuron_counts[mouse_id][neuron_id]      -> percentage of that mouse's trials
#                                              (rounded to 3 decimals) listing the neuron
#   always_occuring_neurons_count[mouse_id] -> number of neurons listed in every trial

neuron_counts = {}
always_occuring_neurons_count = {}
n_trials_per_mouse = {}

for mouse_id in i_mice:
    mouse_trials = data[data["mouse_id"] == mouse_id]
    n_trials = len(mouse_trials)
    n_trials_per_mouse[mouse_id] = n_trials

    # Each "cell_specimen_id" entry is a bracketed, comma-separated string of
    # neuron ids; tally one occurrence per id listed in a trial.
    occurrences = {}
    for neuron_list in mouse_trials["cell_specimen_id"]:
        for token in neuron_list.strip("[]").split(","):
            # Skip blank tokens so an empty list ("[]") does not raise
            # ValueError; int() itself tolerates surrounding whitespace.
            if not token.strip():
                continue
            neuron_id = int(token)
            occurrences[neuron_id] = occurrences.get(neuron_id, 0) + 1

    # A neuron counts as "always occurring" when it was tallied once per trial.
    always_occuring_neurons_count[mouse_id] = sum(
        1 for count in occurrences.values() if count == n_trials
    )
    neuron_counts[mouse_id] = {
        neuron_id: round(count / n_trials * 100, 3)
        for neuron_id, count in occurrences.items()
    }

# Per mouse: summary line, then the neurons ordered by descending percentage.
for mouse_id, percentages in neuron_counts.items():
    print(f"\nMouse {mouse_id} - {always_occuring_neurons_count[mouse_id]} Neurons appear in all {n_trials_per_mouse[mouse_id]} trials\n  Neurons:")
    for neuron_id, neuron_percentage in sorted(percentages.items(), key=lambda item: item[1], reverse=True):
        print(f"    {neuron_id}: {neuron_percentage} %")


Mouse 492395 - 15 Neurons appear in all 1151 trials
  Neurons:
    1086557082: 100.0 %
    1086560048: 100.0 %
    1086559782: 100.0 %
    1086558904: 100.0 %
    1086559446: 100.0 %
    1086555190: 100.0 %
    1086555432: 100.0 %
    1086567017: 100.0 %
    1086560592: 100.0 %
    1086567589: 100.0 %
    1086558341: 100.0 %
    1086556696: 100.0 %
    1086563032: 100.0 %
    1086563829: 100.0 %
    1086561466: 100.0 %
    1086561830: 76.803 %
    1086557997: 76.803 %
    1086556416: 76.803 %
    1086560926: 76.803 %
    1086562430: 76.803 %
    1086566746: 63.076 %
    1086566230: 63.076 %
    1086563384: 63.076 %
    1086565528: 63.076 %
    1086562780: 63.076 %
    1086559188: 60.122 %
    1086567897: 60.122 %
    1086556094: 60.122 %
    1086569757: 39.878 %
    1086565788: 39.878 %
    1086568632: 39.878 %
    1086573070: 39.878 %
    1086568315: 39.878 %
    1086568830: 39.878 %
    1086566623: 39.878 %
    1086565124: 39.878 %
    1086569063: 39.878 %
    1086563601: 39.878 %
 

Per mouse, between 3 and 18 Neurons appear on every trial.

Examples with over 10 Neurons will likely be more interesting than ones with e.g. 3.

Neurons that do not appear on every trial are harder to use.

Mouse 492395 looks especially promising, as it has the highest number of trials and 15 neurons recorded across all of them.

For the intended hypothesis, the next question is how many different images are contained in the trials. If there are only a few of them, overfitting must be avoided.

In [9]:
# Print, per mouse, which images occur and in what percentage of that mouse's trials.
#
# image_counts[mouse_id][image_name] -> percentage of the mouse's trials
#                                       (rounded to 3 decimals) with that image name

image_counts = {}
trials_shown = {}

for mouse_id in i_mice:
    image_column = data.loc[data["mouse_id"] == mouse_id, "image_name"]
    total = len(image_column)
    trials_shown[mouse_id] = total

    # Tally the raw occurrences first, then convert them to percentages.
    tally = {}
    for image_name in image_column:
        tally[image_name] = tally.get(image_name, 0) + 1

    image_counts[mouse_id] = {
        name: round(hits / total * 100, 3) for name, hits in tally.items()
    }

# Per mouse: summary line, then the images ordered by descending percentage.
for mouse_id, shares in image_counts.items():
    print(f"\nMouse {mouse_id} - {len(shares)} different Images appear accross the {trials_shown[mouse_id]} trials\n  Images:")
    for image_name, image_percentage in sorted(shares.items(), key=lambda item: item[1], reverse=True):
        print(f"    {image_name}: {image_percentage} %")


Mouse 492395 - 17 different Images appear accross the 1151 trials
  Images:
    omitted: 37.619 %
    im054: 4.865 %
    im045: 4.778 %
    im106: 4.692 %
    im000: 4.518 %
    im031: 4.431 %
    im073: 4.344 %
    im035: 4.257 %
    im075: 4.083 %
    im062: 3.649 %
    im085: 3.562 %
    im063: 3.388 %
    im069: 3.301 %
    im065: 3.215 %
    im077: 3.128 %
    im061: 3.128 %
    im066: 3.041 %

Mouse 467951 - 9 different Images appear accross the 448 trials
  Images:
    omitted: 44.643 %
    im073: 7.143 %
    im031: 7.143 %
    im106: 7.143 %
    im035: 6.92 %
    im054: 6.92 %
    im075: 6.92 %
    im000: 6.696 %
    im045: 6.473 %

Mouse 485688 - 17 different Images appear accross the 604 trials
  Images:
    omitted: 50.662 %
    im077: 3.477 %
    im063: 3.477 %
    im061: 3.477 %
    im031: 3.311 %
    im062: 3.311 %
    im073: 3.146 %
    im066: 3.146 %
    im065: 3.146 %
    im000: 2.98 %
    im069: 2.98 %
    im085: 2.98 %
    im054: 2.815 %
    im045: 2.815 %
    im035

Between 8 and 16 different images appear per mouse (not counting "omitted"), usually with very similar frequencies.

This is not ideal, but it might suffice. When training a classifier, the risk of overfitting must be kept in mind, and splitting the training set is not trivial.

I do not know what "omitted" means.