In [137]:
import os
import glob
import shutil 
import numpy as np 
import pandas as pd
import warnings
from pathlib import Path
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

## Functions to clean and organize the data
### Output: `dlc_data_dict`, a dictionary of cleaned DataFrames keyed by '<subject_id>_<session_id>'


In [138]:
def clean_data(df):
    """Flatten a raw tracking table (read with ``header=None``) into a float frame.

    Rows 1-3 of each column hold the three header levels; they are joined with
    underscores to form a single column name. Rows 0-3 are then dropped, column
    0 becomes ``frame`` and everything is converted to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose columns are labelled 0..n-1 (as produced by
        ``pd.read_csv(..., header=None)``). It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0-based index, a ``frame`` column and one column
        per ``<individual>_<bodypart>_<coord>``, where ``individual1`` is
        renamed ``testsubject`` and ``individual2`` is renamed ``intruder``.

    Raises
    ------
    ValueError
        If a data cell (row 4 onwards) cannot be converted to float.
    """
    # Build the complete rename map up front so the caller's frame is never
    # touched (the per-column in-place rename used to leak into the input).
    column_names = {0: 'frame'}
    for i in range(1, len(df.columns)):
        header_cells = df.iloc[1:4, i]
        column_names[i] = '_'.join(str(cell) for cell in header_cells)

    # rename() returns a new object, so no chained-assignment on a slice occurs.
    cleaned = (
        df.iloc[4:, :]
        .rename(columns=column_names)
        .astype(float)
        .reset_index(drop=True)
    )

    # important: For now BOTH of the stimulus animals are intruders.....
    # regex=False makes the literal replacement explicit across pandas versions.
    cleaned.columns = (
        cleaned.columns
        .str.replace('individual1', 'testsubject', regex=False)
        .str.replace('individual2', 'intruder', regex=False)
    )
    return cleaned

def organize_raw_data(list_of_csv_files, min_frames=4000):
    """Load and clean every CSV, keeping only recordings with enough frames.

    Parameters
    ----------
    list_of_csv_files : iterable of str
        Paths of the raw CSV files to read.
    min_frames : int, optional
        A recording is kept only when its cleaned table has MORE than this many
        rows. Defaults to 4000, the threshold used to filter out the subjects
        recorded at low fps.

    Returns
    -------
    dict
        Maps ``'<subject_id>_<session_id>'`` to the cleaned DataFrame. Two
        files that produce the same key overwrite each other (last one wins).
    """
    dlc_data_dict = {}
    for data in list_of_csv_files:
        # Normalise the name (spaces -> underscores, commas removed) only for
        # parsing the ids; the file is still opened under its real path.
        new_filename = data.replace(' ', '_').replace(',', '')
        # basename() honours the platform's path separator, unlike split('/').
        subject_id = os.path.basename(new_filename).split('_')[0]
        # The session id is the second-to-last underscore-separated token.
        session_id = new_filename.split('_')[-2]
        df = clean_data(pd.read_csv(data, header=None))
        # filter out the subjects recorded at low fps
        if df.shape[0] > min_frames:
            dlc_data_dict[f'{subject_id}_{session_id}'] = df
    return dlc_data_dict


In [139]:
# Folder that holds the raw tracking CSV exports.
cluster_dir = Path('/Users/carlydrzewiecki/Desktop/trainor_cluster/raw_dlc_data/')

# Every CSV directly inside that folder, as plain string paths
# (organize_raw_data does string operations on each path).
files_to_analyze = glob.glob(str(cluster_dir.joinpath('*.csv')))

# Cleaned tables keyed by '<subject_id>_<session_id>'.
dlc_data_dict = organize_raw_data(files_to_analyze)

## Functions for calculating distances
#### These give the distances between animals and each animal's position in the cage (rearing height, distance from the edges, etc.)

In [140]:
def euclidean_distance(x1, y1, x2, y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2).

    The arithmetic is element-wise, so scalars, numpy arrays and pandas Series
    are all accepted.
    """
    delta_x = x2 - x1
    delta_y = y2 - y1
    return np.sqrt(delta_x**2 + delta_y**2)

def get_distance_between_mice(df, sub1, body_part1, sub2, body_part2):
    """Per-row distance between one body part of ``sub1`` and one of ``sub2``.

    Coordinates are read from the columns ``<sub>_<body_part>_x`` and
    ``<sub>_<body_part>_y``. The result is returned as a one-column DataFrame.
    """
    first_point = f'{sub1}_{body_part1}'
    second_point = f'{sub2}_{body_part2}'
    separation = euclidean_distance(
        df[f'{first_point}_x'],
        df[f'{first_point}_y'],
        df[f'{second_point}_x'],
        df[f'{second_point}_y'],
    )
    return pd.DataFrame(separation)

def get_distance_between_edge(df, sub1, body_part1, edge, arena_width=1173):
    """Per-row distance from a body part to one edge of the arena.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``<sub1>_<body_part1>_x`` / ``_y``.
    sub1, body_part1 : str
        Subject and body-part names used to build the column names.
    edge : {'left', 'right', 'bottom'}
        ``'left'`` is the line x = 0, ``'right'`` the line x = ``arena_width``
        and ``'bottom'`` the line y = 0.
    arena_width : float, optional
        x position of the right edge in pixels. The default, 1173, is the width
        of the arena in pixels (51 cm long, each cm is 23 pixels).

    Returns
    -------
    pandas.DataFrame
        One-column frame of non-negative float distances.

    Raises
    ------
    ValueError
        If ``edge`` is not one of the supported names (previously this fell
        through and failed with an unrelated UnboundLocalError).
    """
    prefix = sub1 + '_' + body_part1
    # The distance to an axis-aligned edge is the absolute difference along one
    # axis, which is what the 2-D Euclidean formula reduces to here.
    if edge == 'left':
        series = df[prefix + '_x'].astype(float).abs()
    elif edge == 'right':
        series = (arena_width - df[prefix + '_x'].astype(float)).abs()
    elif edge == 'bottom':
        series = df[prefix + '_y'].astype(float).abs()
    else:
        raise ValueError(
            f"Unsupported edge {edge!r}; expected 'left', 'right' or 'bottom'"
        )
    return pd.DataFrame(series)


## Formulas for calculating velocities

In [141]:
def get_derivative(df, x, y):
    """Add row-to-row differences of two columns and their ratio to ``df``.

    Three columns are written into ``df`` itself (the frame is modified in
    place and the same object is returned):

    * ``dx`` - difference between consecutive rows of column ``x``
    * ``dy`` - difference between consecutive rows of column ``y``
    * ``dist_traveled_by_frame`` - ``dy / dx`` (slope = rise / run)

    The first row of each new column is NaN because it has no predecessor.
    """
    for delta_col, source_col in (('dx', x), ('dy', y)):
        df[delta_col] = df[source_col].diff()
    df['dist_traveled_by_frame'] = df['dy'].div(df['dx'])
    return df

def smooth_data(df, window):
    """Return the rolling median of ``df`` computed over ``window`` rows.

    The input is left untouched. Positions that do not yet have a full window
    of rows behind them come back as NaN.
    """
    rolling_view = df.rolling(window)
    return rolling_view.median()

def get_velocity(df, sub1, body_part1):
    """Frame-to-frame movement of one body part, plus its derivative over frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``frame`` column and the coordinate columns
        ``<sub1>_<body_part1>_x`` / ``_y``.
    sub1, body_part1 : str
        Subject and body-part names used to build the column names.

    Returns
    -------
    pandas.DataFrame
        One row per consecutive pair of input rows (``len(df) - 1`` rows) with
        the columns ``distance_traveled``, ``frame``, ``dx``, ``dy`` and
        ``dist_traveled_by_frame`` (the last three come from get_derivative).
        An input with fewer than two rows now yields an empty frame instead of
        raising.
    """
    prefix = sub1 + '_' + body_part1
    x_pos = df[prefix + '_x'].to_numpy(dtype=float)
    y_pos = df[prefix + '_y'].to_numpy(dtype=float)

    # Step length between row i and row i+1, computed for all rows at once.
    # This is positional, so it no longer depends on df having a 0..n-1 index,
    # and it replaces a Python loop doing four Series lookups per frame.
    step_lengths = np.sqrt(np.diff(x_pos)**2 + np.diff(y_pos)**2)

    dist_df = pd.DataFrame({
        'distance_traveled': step_lengths,
        # Step i is stamped with the frame value of its starting row.
        'frame': df['frame'].to_numpy()[:len(step_lengths)],
    })

    # NOTE(review): 'distance_traveled' is already a per-step displacement, so
    # differentiating it again yields the CHANGE in step length per frame, not
    # a speed. Left as-is to keep outputs unchanged - confirm this is intended.
    return get_derivative(dist_df, x='frame', y='distance_traveled')

def get_velocity_between_mice(df, sub1, body_part1, sub2, body_part2):
    """Distance between two body parts and its rate of change over frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``frame`` column and the coordinate columns of both
        body parts.
    sub1, body_part1, sub2, body_part2 : str
        Subject and body-part names identifying the two points.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance_between_subs``, ``frame``, ``dx``, ``dy`` and
        ``dist_traveled_by_frame`` (the last three come from get_derivative).
        ``dist_traveled_by_frame`` is signed: it is negative while the distance
        shrinks and positive while it grows.
    """
    # get_distance_between_mice already returns a one-column DataFrame, so no
    # extra wrapping is needed.
    dist_df = get_distance_between_mice(df, sub1, body_part1, sub2, body_part2)
    dist_df.columns = ['distance_between_subs']
    # Aligned on the index that the distance frame shares with df.
    dist_df['frame'] = df['frame']
    # Return the helper's result explicitly rather than relying on it having
    # modified dist_df in place.
    return get_derivative(dist_df, x='frame', y='distance_between_subs')

def clean_velocity(df):
    """Reduce a derivative table to its single rate column.

    * If ``distance_traveled`` is present, the helper columns are dropped and
      ``dist_traveled_by_frame`` is replaced by its absolute value.
    * Otherwise, if ``distance_between_subs`` is present, the helper columns
      are dropped and ``dist_traveled_by_frame`` is renamed to
      ``dist_between_mice_by_frame`` with its sign kept.
    * Otherwise the input is returned unchanged.

    The input frame is never modified; a new frame is returned in the first
    two cases.
    """
    helper_columns = ['frame', 'dx', 'dy']

    if 'distance_traveled' in df.columns:
        trimmed = df.drop(columns=['distance_traveled'] + helper_columns)
        trimmed['dist_traveled_by_frame'] = trimmed['dist_traveled_by_frame'].abs()
        return trimmed

    if 'distance_between_subs' in df.columns:
        trimmed = df.drop(columns=['distance_between_subs'] + helper_columns)
        # NOTE that we do not take the absolute value here
        # because we want to know if the mice are moving towards or away from each other.
        # errors='raise' keeps the KeyError for a missing rate column.
        return trimmed.rename(
            columns={'dist_traveled_by_frame': 'dist_between_mice_by_frame'},
            errors='raise',
        )

    return df

## Formulas for calculating the orientation of the test mouse to the intruder

In [142]:
def get_vector_lengths(df):
    """Build the two per-row vectors anchored at the test subject's midpoint.

    Returns a tuple ``(midpoint_to_midpoint, midpoint_to_nose)`` of arrays with
    shape ``(2, len(df))`` - row 0 holds the x components and row 1 the y
    components:

    * ``midpoint_to_midpoint`` points from the test subject's midpoint to the
      intruder's midpoint.
    * ``midpoint_to_nose`` points from the test subject's midpoint to the test
      subject's nose.
    """
    # Transposing turns the (n, 2) column selection into the (2, n) layout.
    body_centre = df[['testsubject_midpoint_x', 'testsubject_midpoint_y']].to_numpy().T
    intruder_centre = df[['intruder_midpoint_x', 'intruder_midpoint_y']].to_numpy().T
    nose_tip = df[['testsubject_nose_x', 'testsubject_nose_y']].to_numpy().T
    return intruder_centre - body_centre, nose_tip - body_centre

def get_angles(midpoint_to_midpoint, midpoint_to_nose):
    """Angle, in degrees, between two sets of 2-D vectors, column by column.

    Parameters
    ----------
    midpoint_to_midpoint, midpoint_to_nose : array-like, shape (2, n)
        Row 0 holds the x components and row 1 the y components.

    Returns
    -------
    pandas.DataFrame
        Single column ``orientation_to_intruder`` with ``n - 1`` rows holding
        angles in [0, 180]. A zero-length vector gives NaN for that row.
    """
    # NOTE(review): only the first n-1 columns are used, so the final frame
    # gets no angle. Kept for backward compatibility of the output length -
    # confirm whether dropping the last frame is intended.
    nose_vec = np.asarray(midpoint_to_nose, dtype=float)[:, :-1]
    target_vec = np.asarray(midpoint_to_midpoint, dtype=float)[:, :-1]

    # 0/0 for zero-length vectors is expected and should simply yield NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        dot_product = (nose_vec * target_vec).sum(axis=0)
        norms = np.linalg.norm(nose_vec, axis=0) * np.linalg.norm(target_vec, axis=0)
        cosine_angle = dot_product / norms
        # Rounding can leave the cosine a hair outside [-1, 1] for parallel or
        # anti-parallel vectors, which made arccos return NaN; clamp it first.
        cosine_angle = np.clip(cosine_angle, -1.0, 1.0)
        angles = np.degrees(np.arccos(cosine_angle))

    return pd.DataFrame({'orientation_to_intruder': angles})


### Finds the angle of orientation between the test subject and the intruder
def get_test_orientation_to_intruder(df):
    """Angle between the test subject's midpoint->nose vector and its
    midpoint->intruder-midpoint vector.

    Chains get_vector_lengths and get_angles and returns the latter's
    one-column DataFrame of angles in degrees.
    """
    to_intruder, to_nose = get_vector_lengths(df)
    return get_angles(to_intruder, to_nose)

## Formulas to extract all features

In [143]:
def extract_features(df):
    """Compute every feature for one recording.

    Returns a list of one-column DataFrames, in this order:
    ``dist_between_mice``, ``intruder_investigation``,
    ``nose_to_nose_investigation``, ``dist_from_left_wall``,
    ``test_subject_rearing``, ``intruder_rearing``, ``test_subject_velocity``,
    ``intruder_velocity``, ``velocity_between_mice`` and
    ``orientation_to_intruder``.
    """
    # Each entry pairs the output column name with a zero-argument builder, so
    # the features are still computed one after another in the listed order.
    feature_plan = [
        # between centroid distance
        ('dist_between_mice',
         lambda: get_distance_between_mice(df, 'testsubject', 'midpoint', 'intruder', 'midpoint')),
        # distance between test subject rear and intruder nose
        ('intruder_investigation',
         lambda: get_distance_between_mice(df, 'testsubject', 'tailbase', 'intruder', 'nose')),
        # distance between test subject nose and intruder nose
        ('nose_to_nose_investigation',
         lambda: get_distance_between_mice(df, 'testsubject', 'nose', 'intruder', 'nose')),
        # distance from the left edge of the arena
        ('dist_from_left_wall',
         lambda: get_distance_between_edge(df, 'testsubject', 'nose', 'left')),
        # rearing: nose distance from the bottom edge, for each animal
        ('test_subject_rearing',
         lambda: get_distance_between_edge(df, 'testsubject', 'nose', 'bottom')),
        ('intruder_rearing',
         lambda: get_distance_between_edge(df, 'intruder', 'nose', 'bottom')),
        # velocity of each animal's midpoint
        ('test_subject_velocity',
         lambda: clean_velocity(get_velocity(df, 'testsubject', 'midpoint'))),
        ('intruder_velocity',
         lambda: clean_velocity(get_velocity(df, 'intruder', 'midpoint'))),
        # distance between test subject and intruder over time
        ('velocity_between_mice',
         lambda: clean_velocity(
             get_velocity_between_mice(df, 'testsubject', 'midpoint', 'intruder', 'midpoint'))),
        # orientation of the test mouse to the intruder (measured in degrees)
        ('orientation_to_intruder',
         lambda: get_test_orientation_to_intruder(df)),
    ]

    feature_list = []
    for column_label, build_feature in feature_plan:
        feature = build_feature()
        feature.columns = [column_label]
        feature_list.append(feature)
    return feature_list

def compile_features(feature_list):
    """Join the per-feature frames side by side into one table.

    Frames are aligned on their index, so shorter features are padded with NaN
    where they have no row.
    """
    return pd.concat(feature_list, axis='columns')

In [144]:
# Extract the features of every recording and write one CSV per recording,
# named after its '<subject_id>_<session_id>' key.
for raw_data, tracking_table in dlc_data_dict.items():
    feature_list = extract_features(tracking_table)
    compiled_features = compile_features(feature_list)
    output_csv = f'/Users/carlydrzewiecki/Desktop/trainor_cluster/extracted_features_output/{raw_data}_compiled_features.csv'
    compiled_features.to_csv(output_csv)