# Preprocessing of the data

## Event Dictionary

* Response RI = 24
* Response RR = 44
* Reward RI = 29
* Reward RR = 42
* Start session with RI (p=0.5) = 12
* Start session with RR (p=0.5) = 13
* Switch from RR to RI = 37
* Switch from RI to RR = 36
* Show pairs = 11
* Response on TecCen to start session = 11

### Importing libraries

In [22]:
import pandas as pd
import numpy as np

### Importing data, creating dataframe, extracting subjects, sessions, and pairs.

In [23]:
df = pd.read_csv("full_data_2022.csv")

In [24]:
df.head()

Unnamed: 0,box,bird,session,pair,time,event,archive_name
0,2.0,P007,S1,1,0.02,11,P007_Dinamica_S1.xls
1,2.0,P007,S1,1,0.02,12,P007_Dinamica_S1.xls
2,2.0,P007,S1,1,1.02,21,P007_Dinamica_S1.xls
3,2.0,P007,S1,1,1.31,10,P007_Dinamica_S1.xls
4,2.0,P007,S1,1,1.31,17,P007_Dinamica_S1.xls


In [25]:
df["session"].unique()  

array(['S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11',
       'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20',
       'S21', 'S22', 'S23', 'S24', 'S25', 'S26', 'S27', 'S28', 'S29',
       'S30', 'S31', 'S32', 'S33', 'S34', 'S35', 'S36', 'S37', 'S38',
       'S39', 'S40', 'S41', 'S42', 'S43', 'S44', 'S45', 'S46', 'S47',
       'S48', 'S49', 'S50', 'S51', 'S52', 'S53', 'S54', 'S55', 'S56',
       'S57', 'S58', 'S59', 'S60', 'S61', 'S62', 'S63', 'S64', 'S65',
       'S66', 'S67', 'S68', 'S69', 'S70', 'S71', 'S72', 'S73', 'S74',
       'S75', 'S76', 'S77', 'S78', 'S79', 'S80', 'S81', 'S82', 'S83',
       'S84', 'S85', 'S86', 'S87', 'S88', 'S89', 'S90', 'S91', 'S92',
       'S93', 'S94', 'S95', 'S96', 'S97', 'S98', 'S99', 'S100', 'S101',
       'S102', 'S103', 'S104', 'S105', 'S106', 'S107', 'S108', 'S109',
       'S110', 'S111', 'S112', 'S113', 'S114', 'S115', 'S116', 'S117',
       'S118', 'S119', 'S120'], dtype=object)

In [27]:
def get_data_info(df):
     """
     Returns the number of sessions, birds, pairs and events in the dataset.
     """

     sessions = df.session.unique()
     birds = df.bird.unique()
     pairs = np.sort(df.pair.unique())
     events = np.sort(df.event.unique())

     print(f"Sessions: {sessions}","\n", "*"*60, "\n", 
          f"Birds:{birds}", "\n", "*"*60, "\n", 
          f"Pairs: {pairs}", "\n", "*"*60, "\n",
          f"Events: {events}")
     
     return sessions, birds, pairs, events

In [28]:
sessions, birds, pairs, events = get_data_info(df)

Sessions: ['S1' 'S2' 'S3' 'S4' 'S5' 'S6' 'S7' 'S8' 'S9' 'S10' 'S11' 'S12' 'S13'
 'S14' 'S15' 'S16' 'S17' 'S18' 'S19' 'S20' 'S21' 'S22' 'S23' 'S24' 'S25'
 'S26' 'S27' 'S28' 'S29' 'S30' 'S31' 'S32' 'S33' 'S34' 'S35' 'S36' 'S37'
 'S38' 'S39' 'S40' 'S41' 'S42' 'S43' 'S44' 'S45' 'S46' 'S47' 'S48' 'S49'
 'S50' 'S51' 'S52' 'S53' 'S54' 'S55' 'S56' 'S57' 'S58' 'S59' 'S60' 'S61'
 'S62' 'S63' 'S64' 'S65' 'S66' 'S67' 'S68' 'S69' 'S70' 'S71' 'S72' 'S73'
 'S74' 'S75' 'S76' 'S77' 'S78' 'S79' 'S80' 'S81' 'S82' 'S83' 'S84' 'S85'
 'S86' 'S87' 'S88' 'S89' 'S90' 'S91' 'S92' 'S93' 'S94' 'S95' 'S96' 'S97'
 'S98' 'S99' 'S100' 'S101' 'S102' 'S103' 'S104' 'S105' 'S106' 'S107'
 'S108' 'S109' 'S110' 'S111' 'S112' 'S113' 'S114' 'S115' 'S116' 'S117'
 'S118' 'S119' 'S120'] 
 ************************************************************ 
 Birds:['P007' 'P183' 'P450' 'P502' 'P507' 'P696' 'P767' 'P892'] 
 ************************************************************ 
 Pairs: [0 1 2 3 4 5] 
 *****************************

In [29]:
df_last_sessions = df[df["session"].isin(sessions[59:])]

In [30]:
df_last_sessions.head(50)

Unnamed: 0,box,bird,session,pair,time,event,archive_name
268200,2.0,P007,S60,3,0.02,11,P007_Dinamica_S60.xls
268201,2.0,P007,S60,3,0.02,12,P007_Dinamica_S60.xls
268202,2.0,P007,S60,3,1.02,21,P007_Dinamica_S60.xls
268203,2.0,P007,S60,3,2.02,21,P007_Dinamica_S60.xls
268204,2.0,P007,S60,3,3.02,21,P007_Dinamica_S60.xls
268205,2.0,P007,S60,3,4.02,21,P007_Dinamica_S60.xls
268206,2.0,P007,S60,3,5.02,21,P007_Dinamica_S60.xls
268207,2.0,P007,S60,3,6.02,20,P007_Dinamica_S60.xls
268208,2.0,P007,S60,3,13.51,22,P007_Dinamica_S60.xls
268209,2.0,P007,S60,3,13.51,24,P007_Dinamica_S60.xls


In [31]:
df_last_sessions.drop(['box', 'archive_name'], axis=1, inplace=True)
df_last_sessions.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_last_sessions.drop(['box', 'archive_name'], axis=1, inplace=True)


Unnamed: 0,bird,session,pair,time,event
268200,P007,S60,3,0.02,11
268201,P007,S60,3,0.02,12
268202,P007,S60,3,1.02,21
268203,P007,S60,3,2.02,21
268204,P007,S60,3,3.02,21


In [32]:
df_last_sessions.reset_index(inplace=True, drop=True)
df_last_sessions.head()

Unnamed: 0,bird,session,pair,time,event
0,P007,S60,3,0.02,11
1,P007,S60,3,0.02,12
2,P007,S60,3,1.02,21
3,P007,S60,3,2.02,21
4,P007,S60,3,3.02,21


In [33]:
df_last_sessions.tail(10)

Unnamed: 0,bird,session,pair,time,event
2567524,P892,S120,1,1233.52,21
2567525,P892,S120,1,1234.52,21
2567526,P892,S120,1,1235.52,21
2567527,P892,S120,1,1236.52,20
2567528,P892,S120,1,1236.58,22
2567529,P892,S120,1,1236.58,24
2567530,P892,S120,1,1236.58,29
2567531,P892,S120,1,1236.58,33
2567532,P892,S120,1,1239.08,34
2567533,P892,S120,1,1239.08,70


In [34]:
sessions, birds, pairs, events = get_data_info(df_last_sessions)

Sessions: ['S60' 'S61' 'S62' 'S63' 'S64' 'S65' 'S66' 'S67' 'S68' 'S69' 'S70' 'S71'
 'S72' 'S73' 'S74' 'S75' 'S76' 'S77' 'S78' 'S79' 'S80' 'S81' 'S82' 'S83'
 'S84' 'S85' 'S86' 'S87' 'S88' 'S89' 'S90' 'S91' 'S92' 'S93' 'S94' 'S95'
 'S96' 'S97' 'S98' 'S99' 'S100' 'S101' 'S102' 'S103' 'S104' 'S105' 'S106'
 'S107' 'S108' 'S109' 'S110' 'S111' 'S112' 'S113' 'S114' 'S115' 'S116'
 'S117' 'S118' 'S119' 'S120'] 
 ************************************************************ 
 Birds:['P007' 'P183' 'P450' 'P502' 'P507' 'P696' 'P767' 'P892'] 
 ************************************************************ 
 Pairs: [0 1 2 3 4 5] 
 ************************************************************ 
 Events: [10 11 12 13 14 15 16 17 18 19 20 21 22 24 29 32 33 34 36 40 42 43 44 46
 47 60 70 75]


## Functions to preprocess the data

In [36]:
def transform_data(data: pd.DataFrame()):
    # Description of the function
    """
    The function  `transform_data(data, birds, sessions, pairs)`, transform the data to be analyzed in the following way:
    1. Extract the following variables:
         Responses
         Responses before reward
         Visit lenght responses
         Time
         Time difference
         Visit lenght time
         Reward
         Time since last reward
    2. Create a new dataframe with the variables extracted.
    3. Return the new dataframe.

    Input: data   (pandas dataframe)  - Data to be transformed
    Output: data  (pandas dataframe) - Transformed data
    """
    # Number of observations
    len_data = data.shape[0]

    # Bird
    bird = np.array([data.bird.unique()[0]])
    bird = np.repeat(bird, len_data)

    # Session
    session = np.array([data.session.unique()[0]])
    session = np.repeat(session, len_data)

    # Pair
    pair = np.array([data.pair.unique()])
    pair = np.repeat(pair, len_data)

    # Responses
    resp_RI = [0]
    resp_RR = [0]

    # Responses before reward
    responses_before_reward_RI = [0]
    responses_before_reward_RR = [0]

    # Visit lenght responses
    visit_lenght_response_RI = [0]
    visit_lenght_response_RR = [0]

    # Time
    time = [0]
    # Time difference
    time_diff = [0]

    # Visit lenght time
    visit_lenght_time_RI = [0]
    visit_lenght_time_RR = [0]

    # Reward
    reward_RI = [0]
    reward_RR = [0]

    # Time since last reward
    time_since_last_reward_RI = [0]
    time_since_last_reward_RR = [0]

    # Counter for the number of responses before reward
    counter_ri = 0
    counter_rr = 0

    # Counter for the time since last reward
    time_counter_rr = 0
    time_counter_ri = 0

    # Counter for the visit length
    counter_visit_length_ri = 0
    counter_visit_length_rr = 0

    # Counter for the visit time
    counter_visit_time_ri = 0
    counter_visit_time_rr = 0

#################################################################################
    # Loop over the data to extract the variables for the analysis.
    for ii in range(1, len_data):
#################################################################################
        if data.iloc[ii]["event"] == 29:
            # time
            time.append(data.iloc[ii]["time"])
            # time difference
            time_diff.append(time[ii] - time[ii-2])

            # Responses before reward
            counter_ri = 0
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)

            # Time since last reward
            time_counter_ri = 0
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)

            # Responses
            resp_RI.append(1)
            resp_RR.append(0)

            # Reward
            reward_RI.append(1)
            reward_RR.append(0)

            # Lenght visit response
            counter_visit_length_rr = 0
            counter_visit_length_ri += 1
            visit_lenght_response_RI.append(counter_visit_length_ri)
            visit_lenght_response_RR.append(counter_visit_length_rr)

            # Lenght visit time
            counter_visit_time_ri = time_diff[ii]
            counter_visit_time_rr = 0
            visit_lenght_time_RI.append(counter_visit_time_ri)
            visit_lenght_time_RR.append(counter_visit_time_rr)

    #################################################################################
        if data.iloc[ii]["event"] == 42:
            # time
            time.append(data.iloc[ii][ "time"])
            # time difference
            time_diff.append(time[ii] - time[ii-2])

            # Responses before reward
            counter_rr = 0
            responses_before_reward_RR.append(counter_rr)
            responses_before_reward_RI.append(counter_ri)

            # Time since last reward
            time_counter_ri += time_diff[ii]
            time_counter_rr = 0
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)

            # Responses
            resp_RI.append(0)
            resp_RR.append(1)

            # Reward
            reward_RI.append(0)
            reward_RR.append(1)

            # Lenght visit response
            counter_visit_length_rr += 1
            counter_visit_length_ri = 0
            visit_lenght_response_RI.append(counter_visit_length_ri)
            visit_lenght_response_RR.append(counter_visit_length_rr)

            # Lenght visit time
            counter_visit_time_rr = time_diff[ii]
            counter_visit_time_ri = 0
            visit_lenght_time_RI.append(counter_visit_time_ri)
            visit_lenght_time_RR.append(counter_visit_time_rr)


    #################################################################################
        if data.iloc[ii]["event"] == 24:
            # time
            time.append(data.iloc[ii][ "time"])
            # time difference
            time_diff.append(time[ii] - time[ii-1])

            # Responses before reward
            counter_ri += 1
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)

            # Time since last reward
            time_counter_ri += time_diff[ii]
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)

            # Responses
            resp_RI.append(1)
            resp_RR.append(0)

            # Reward
            reward_RI.append(0)
            reward_RR.append(0)


            if data.iloc[ii-1]["event"] == 24:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri += 1
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                # Lenght visit time
                counter_visit_time_ri += time_diff[ii]
                counter_visit_time_rr = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 42 or data.iloc[ii-1]["event"] == 29 or data.iloc[ii-1]["event"] == 12 or data.iloc[ii-1]["event"] == 13:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                # Lenght visit time
                counter_visit_time_rr = 0
                counter_visit_time_ri = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 44:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                # Lenght visit time
                counter_visit_time_rr = 0
                counter_visit_time_ri = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

    #################################################################################
        if data.iloc[ii]["event"] == 44:
            time.append(data.iloc[ii]["time"])
            time_diff.append(time[ii] - time[ii-1])

            counter_rr +=1
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)

            time_counter_ri += time_diff[ii]
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)

            resp_RI.append(0)
            resp_RR.append(1)

            reward_RI.append(0)
            reward_RR.append(0)

            if data.iloc[ii-1]["event"] == 44:
                counter_visit_length_rr += 1
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_time_rr += time_diff[ii]
                counter_visit_time_ri = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 42 or data.iloc[ii-1]["event"] == 29 or data.iloc[ii-1]["event"] == 12 or data.iloc[ii-1]["event"] == 13:
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_time_rr = 0
                counter_visit_time_ri = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 24:
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_time_rr = 0
                counter_visit_time_ri = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

    #################################################################################
    data_filtred = pd.DataFrame({"bird": bird, 
                                 "session": session,
                                 "pair": pair, 
                                 "time": time, 
                                 "time_diff": time_diff, 
                                 "resp_RI": resp_RI, 
                                 "resp_RR": resp_RR, 
                                 "reward_RI": reward_RI, 
                                 "reward_RR": reward_RR, 
                                 "time_since_last_reward_RI": time_since_last_reward_RI, 
                                 "time_since_last_reward_RR": time_since_last_reward_RR, 
                                 "visit_lenght_response_RI": visit_lenght_response_RI, 
                                 "visit_lenght_response_RR": visit_lenght_response_RR, 
                                 "responses_before_reward_RI": responses_before_reward_RI, 
                                 "responses_before_reward_RR": responses_before_reward_RR, 
                                 "visit_lenght_time_RI": visit_lenght_time_RI, 
                                 "visit_lenght_time_RR": visit_lenght_time_RR})

    return data_filtred

In [38]:
def extract_data(df: pd.DataFrame(), dir: str, birds: list, pairs: list, sessions: list):
    """The function `extract_data` extracts data from data frame for given birds, pairs and sessions, and saves into a directory, with the format: bird_pair_session.csv. If there are some problems with saving data, the function returns a list of birds, pairs and sessions that were not saved.

    Input: data: pd.DataFrame() - data frame with data ,
            dir: str - directory to save data,
            birds: list - list of birds to extract data from,
            pairs: list - list of pairs to extract data from,
            sessions: list - list of sessions to extract data from 
    Output: not_saved: list - list of birds, pairs and sessions that were not saved.
    """

    not_saved = []
    global data_frame
    for bird in birds:
        for session in sessions:
            for pair in pairs:
                print("Saving data for bird {}, pair {}, session {}".format(bird, pair, session))
                data_frame = df.loc[((df.event == 29) | (df.event == 44) 
                           | (df.event == 24) | (df.event == 42)
                             | (df.event == 12) | (df.event == 13)) 
                             & (df.session == session) & (df.bird == bird)
                             & (df.pair == pair)]

                data_frame.reset_index(inplace=True, drop=True)

                try:
                    assert ([13, 24, 29, 44, 42] in data_frame.event.unique()) or ([12, 13, 24, 29, 44, 42] in data_frame.event.unique()) or ([12, 24, 44, 29] in data_frame.event.unique()) or ([12, 24, 44, 42] in data_frame.event.unique()) or ([13, 24, 44, 29] in data_frame.event.unique()) or ([13, 24, 44, 42] in data_frame.event.unique()), Exception("There are events missing in the data dataframe") 
                    
                    transfor_df = transform_data(data_frame)
                    transfor_df.to_csv(dir + "{}_{}_{}.csv".format(bird, pair, session), index=False)
                
                except OSError:
                    raise OSError(f"The directory '{dir}' does not exist")

                except:
                    print("There are events missing in the data dataframe, the data for bird {}, pair {}, session {} was not saved".format(bird, pair, session))
                    not_saved.append([bird, pair, session])
                    continue
    return not_saved

In [39]:
!mkdir Data2022
dir = "Data2022/"

not_saved = extract_data(df = df_last_sessions, dir=dir, birds = birds, pairs= pairs, sessions = sessions)

mkdir: Data2022: File exists
Saving data for bird P007, pair 0, session S60
There are events missing in the data dataframe, the data for bird P007, pair 0, session S60 was not saved
Saving data for bird P007, pair 1, session S60
Saving data for bird P007, pair 2, session S60
There are events missing in the data dataframe, the data for bird P007, pair 2, session S60 was not saved
Saving data for bird P007, pair 3, session S60
There are events missing in the data dataframe, the data for bird P007, pair 3, session S60 was not saved
Saving data for bird P007, pair 4, session S60
There are events missing in the data dataframe, the data for bird P007, pair 4, session S60 was not saved
Saving data for bird P007, pair 5, session S60
There are events missing in the data dataframe, the data for bird P007, pair 5, session S60 was not saved
Saving data for bird P007, pair 0, session S61
There are events missing in the data dataframe, the data for bird P007, pair 0, session S61 was not saved
Saving

In [19]:
# Data that was not saved because there are events missing in the data dataframe
# Or because does not exist data for that combination of bird, pair and session.

print(not_saved)

[['P007', 0, 'S60'], ['P007', 2, 'S60'], ['P007', 3, 'S60'], ['P007', 4, 'S60'], ['P007', 5, 'S60'], ['P007', 0, 'S61'], ['P007', 1, 'S61'], ['P007', 3, 'S61'], ['P007', 4, 'S61'], ['P007', 5, 'S61'], ['P007', 1, 'S62'], ['P007', 2, 'S62'], ['P007', 3, 'S62'], ['P007', 4, 'S62'], ['P007', 5, 'S62'], ['P007', 0, 'S63'], ['P007', 1, 'S63'], ['P007', 2, 'S63'], ['P007', 3, 'S63'], ['P007', 4, 'S63'], ['P007', 0, 'S64'], ['P007', 1, 'S64'], ['P007', 2, 'S64'], ['P007', 3, 'S64'], ['P007', 4, 'S64'], ['P007', 0, 'S65'], ['P007', 1, 'S65'], ['P007', 2, 'S65'], ['P007', 4, 'S65'], ['P007', 5, 'S65'], ['P007', 0, 'S66'], ['P007', 1, 'S66'], ['P007', 2, 'S66'], ['P007', 3, 'S66'], ['P007', 4, 'S66'], ['P007', 5, 'S66'], ['P007', 0, 'S67'], ['P007', 1, 'S67'], ['P007', 2, 'S67'], ['P007', 3, 'S67'], ['P007', 4, 'S67'], ['P007', 0, 'S68'], ['P007', 1, 'S68'], ['P007', 2, 'S68'], ['P007', 3, 'S68'], ['P007', 4, 'S68'], ['P007', 5, 'S68'], ['P007', 0, 'S69'], ['P007', 1, 'S69'], ['P007', 3, 'S69'],