# Preprocessing of the data

## Pairs Description.

* Pairs from **0** to **4** are RR fixed ($\frac{1}{30}$) - RI random ($\frac{1}{7.5}$, $\frac{1}{15}$, $\frac{1}{30}$, $\frac{1}{60}$, $\frac{1}{120}$).

* Pairs drom **5** to **9** are RR random ($\frac{1}{15}$, $\frac{1}{30}$, $\frac{1}{45}$, $\frac{1}{60}$, $\frac{1}{120}$) - RI fixed ($\frac{1}{60}$).

## Event Dictionary

* Response RI = 24
* Response RR = 44
* Reward RI = 33
* Reward RR=46
* Start session with R1 (p=0.5) = 12
* Start session with RR (p=0.5) = 13
* Switch from RR to RI = 14
* Switch from RI to RR = 15
* Show pairs = 10
* Response on TecCen to start session = 11

### Importing libraries

In [3]:
import pandas as pd
import numpy as np

### Importing data, creating dataframe, extracting subjects, sessions, and pairs.

In [4]:
df = pd.read_csv("raw_data.csv")

In [5]:
sessions = df.session.unique()
birds = df.bird.unique()
pairs = df.pair.unique()

print(f"Sessions: {sessions}","\n", "*"*60, "\n", 
     f"Birds:{sessions}", "\n", "*"*60, "\n", 
     f"Pairs: {pairs}")

Sessions: ['S50' 'S51' 'S52' 'S53' 'S54' 'S55' 'S56' 'S57' 'S58' 'S59' 'S60' 'S61'
 'S62' 'S63' 'S64' 'S65' 'S66' 'S67' 'S68' 'S69' 'S70' 'S71' 'S72' 'S73'
 'S74' 'S75' 'S76' 'S77' 'S78' 'S79' 'S80' 'S81' 'S82' 'S83' 'S84' 'S85'
 'S86' 'S87' 'S88' 'S89' 'S90' 'S91' 'S92' 'S93' 'S94' 'S95' 'S96' 'S97'
 'S98' 'S99' 'S100' 'S101' 'S102' 'S103' 'S104' 'S105' 'S106' 'S107'
 'S108' 'S109' 'S110'] 
 ************************************************************ 
 Birds:['S50' 'S51' 'S52' 'S53' 'S54' 'S55' 'S56' 'S57' 'S58' 'S59' 'S60' 'S61'
 'S62' 'S63' 'S64' 'S65' 'S66' 'S67' 'S68' 'S69' 'S70' 'S71' 'S72' 'S73'
 'S74' 'S75' 'S76' 'S77' 'S78' 'S79' 'S80' 'S81' 'S82' 'S83' 'S84' 'S85'
 'S86' 'S87' 'S88' 'S89' 'S90' 'S91' 'S92' 'S93' 'S94' 'S95' 'S96' 'S97'
 'S98' 'S99' 'S100' 'S101' 'S102' 'S103' 'S104' 'S105' 'S106' 'S107'
 'S108' 'S109' 'S110'] 
 ************************************************************ 
 Pairs: [0 1 3 2 4 8 9 6 5 7]


## Functions to preprocess the data

In [6]:
def transform_data(data: pd.DataFrame()):
    # Description of the function
    """
    The function  `transform_data(data, birds, sessions, pairs)`, transform the data to be analyzed in the following way:
    1. Extract the following variables:
        - Responses
        - Responses before reward
        - Visit lenght responses
        - Time
        - Time difference
        - Visit lenght time
        - Reward
        - Time since last reward
    2. Create a new dataframe with the variables extracted.
    3. Return the new dataframe.

    Input: 
    - data   (pandas dataframe)  - Data to be transformed
    
    Output: data  (pandas dataframe) - Transformed data
    """
    import pandas as pd

    # Number of observations
    len_data = data.shape[0]

    # Bird
    bird = np.array([data.bird.unique()[0]])
    bird = np.repeat(bird, len_data)

    # Session
    session = np.array([data.session.unique()[0]])
    session = np.repeat(session, len_data)

    # Pair
    pair = np.array([data.pair.unique()])
    pair = np.repeat(pair, len_data)

    # Responses
    resp_RI = [0]
    resp_RR = [0]

    # Responses before reward
    responses_before_reward_RI = [0]
    responses_before_reward_RR = [0]

    # Visit lenght responses
    visit_lenght_response_RI = [0]
    visit_lenght_response_RR = [0]

    # Time
    time = [0]
    # Time difference
    time_diff = [0]

    # Visit lenght time
    visit_lenght_time_RI = [0]
    visit_lenght_time_RR = [0]

    # Reward
    reward_RI = [0]
    reward_RR = [0]

    # Time since last reward
    time_since_last_reward_RI = [0]
    time_since_last_reward_RR = [0]  

    # Counter for the number of responses before reward
    counter_ri = 0
    counter_rr = 0

    # Counter for the time since last reward
    time_counter_rr = 0
    time_counter_ri = 0

    # Counter for the visit length
    counter_visit_length_ri = 0
    counter_visit_length_rr = 0

    # Counter for the visit time
    counter_visit_time_ri = 0
    counter_visit_time_rr = 0

#################################################################################
    # Loop over the data to extract the variables for the analysis.
    for ii in range(1, len_data):
#################################################################################
        if data.iloc[ii]["event"] == 33:
            # time
            time.append(data.iloc[ii]["time"])   
            # time difference     
            time_diff.append(time[ii] - time[ii-2])
        
            # Responses before reward
            counter_ri = 0
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)
        
            # Time since last reward
            time_counter_ri = 0
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)
        
            # Responses
            resp_RI.append(1)
            resp_RR.append(0)

            # Reward
            reward_RI.append(1)
            reward_RR.append(0)

            # Lenght visit response
            counter_visit_length_rr = 0
            counter_visit_length_ri += 1
            visit_lenght_response_RI.append(counter_visit_length_ri)
            visit_lenght_response_RR.append(counter_visit_length_rr)

            # Lenght visit time
            counter_visit_time_ri = time_diff[ii]
            counter_visit_time_rr = 0
            visit_lenght_time_RI.append(counter_visit_time_ri)
            visit_lenght_time_RR.append(counter_visit_time_rr)

    #################################################################################
        if data.iloc[ii]["event"] == 46:
            # time
            time.append(data.iloc[ii][ "time"])
            # time difference
            time_diff.append(time[ii] - time[ii-2])
        
            # Responses before reward
            counter_rr = 0
            responses_before_reward_RR.append(counter_rr)
            responses_before_reward_RI.append(counter_ri)

            # Time since last reward
            time_counter_ri += time_diff[ii]
            time_counter_rr = 0
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)
        
            # Responses
            resp_RI.append(0)
            resp_RR.append(1)

            # Reward
            reward_RI.append(0)
            reward_RR.append(1)

            # Lenght visit response
            counter_visit_length_rr += 1
            counter_visit_length_ri = 0
            visit_lenght_response_RI.append(counter_visit_length_ri)
            visit_lenght_response_RR.append(counter_visit_length_rr)

            # Lenght visit time
            counter_visit_time_rr = time_diff[ii]
            counter_visit_time_ri = 0
            visit_lenght_time_RI.append(counter_visit_time_ri)
            visit_lenght_time_RR.append(counter_visit_time_rr)


    #################################################################################
        if data.iloc[ii]["event"] == 24:
            # time 
            time.append(data.iloc[ii][ "time"])
            # time difference
            time_diff.append(time[ii] - time[ii-1])

            # Responses before reward
            counter_ri += 1
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)
        
            # Time since last reward
            time_counter_ri += time_diff[ii]
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)

            # Responses
            resp_RI.append(1)
            resp_RR.append(0)

            # Reward
            reward_RI.append(0)
            reward_RR.append(0)


            if data.iloc[ii-1]["event"] == 24:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri += 1 
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                # Lenght visit time
                counter_visit_time_ri += time_diff[ii]
                counter_visit_time_rr = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 46 or data.iloc[ii-1]["event"] == 33 or data.iloc[ii-1]["event"] == 12 or data.iloc[ii-1]["event"] == 13:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                # Lenght visit time
                counter_visit_time_rr = 0
                counter_visit_time_rr = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

            elif data.iloc[ii-1]["event"] == 44:
                # Lenght visit response
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)
                
                # Lenght visit time
                counter_visit_time_rr = 0
                counter_visit_time_rr = 0
                visit_lenght_time_RI.append(counter_visit_time_ri)
                visit_lenght_time_RR.append(counter_visit_time_rr)

    #################################################################################       
        if data.iloc[ii]["event"] == 44:
            time.append(data.iloc[ii]["time"])
            time_diff.append(time[ii] - time[ii-1])

            counter_rr +=1
            responses_before_reward_RI.append(counter_ri)
            responses_before_reward_RR.append(counter_rr)

            time_counter_ri += time_diff[ii]
            time_counter_rr += time_diff[ii]
            time_since_last_reward_RI.append(time_counter_ri)
            time_since_last_reward_RR.append(time_counter_rr)
        
            resp_RI.append(0)
            resp_RR.append(1)

            reward_RI.append(0)
            reward_RR.append(0)

            if data.iloc[ii-1]["event"] == 44:
                counter_visit_length_rr += 1
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_length_rr += time_diff[ii]
                counter_visit_length_ri = 0
                visit_lenght_time_RI.append(counter_visit_length_ri)
                visit_lenght_time_RR.append(counter_visit_length_rr)

            elif data.iloc[ii-1]["event"] == 46 or data.iloc[ii-1]["event"] == 33 or data.iloc[ii-1]["event"] == 12 or data.iloc[ii-1]["event"] == 13:
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_time_RI.append(counter_visit_length_ri)
                visit_lenght_time_RR.append(counter_visit_length_rr)

            elif data.iloc[ii-1]["event"] == 24:
                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_response_RI.append(counter_visit_length_ri)
                visit_lenght_response_RR.append(counter_visit_length_rr)

                counter_visit_length_rr = 0
                counter_visit_length_ri = 0
                visit_lenght_time_RI.append(counter_visit_length_ri)
                visit_lenght_time_RR.append(counter_visit_length_rr)
            

    assert data.shape[0] == len(time), Exception("time and data have different lenghts")
    assert data.shape[0] == len(time_diff), Exception("time_diff and data have different lenghts")
    assert data.shape[0] == len(resp_RI), Exception("resp_RI and data have different lenghts")
    assert data.shape[0] == len(resp_RR), Exception("resp_RR and data have different lenghts")
    assert data.shape[0] == len(reward_RI), Exception("reward_RI and data have different lenghts")
    assert data.shape[0] == len(reward_RR), Exception("reward_RR and data have different lenghts")
    assert data.shape[0] == len(time_since_last_reward_RI), Exception("time_since_last_reward_RI and data have different lenghts")
    assert data.shape[0] == len(time_since_last_reward_RR), Exception("time_since_last_reward_RR and data have different lenghts")
    assert data.shape[0] == len(visit_lenght_response_RI), Exception("visit_lenght_response_RI and data have different lenghts")
    assert data.shape[0] == len(visit_lenght_response_RR), Exception("visit_lenght_response_RR and data have different lenghts")
    assert data.shape[0] == len(responses_before_reward_RI), Exception("responses_before_reward_RI and data have different lenghts")
    assert data.shape[0] == len(responses_before_reward_RR), Exception("responses_before_reward_RR and data have different lenghts")
    assert data.shape[0] == len(visit_lenght_time_RI), Exception("visit_lenght_time_RI and data have different lenghts")
    assert data.shape[0] == len(visit_lenght_time_RR), Exception("visit_lenght_time_RR and data have different lenghts")
    assert data.shape[0] == len(bird), Exception("bird and data have different lenghts")
    assert data.shape[0] == len(session), Exception("session and data have different lenghts")
    assert data.shape[0] == len(pair), Exception("pair and data have different lenghts")


    #################################################################################
    data_filtred = pd.DataFrame({"bird": bird, "session": session,"pair": pair, "time": time, "time_diff": time_diff, "resp_RI": resp_RI, "resp_RR": resp_RR, "reward_RI": reward_RI, "reward_RR": reward_RR, "time_since_last_reward_RI": time_since_last_reward_RI, "time_since_last_reward_RR": time_since_last_reward_RR, "visit_lenght_response_RI": visit_lenght_response_RI, "visit_lenght_response_RR": visit_lenght_response_RR, "responses_before_reward_RI": responses_before_reward_RI, "responses_before_reward_RR": responses_before_reward_RR, "visit_lenght_time_RI": visit_lenght_time_RI, "visit_lenght_time_RR": visit_lenght_time_RR})

    return data_filtred

In [25]:
def extract_data(data: pd.DataFrame(), dir: str, birds: list, pairs: list, sessions: list):
    """The function `extract_data` extracts data from data frame for given birds, pairs and sessions, and saves into a directory, with the format: bird_pair_session.csv. If there are some problems with saving data, the function returns a list of birds, pairs and sessions that were not saved.

    Input: data: pd.DataFrame() - data frame with data ,
            dir: str - directory to save data,
            birds: list - list of birds to extract data from,
            pairs: list - list of pairs to extract data from,
            sessions: list - list of sessions to extract data from 
    Output: not_saved: list - list of birds, pairs and sessions that were not saved.
    """

    not_saved = []
    global data_frame
    for bird in birds:
        for pair in pairs:
            for session in sessions:
                print("Saving data for bird {}, pair {}, session {}".format(bird, pair, session))
                data_frame = data[((df.event == 33) | (df.event == 44) 
                           | (df.event == 24) | (df.event == 46)
                             | (df.event == 12) | (df.event == 13)) 
                             & (df.session == session) & (df.bird == bird)
                             & (df.pair == pair)]

                data_frame.reset_index(inplace=True, drop=True)

                try:
                    assert ([13, 24, 33, 44, 46] in data_frame.event.unique()) or ([12, 13, 24, 33, 44, 46] in data_frame.event.unique()) or ([12, 24, 44, 33] in data_frame.event.unique()) or ([12, 24, 44, 46] in data_frame.event.unique()) or ([13, 24, 44, 33] in data_frame.event.unique()) or ([13, 24, 44, 46] in data_frame.event.unique()), sException("There are events missing in the data dataframe") 
                    
                    transfor_df = transform_data(data_frame)
                    transfor_df.to_csv(dir + "{}_{}_{}.csv".format(bird, pair, session), index=False)
                
                except OSError:
                    raise OSError(f"The directory '{dir}' does not exist")

                except:
                    print("There are events missing in the data dataframe, the data for bird {}, pair {}, session {} was not saved".format(bird, pair, session))
                    not_saved.append([bird, pair, session])
                    continue
    return not_saved

In [None]:
not_saved = extract_data(data = df, dir="data/", birds = birds, pairs= pairs, sessions = sessions)

In [24]:
# Data that was not saved
not_saved_data =   [['P168', 1, 'S81'], ['P168', 5, 'S87'], ['P168', 5, 'S95'], ['P423', 2, 'S106'],
                    ['P423', 6, 'S71'], ['P423', 6, 'S109'], ['P423', 5, 'S66'], ['P423', 5, 'S70'],
                    ['P423', 5, 'S71'], ['P423', 5, 'S86'], ['P423', 5, 'S93'], ['P423', 5, 'S94'],
                    ['P423', 5, 'S97'], ['P423', 5, 'S102'], ['P423', 5, 'S104'], ['P423', 5, 'S110'], 
                    ['P498', 0, 'S56'], ['P498', 0, 'S98'], ['P498', 0, 'S103'], ['P787', 3, 'S51'],
                    ['P787', 4, 'S57'], ['P787', 5, 'S53'], ['P787', 7, 'S97'], ['P796', 0, 'S61'],
                    ['P796', 0, 'S95'], ['P796', 1, 'S61'], ['P796', 1, 'S95'], ['P796', 3, 'S61'],
                    ['P796', 2, 'S61'], ['P796', 2, 'S95'], ['P796', 4, 'S61'], ['P796', 4, 'S95'],
                    ['P796', 8, 'S61'], ['P796', 9, 'S61'], ['P796', 5, 'S87'], ['P875', 0, 'S54']]