In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, pearsonr

# Goal: load, clean, and prepare both datasets for analysis.

# Read the two raw CSV files; dataset1 is bound to bat_df and dataset2 to rat_df.
# Paths are relative to the notebook's working directory.
bat_df, rat_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/dataset1.csv", "../data/dataset2.csv")
)


def _show_preview(heading, frame):
    """Print `heading`, then render the first five rows of `frame`.

    A helper is used rather than a loop so that no loop variable is left
    holding a reference to a raw frame after the previews are shown.
    """
    print(heading)
    display(frame.head())


# Look at the first few rows of each dataset
_show_preview("Bat dataset:", bat_df)

_show_preview("Rat dataset:", rat_df)

# Data cleaning and processing


def _parse_dayfirst(raw_values, label):
    """Parse a Series of day-first timestamp text into datetimes.

    Parameters
    ----------
    raw_values : pd.Series
        Column of timestamp text to parse.
    label : str
        Column name used in the diagnostic message.

    Returns
    -------
    pd.Series
        The parsed values. Entries that cannot be parsed become NaT
        (``errors='coerce'``), exactly as before; the only addition is that
        the number of values newly lost this way is printed rather than
        disappearing silently.
    """
    parsed = pd.to_datetime(raw_values, errors='coerce', dayfirst=True)
    # Values that were already missing in the raw column are not parse failures.
    n_coerced = int(parsed.isna().sum() - raw_values.isna().sum())
    if n_coerced > 0:
        print(f"Warning: {n_coerced} unparseable value(s) in '{label}' were set to NaT")
    return parsed


# Convert date/time columns to datetime
bat_df['start_time'] = _parse_dayfirst(bat_df['start_time'], 'start_time')
rat_df['time'] = _parse_dayfirst(rat_df['time'], 'time')

# Handle missing values: drop rows lacking the columns named in `subset`
bat_df = bat_df.dropna(subset=['risk', 'reward'])
rat_df = rat_df.dropna(subset=['rat_arrival_number'])

# Convert categorical text (season) to title case (robust to non-strings)
# NOTE(review): the preview printed by this cell shows `season` holding numeric
# codes (e.g. 0); the cast below turns those into the text "0" — confirm that
# later cells expect strings rather than integers.
bat_df['season'] = bat_df['season'].astype('string').str.strip().str.title()

# Create a derived feature: how long rats stayed, in seconds.
# Each period column is parsed once, then subtracted (end - start).
period_end = _parse_dayfirst(bat_df['rat_period_end'], 'rat_period_end')
period_start = _parse_dayfirst(bat_df['rat_period_start'], 'rat_period_start')
duration_seconds = (period_end - period_start).dt.total_seconds()

# Fill missing rat_duration with 0 (kept for backward compatibility), but report
# how many rows were imputed: after filling, an imputed 0 cannot be told apart
# from a real zero-length period.
n_imputed = int(duration_seconds.isna().sum())
if n_imputed > 0:
    print(f"Warning: {n_imputed} row(s) had no computable rat_duration and were filled with 0")
bat_df['rat_duration'] = duration_seconds.fillna(0)

# Save cleaned versions (index=False keeps the row index out of the CSV files)
bat_df.to_csv("../data/bat_cleaned.csv", index=False)
rat_df.to_csv("../data/rat_cleaned.csv", index=False)

# Quick summary of the cleaned bat data.
# display() is used instead of print(): print() wraps this wide table into
# several text chunks, whereas display() renders it as a single rich table,
# matching how the head() previews in this cell are shown.
display(bat_df.describe())



Bat dataset:


Unnamed: 0,start_time,bat_landing_to_food,habit,rat_period_start,rat_period_end,seconds_after_rat_arrival,risk,reward,month,sunset_time,hours_after_sunset,season
0,30/12/2017 18:37,16.0,rat,30/12/2017 18:35,30/12/2017 18:38,108,1,0,0,30/12/2017 16:45,1.870833,0
1,30/12/2017 19:51,0.074016,fast,30/12/2017 19:50,30/12/2017 19:55,17,0,1,0,30/12/2017 16:45,3.100833,0
2,30/12/2017 19:51,4.0,fast,30/12/2017 19:50,30/12/2017 19:55,41,0,1,0,30/12/2017 16:45,3.1075,0
3,30/12/2017 19:52,10.0,rat,30/12/2017 19:50,30/12/2017 19:55,111,1,0,0,30/12/2017 16:45,3.126944,0
4,30/12/2017 19:54,15.0,rat,30/12/2017 19:50,30/12/2017 19:55,194,1,0,0,30/12/2017 16:45,3.15,0


Rat dataset:


Unnamed: 0,time,month,hours_after_sunset,bat_landing_number,food_availability,rat_minutes,rat_arrival_number
0,26/12/2017 16:13,0,-0.5,20,4.0,0.0,0
1,26/12/2017 16:43,0,0.0,28,4.0,0.0,0
2,26/12/2017 17:13,0,0.5,25,4.0,0.0,0
3,26/12/2017 17:43,0,1.0,71,4.0,0.0,0
4,26/12/2017 18:13,0,1.5,44,3.753857,0.0,0


                          start_time  bat_landing_to_food  \
count                            907           907.000000   
mean   2018-04-11 20:41:27.056229120            11.713134   
min              2017-12-26 20:57:00             0.010238   
25%              2018-04-03 23:29:00             1.000000   
50%              2018-04-27 00:28:00             4.000000   
75%              2018-05-01 00:07:30            11.500000   
max              2018-05-31 23:34:00           443.000000   
std                              NaN            27.644410   

       seconds_after_rat_arrival        risk      reward       month  \
count                 907.000000  907.000000  907.000000  907.000000   
mean                  282.877619    0.495039    0.533627    3.800441   
min                     0.000000    0.000000    0.000000    0.000000   
25%                    89.500000    0.000000    0.000000    4.000000   
50%                   206.000000    0.000000    1.000000    4.000000   
75%               