In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import datetime
import plotly.graph_objects as go
import scipy.stats as st

# Data Cleaning for All Datasets

In [3]:
# Read in shark data
eat = pd.read_excel("SharkData.xlsx", sheet_name = 0)
targets = pd.read_excel("SharkData.xlsx", sheet_name = 1)
drops = pd.read_excel("SharkData.xlsx", sheet_name = 2)
other_factors = pd.read_excel("SharkData.xlsx", sheet_name = 3)

In [4]:
# Clean eating dataset
eat = eat.iloc[1:,:-1]
eat.rename(columns = {"Unnamed: 14":'Etc. Comments', "Pieces Eaten": "Date"}, inplace = True)

#get rid of null values at end of dataset
eat = eat.head(582)

# Change date to just date format, not datetime
eat['Date'] = pd.to_datetime(eat['Date']).dt.date


eat.tail()

Unnamed: 0,Date,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,Etc. Comments
578,2020-09-22,6.0,0.0,1.0,2.0,6.0,0.0,4.0,1.0,0.0,0.0,5.0,1.0,26.0,
579,2020-09-24,0.0,9.0,0.0,7.0,5.0,5.0,7.0,0.0,1.0,2.0,2.0,6.0,44.0,
580,2020-09-26,,,,,,,,4.0,0.0,2.0,3.0,3.0,12.0,
581,2020-09-27,5.0,6.0,3.0,6.0,4.0,6.0,3.0,,,,,,33.0,
582,2020-09-29,10.0,3.0,1.0,1.0,4.0,2.0,7.0,4.0,4.0,0.0,2.0,0.0,38.0,


In [5]:
# Clean Drops Dataset
drops = drops.iloc[1:,:-1]
drops['Drops'] = pd.to_datetime(drops['Drops']).dt.date

In [6]:
# Clean Targets Dataset
targets = targets.iloc[1:,:-1]
targets.rename(columns = {"Unnamed: 14":'Etc. Comments', "Targets": "Date"}, inplace = True)

#get rid of null values at end of dataset
targets = targets.head(582)

# Change date to just date format, not datetime
targets['Date'] = pd.to_datetime(targets['Date']).dt.date


targets.tail()

Unnamed: 0,Date,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:
578,2020-09-22,6.0,1.0,4.0,6.0,5.0,4.0,7.0,2.0,1.0,5.0,4.0,1.0,46.0
579,2020-09-24,8.0,18.0,2.0,9.0,10.0,10.0,11.0,5.0,6.0,3.0,7.0,9.0,98.0
580,2020-09-26,,,,,,,,6.0,2.0,3.0,5.0,8.0,24.0
581,2020-09-27,10.0,19.0,9.0,10.0,8.0,16.0,3.0,,,,,,75.0
582,2020-09-29,15.0,5.0,6.0,6.0,4.0,4.0,7.0,4.0,1.0,4.0,4.0,1.0,61.0


# Create Subset Groups

In [7]:
# All Sharks Subset
All_Sharks = eat[['Ross', 'Chandler','BT1','BT2','BT3','BT4','BT5','GR1','GR2','GR3','GR4', 'GR5','Total: ']]

# All SS Subset
All_SS = eat[['Ross', 'Chandler']]

# All BT Subset
All_BT = eat[['BT1', 'BT2', 'BT3', 'BT4', 'BT5']]

# All GR Subset
All_GR = eat[['GR1', 'GR2', 'GR3', 'GR4', 'GR5']]

# Male Subset
Male = eat[["BT1","BT5","GR1","Ross","Chandler"]]

# Female Subset
female = eat[["BT2","BT3","BT4","GR2","GR3","GR4","GR5"]]

# Append all subsets to original datasets
# eat dataset
eat["All_GR"] = eat['GR1'] + eat['GR2'] + eat['GR3'] + eat['GR4'] + eat['GR5']
eat["All_BT"] = eat['BT1'] + eat['BT2'] + eat['BT3'] + eat['BT4'] + eat['BT5']
eat["All_SS"] = eat['Ross'] + eat['Chandler']
eat["male"] = eat[["BT1","BT5","GR1","Ross","Chandler"]].sum(axis = 1)
eat["female"] = eat[["BT2","BT3","BT4","GR2","GR3","GR4","GR5"]].sum(axis = 1)

# drop dataset
drops["All_GR"] = drops['GR1'] + drops['GR2'] + drops['GR3'] + drops['GR4'] + drops['GR5']
drops["All_BT"] = drops['BT1'] + drops['BT2'] + drops['BT3'] + drops['BT4'] + drops['BT5']
drops["All_SS"] = drops['Ross'] + drops['Chandler']
drops["male"] = drops[["BT1","BT5","GR1","Ross","Chandler"]].sum(axis = 1)
drops["female"] = drops[["BT2","BT3","BT4","GR2","GR3","GR4","GR5"]].sum(axis = 1)

# target dataset
targets["All_GR"] = targets['GR1'] + targets['GR2'] + targets['GR3'] + targets['GR4'] + targets['GR5']
targets["All_BT"] = targets['BT1'] + targets['BT2'] + targets['BT3'] + targets['BT4'] + targets['BT5']
targets["All_SS"] = targets['Ross'] + targets['Chandler']
targets["male"] = targets[["BT1","BT5","GR1","Ross","Chandler"]].sum(axis = 1)
targets["female"] = targets[["BT2","BT3","BT4","GR2","GR3","GR4","GR5"]].sum(axis = 1)


In [8]:
eat.head()

Unnamed: 0,Date,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,Etc. Comments,All_GR,All_BT,All_SS,male,female
1,2017-12-05,0.0,5.0,3.0,6.0,5.0,4.0,4.0,0.0,0.0,2.0,3.0,1.0,33.0,,6.0,22.0,5.0,12.0,21.0
2,2017-12-07,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,,0.0,2.0,1.0,1.0,2.0
3,2017-12-09,,,,,,,,2.0,0.0,3.0,1.0,0.0,6.0,,6.0,,,2.0,4.0
4,2017-12-10,5.0,1.0,1.0,3.0,2.0,2.0,0.0,,,,,,14.0,,,8.0,6.0,7.0,7.0
5,2017-12-12,2.0,2.0,0.0,5.0,2.0,4.0,5.0,0.0,2.0,3.0,2.0,0.0,27.0,,7.0,16.0,4.0,9.0,18.0


In [9]:
drops.head()

Unnamed: 0,Drops,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
1,2017-12-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,3.0,2.0,1.0,0.0,0.0,3.0
2,2017-12-07,0.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,10.0,2.0,7.0,1.0,4.0,6.0
3,2017-12-09,,,,,,,,0.0,1.0,0.0,0.0,0.0,1.0,1.0,,,0.0,1.0
4,2017-12-10,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,1.0,,1.0,0.0,1.0,0.0
5,2017-12-12,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,2.0,0.0,0.0,3.0


In [10]:
targets.head()

Unnamed: 0,Date,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
1,2017-12-05,3.0,0.0,4.0,3.0,0.0,2.0,2.0,1.0,1.0,3.0,3.0,0.0,22.0,8.0,11.0,3.0,10.0,12.0
2,2017-12-07,4.0,5.0,2.0,2.0,5.0,3.0,5.0,0.0,0.0,2.0,2.0,0.0,30.0,4.0,17.0,9.0,16.0,14.0
3,2017-12-09,,,,,,,,1.0,3.0,1.0,0.0,3.0,8.0,8.0,,,1.0,7.0
4,2017-12-10,4.0,7.0,0.0,2.0,2.0,0.0,0.0,,,,,,15.0,,4.0,11.0,11.0,4.0
5,2017-12-12,3.0,1.0,3.0,2.0,4.0,1.0,1.0,1.0,1.0,5.0,1.0,3.0,26.0,11.0,11.0,4.0,9.0,17.0


# T test for Shutdown vs. Open Groups

# Number of Pieces Eaten

In [11]:
start_date = pd.to_datetime('03-14-2020')
end_date = pd.to_datetime('05-09-2020')
corona_times_eat = (eat['Date'] >= start_date) & (eat['Date'] < end_date)
corona_eat = eat[corona_times_eat]

# Create dataset of the all the time that the Aquarium was open to the public to compare
open_eat_times = (eat['Date'] < start_date) | (eat['Date'] >= end_date)
open_eat = eat[open_eat_times]

In [12]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(corona_eat[column], open_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

Ross : Test Statistic = -1.61396 P-Value = 0.10726
Chandler : Test Statistic = -0.40546 P-Value = 0.68534
BT1 : Test Statistic = -0.49799 P-Value = 0.61874
BT2 : Test Statistic = 2.87097 P-Value = 0.00429
BT3 : Test Statistic = -1.95872 P-Value = 0.05078
BT4 : Test Statistic = -0.20414 P-Value = 0.83834
BT5 : Test Statistic = -1.04968 P-Value = 0.29445
GR1 : Test Statistic = -1.05797 P-Value = 0.29066
GR2 : Test Statistic = -1.38793 P-Value = 0.16587
GR3 : Test Statistic = -0.11202 P-Value = 0.91086
GR4 : Test Statistic = -0.90367 P-Value = 0.36667
GR5 : Test Statistic = -1.37777 P-Value = 0.16898
Total:  : Test Statistic = -0.81231 P-Value = 0.41695


In [13]:
# Perform T-Test for species and genders
gender_and_species = eat[['male', 'female', 'All_GR', 'All_BT', 'All_SS']]
for column in gender_and_species:
    twosample_results = st.ttest_ind(corona_eat[column], open_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

male : Test Statistic = -1.09837 P-Value = 0.2725
female : Test Statistic = -0.1688 P-Value = 0.86602
All_GR : Test Statistic = -1.756 P-Value = 0.07979
All_BT : Test Statistic = -0.24767 P-Value = 0.8045
All_SS : Test Statistic = -1.44172 P-Value = 0.1501


After correcting for the number of T-tests, these are the significant p-values for each subset:

In [14]:
print('Individual Sharks:', round(0.05 / 12, 5))
print('Species:', round(0.05 / 3, 5))
print("Male/Female", round(0.05 / 2, 5))
print("Total:", 0.05)

Individual Sharks: 0.00417
Species: 0.01667
Male/Female 0.025
Total: 0.05


Based on these significant values, there are no significant p-values from any individual shark or subset as a result of our T-tests. We can conclude that there was no significant difference in the number of pieces eaten between the COVID lockdown and the time the Aquarium was open to the public. However, it does look like BT2 was marginally significant and it was just slightly over the updated p-value and should be mentioned.

# Number of Pieces Dropped

In [15]:
# Create Drops Data for Corona time and open to the public time to perform T-Test
corona_times_drop = (drops['Drops'] >= start_date) & (drops['Drops'] < end_date)
corona_drop = drops[corona_times_drop]

# Create dataset of the all the time that the Aquarium was open to the public to compare
open_drop_times = (drops['Drops'] < start_date) | (drops['Drops'] >= end_date)
open_drop = drops[open_drop_times]

In [16]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(corona_drop[column], open_drop[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0],5), "P-Value =", round(twosample_results[1],5))

Ross : Test Statistic = 1.58969 P-Value = 0.11263
Chandler : Test Statistic = 0.43535 P-Value = 0.66352
BT1 : Test Statistic = 0.24058 P-Value = 0.80999
BT2 : Test Statistic = 3.89072 P-Value = 0.00012
BT3 : Test Statistic = 1.67011 P-Value = 0.09562
BT4 : Test Statistic = 2.15166 P-Value = 0.03197
BT5 : Test Statistic = 3.07718 P-Value = 0.00222
GR1 : Test Statistic = -0.29879 P-Value = 0.76525
GR2 : Test Statistic = 0.35249 P-Value = 0.72464
GR3 : Test Statistic = 0.49377 P-Value = 0.62171
GR4 : Test Statistic = 0.61508 P-Value = 0.53882
GR5 : Test Statistic = 0.16739 P-Value = 0.86714
Total:  : Test Statistic = 3.54939 P-Value = 0.00042


In [17]:
# Perform T-Test for species and genders
gender_and_species_drop = drops[['male', 'female', 'All_GR', 'All_BT', 'All_SS']]
for column in gender_and_species_drop:
    twosample_results = st.ttest_ind(corona_drop[column], open_drop[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0],5), "P-Value =", round(twosample_results[1],5))

male : Test Statistic = 1.84951 P-Value = 0.06489
female : Test Statistic = 3.64153 P-Value = 0.0003
All_GR : Test Statistic = 0.57308 P-Value = 0.56689
All_BT : Test Statistic = 4.29426 P-Value = 2e-05
All_SS : Test Statistic = 1.17782 P-Value = 0.23951


Blacktip2 and Blcktip5 have significant p-values for drops, Total, as well as the Blacktip subset and the female subset

# Number of Times Targeted

In [18]:
# Create Target Data for Corona time and open to the public time to perform T-Test
corona_times_target = (targets['Date'] >= start_date) & (targets['Date'] < end_date)
corona_target = targets[corona_times_target]

# Create dataset of the all the time that the Aquarium was open to the public to compare
open_target_times = (targets['Date'] < start_date) | (targets['Date'] >= end_date)
open_target = targets[open_target_times]

In [19]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(corona_target[column], open_target[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0],5), "P-Value =", round(twosample_results[1],5))

Ross : Test Statistic = -2.2993 P-Value = 0.02196
Chandler : Test Statistic = -1.63313 P-Value = 0.10317
BT1 : Test Statistic = -1.53598 P-Value = 0.12527
BT2 : Test Statistic = 1.64821 P-Value = 0.10003
BT3 : Test Statistic = 0.75754 P-Value = 0.44914
BT4 : Test Statistic = 0.37776 P-Value = 0.70579
BT5 : Test Statistic = -1.69312 P-Value = 0.09115
GR1 : Test Statistic = -1.81694 P-Value = 0.06991
GR2 : Test Statistic = -0.29126 P-Value = 0.77099
GR3 : Test Statistic = -0.97071 P-Value = 0.33223
GR4 : Test Statistic = 0.29916 P-Value = 0.76496
GR5 : Test Statistic = -1.80879 P-Value = 0.07117
Total:  : Test Statistic = -1.05584 P-Value = 0.29148


In [20]:
# Perform T-Test for species and genders
gender_and_species_target = targets[['male', 'female', 'All_GR', 'All_BT', 'All_SS']]
for column in gender_and_species_target:
    twosample_results = st.ttest_ind(corona_target[column], open_target[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0],5), "P-Value =", round(twosample_results[1],5))

male : Test Statistic = -1.86336 P-Value = 0.06292
female : Test Statistic = 0.42712 P-Value = 0.66945
All_GR : Test Statistic = -1.45035 P-Value = 0.14768
All_BT : Test Statistic = -0.27715 P-Value = 0.7818
All_SS : Test Statistic = -2.35684 P-Value = 0.01887


There were no significant p-values in terms of number of pieces dropped between COVID times and regular open times. However, it does seem that Sand Bar Sharks are marginally significant because it was just slightly over the updated p-value and should be mentioned

# T tests for Light Training Vs. No Light Training

Since there were only 41 entries in our data that did not include light training, we will compare this time period with the 40 days immediately after light training began, the final 40 days of our dataset, and to all the days that did not have light training to see if there are significant differences in any of the dates.

# Number of Pieces Eaten

In [21]:
# Create dataset that includes days without light training
start_date = pd.to_datetime('02-15-2018')
#end_date = pd.to_datetime('05-09-2020')
no_light_e = (eat['Date'] < start_date)
no_light_eat = eat[no_light_e]

# Create dataset of first 40 days after light training was introduced
first_40 = (eat["Date"] >= start_date) & (eat["Date"] <= pd.to_datetime('04-28-2018'))
first_40_eat = eat[first_40]

# Create dataset of last 40 days of dataset
last_40 = eat.iloc[-40:]

# Create dataset of all non light training days
light = eat['Date'] >= start_date
light_eat = eat[light]

First we will perform the T-test between the no light training group and the first 40 days after light training was introduced:

In [22]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(no_light_eat[column], first_40_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

Ross : Test Statistic = -1.41029 P-Value = 0.16362
Chandler : Test Statistic = -1.97072 P-Value = 0.05337
BT1 : Test Statistic = 0.08784 P-Value = 0.9303
BT2 : Test Statistic = 0.14512 P-Value = 0.8851
BT3 : Test Statistic = 0.62961 P-Value = 0.53134
BT4 : Test Statistic = -0.64733 P-Value = 0.51989
BT5 : Test Statistic = 0.22961 P-Value = 0.81919
GR1 : Test Statistic = 1.15358 P-Value = 0.25317
GR2 : Test Statistic = 0.19689 P-Value = 0.84457
GR3 : Test Statistic = 1.61052 P-Value = 0.11245
GR4 : Test Statistic = 0.02983 P-Value = 0.9763
GR5 : Test Statistic = -1.54532 P-Value = 0.12744
Total:  : Test Statistic = -0.29934 P-Value = 0.76545


In [23]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_eat[column], first_40_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

male : Test Statistic = -1.20314 P-Value = 0.23247
female : Test Statistic = 0.11406 P-Value = 0.90947
All_GR : Test Statistic = 0.57841 P-Value = 0.56512
All_BT : Test Statistic = 0.33383 P-Value = 0.73969
All_SS : Test Statistic = -2.19435 P-Value = 0.03209


There were no significant p-values when conducting the T-tests between the time frame with no light training and the first 40 days of light training. Next we will perform the T-test between the no light training time frame and the last 40 days of the dataset with light training used:

In [145]:
# Perform T-Test for each individual Shark
unequal_var = ["Chandler"]
for column in All_Sharks.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_eat[column], last_40[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))
for column in All_Sharks[unequal_var]:
    twosample_results = st.ttest_ind(no_light_eat[column], last_40[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

Ross : Test Statistic = -1.26515 P-Value = 0.2107924558
BT1 : Test Statistic = -1.12999 P-Value = 0.2630534188
BT2 : Test Statistic = -2.74599 P-Value = 0.0079850899
BT3 : Test Statistic = -1.79207 P-Value = 0.0782491067
BT4 : Test Statistic = -3.42741 P-Value = 0.0011163556
BT5 : Test Statistic = -3.94279 P-Value = 0.0002194297
GR1 : Test Statistic = -3.30711 P-Value = 0.0016085972
GR2 : Test Statistic = -1.72522 P-Value = 0.0897235664
GR3 : Test Statistic = -0.28503 P-Value = 0.7766165661
GR4 : Test Statistic = -2.15341 P-Value = 0.0353844967
GR5 : Test Statistic = -3.87084 P-Value = 0.0002738381
Total:  : Test Statistic = -5.0205 P-Value = 3.1196e-06
Chandler : Test Statistic = -3.12246 P-Value = 0.003305346


In [25]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_eat[column], last_40[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

male : Test Statistic = -4.21881 P-Value = 6e-05
female : Test Statistic = -3.94547 P-Value = 0.00017
All_GR : Test Statistic = -4.64421 P-Value = 2e-05
All_BT : Test Statistic = -4.57361 P-Value = 3e-05
All_SS : Test Statistic = -3.75821 P-Value = 0.00039


Chandler, BT4, BT5, GR1, GR5, males, females, All_GR, All_BT, All_SS, and the total were all significant. This shows evidence that the sharks may have improved over time with the number of the fish they ate because of light training. Now we will compare the time frame of no light training with the whole time frame of light training:

In [26]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(no_light_eat[column], light_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

Ross : Test Statistic = -0.52385 P-Value = 0.6006489928
Chandler : Test Statistic = -1.69831 P-Value = 0.0901602489
BT1 : Test Statistic = -1.33223 P-Value = 0.1834797438
BT2 : Test Statistic = -2.51846 P-Value = 0.0121439573
BT3 : Test Statistic = -0.6316 P-Value = 0.5279772593
BT4 : Test Statistic = -2.93365 P-Value = 0.0035267242
BT5 : Test Statistic = -2.16195 P-Value = 0.0311677576
GR1 : Test Statistic = -1.46375 P-Value = 0.1439840385
GR2 : Test Statistic = -1.34015 P-Value = 0.1808965732
GR3 : Test Statistic = 0.36902 P-Value = 0.7122952273
GR4 : Test Statistic = -1.19187 P-Value = 0.2339604589
GR5 : Test Statistic = -2.40969 P-Value = 0.0163802393
Total:  : Test Statistic = -3.37587 P-Value = 0.0007851421


In [27]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_eat[column], light_eat[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

male : Test Statistic = -2.25547 P-Value = 0.02448
female : Test Statistic = -3.15556 P-Value = 0.00168
All_GR : Test Statistic = -2.18877 P-Value = 0.02914
All_BT : Test Statistic = -3.56903 P-Value = 0.0004
All_SS : Test Statistic = -1.55598 P-Value = 0.12044


The total p-value was significant, BT4 along with the male, female, and All_BT subsets. This is a good indicator that the light training has a significant effect on the number of fish the sharks eat.

# Number of Drops

In [28]:
# Create dataset that includes days without light training

no_light_d = (drops['Drops'] < start_date)
no_light_drops = drops[no_light_d]

# Create dataset of first 40 days after light training was introduced
first_40_d = (drops["Drops"] >= start_date) & (drops["Drops"] <= pd.to_datetime('04-28-2018'))
first_40_drops = drops[first_40_d]

# Create dataset of last 40 days of dataset
last_40_d = drops.iloc[-40:]

# Create dataset of all non light training days
light_d = drops['Drops'] >= start_date
light_drops = drops[light_d]

First we will perform the T-test between the no light training group and the first 40 days after light training was introduced:

In [147]:
# Perform T-Test for each individual Shark
unequal_var = ["GR1","GR2"]
for column in All_Sharks.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], first_40_drops[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in All_Sharks[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], first_40_drops[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

Ross : Test Statistic = -0.45676 P-Value = 0.64955
Chandler : Test Statistic = -2.18551 P-Value = 0.0329
BT1 : Test Statistic = 0.61463 P-Value = 0.54116
BT2 : Test Statistic = 0.09473 P-Value = 0.92485
BT3 : Test Statistic = 0.87148 P-Value = 0.38702
BT4 : Test Statistic = -0.75221 P-Value = 0.45492
BT5 : Test Statistic = 0.87357 P-Value = 0.38589
GR3 : Test Statistic = 1.31692 P-Value = 0.19279
GR4 : Test Statistic = 0.94443 P-Value = 0.34867
GR5 : Test Statistic = -0.70028 P-Value = 0.48641
Total:  : Test Statistic = 0.40096 P-Value = 0.68953
GR1 : Test Statistic = 1.89545 P-Value = 0.0653
GR2 : Test Statistic = 1.07753 P-Value = 0.28727


In [148]:
# Perform T-Test for species and genders
unequal_var = ["All_GR"]
for column in gender_and_species.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], first_40_drops[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in gender_and_species[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], first_40_drops[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))


male : Test Statistic = 0.33205 P-Value = 0.74073
female : Test Statistic = 0.96797 P-Value = 0.33601
All_BT : Test Statistic = 0.85347 P-Value = 0.39685
All_SS : Test Statistic = -1.82788 P-Value = 0.07271
All_GR : Test Statistic = 1.52562 P-Value = 0.13569


There were no significant P-values between the no light training time frame and the subset of the first 40 days. Now we will compare the last 40 days of the dataset to the days with no light training:

In [149]:
# Perform T-Test for each individual Shark
unequal_var = ["Ross","BT1","GR3","GR4"]
for column in All_Sharks.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], last_40_d[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in All_Sharks[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], last_40_d[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

Chandler : Test Statistic = -0.79993 P-Value = 0.42701
BT2 : Test Statistic = -0.3721 P-Value = 0.71117
BT3 : Test Statistic = 1.21852 P-Value = 0.22796
BT4 : Test Statistic = 0.88681 P-Value = 0.37884
BT5 : Test Statistic = 0.12891 P-Value = 0.89787
GR1 : Test Statistic = 1.9847 P-Value = 0.05183
GR2 : Test Statistic = -0.8929 P-Value = 0.37554
GR5 : Test Statistic = -2.03061 P-Value = 0.04681
Total:  : Test Statistic = 1.09082 P-Value = 0.27871
Ross : Test Statistic = 2.50405 P-Value = 0.01815
BT1 : Test Statistic = 1.31062 P-Value = 0.19855
GR3 : Test Statistic = 1.67466 P-Value = 0.10138
GR4 : Test Statistic = 1.61521 P-Value = 0.11389


In [150]:
# Perform T-Test for species and genders
unequal_var = ["male","All_GR"]
for column in gender_and_species.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], last_40_d[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in gender_and_species[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], last_40_d[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

female : Test Statistic = 0.93319 P-Value = 0.3536
All_BT : Test Statistic = 1.35082 P-Value = 0.182
All_SS : Test Statistic = 0.37817 P-Value = 0.70668
male : Test Statistic = 1.32292 P-Value = 0.19167
All_GR : Test Statistic = 1.05107 P-Value = 0.29952


There were no significant P-values between the subset of no light training and the last 40 days subset. Now we will look at the no light training subset and the light training subset as a whole:

In [153]:
# Perform T-Test for each individual Shark
unequal_var = ["BT1","GR1","GR3"]
for column in All_Sharks.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], light_drops[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in All_Sharks[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], light_drops[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

Ross : Test Statistic = 1.9177 P-Value = 0.0558
Chandler : Test Statistic = 0.45865 P-Value = 0.64671
BT2 : Test Statistic = -0.4617 P-Value = 0.64453
BT3 : Test Statistic = 0.63897 P-Value = 0.52318
BT4 : Test Statistic = -0.30976 P-Value = 0.75689
BT5 : Test Statistic = 0.8157 P-Value = 0.41512
GR2 : Test Statistic = -0.3634 P-Value = 0.71648
GR4 : Test Statistic = 2.04736 P-Value = 0.04122
GR5 : Test Statistic = -0.36774 P-Value = 0.71325
Total:  : Test Statistic = 1.74169 P-Value = 0.08209
BT1 : Test Statistic = 1.13014 P-Value = 0.26747
GR1 : Test Statistic = 2.05487 P-Value = 0.04841
GR3 : Test Statistic = 1.87145 P-Value = 0.07072


In [154]:
# Perform T-Test for species and genders
unequal_var = ["All_GR","male"]
for column in gender_and_species.drop(columns = unequal_var):
    twosample_results = st.ttest_ind(no_light_drops[column], light_drops[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))
for column in gender_and_species[unequal_var]:
    twosample_results = st.ttest_ind(no_light_drops[column], light_drops[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 5))

female : Test Statistic = 1.05099 P-Value = 0.2937
All_BT : Test Statistic = 1.26563 P-Value = 0.20633
All_SS : Test Statistic = 1.36852 P-Value = 0.17185
All_GR : Test Statistic = 1.58365 P-Value = 0.12349
male : Test Statistic = 1.51139 P-Value = 0.13853


There are no significant values so this is a good indicator the light training does not significantly effect the number of drops.

# Number of Targets

In [35]:
# Create dataset that includes days without light training

no_light_t = (targets['Date'] < start_date)
no_light_targets = targets[no_light_t]

# Create dataset of first 40 days after light training was introduced
first_40_t = (targets["Date"] >= start_date) & (targets["Date"] <= pd.to_datetime('04-28-2018'))
first_40_targets = drops[first_40_t]

# Create dataset of last 40 days of dataset
last_40_t = targets.iloc[-40:]

# Create dataset of all non light training days
light_t = drops['Drops'] >= start_date
light_targets = drops[light_t]

First we will compare the no light training subset to the first 40 days after light training started subset:

In [157]:
# Perform T-Test for each individual Shark

twosample_results = st.ttest_ind(no_light_targets["BT2"], first_40_targets["BT2"],nan_policy='omit')
print("BT2", ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))
for column in All_Sharks.drop(columns = ["BT2"]):
    twosample_results = st.ttest_ind(no_light_targets[column], first_40_targets[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

BT2 : Test Statistic = 6.79503 P-Value = 6e-09
Ross : Test Statistic = 7.4382 P-Value = 2.67e-08
Chandler : Test Statistic = 6.86537 P-Value = 1.035e-07
BT1 : Test Statistic = 5.61988 P-Value = 3.0515e-06
BT3 : Test Statistic = 8.37721 P-Value = 1.3e-09
BT4 : Test Statistic = 7.08617 P-Value = 3.94e-08
BT5 : Test Statistic = 5.92226 P-Value = 1.2915e-06
GR1 : Test Statistic = 6.41837 P-Value = 3.603e-07
GR2 : Test Statistic = 5.54979 P-Value = 4.2045e-06
GR3 : Test Statistic = 6.87081 P-Value = 9.91e-08
GR4 : Test Statistic = 5.12867 P-Value = 1.41262e-05
GR5 : Test Statistic = 3.027 P-Value = 0.0049112821
Total:  : Test Statistic = 10.16325 P-Value = 0.0


In [158]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_targets[column], first_40_targets[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

male : Test Statistic = 7.69344 P-Value = 2e-09
female : Test Statistic = 10.71679 P-Value = 0.0
All_GR : Test Statistic = 8.85028 P-Value = 6e-10
All_BT : Test Statistic = 9.42654 P-Value = 1e-10
All_SS : Test Statistic = 7.76162 P-Value = 1.13e-08


Every single P-value is significant. This would make sense since the light training probably was a shock to the sharks at first and either didn't target as much or targeted more. Now we will compare the non-light training subset to the last 40 days of light training subset:

In [38]:
# Perform T-Test for each individual Shark
for column in All_Sharks:
    twosample_results = st.ttest_ind(no_light_targets[column], last_40_t[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

Ross : Test Statistic = -1.68722 P-Value = 0.0969333882
Chandler : Test Statistic = -1.53604 P-Value = 0.1299654646
BT1 : Test Statistic = 0.49784 P-Value = 0.6204777804
BT2 : Test Statistic = -3.12479 P-Value = 0.002777852
BT3 : Test Statistic = -1.03103 P-Value = 0.3068063745
BT4 : Test Statistic = -4.08836 P-Value = 0.0001354664
BT5 : Test Statistic = -4.00907 P-Value = 0.0001763462
GR1 : Test Statistic = -1.32542 P-Value = 0.1901401276
GR2 : Test Statistic = -2.83406 P-Value = 0.0062826497
GR3 : Test Statistic = 1.69134 P-Value = 0.0960482222
GR4 : Test Statistic = -1.49635 P-Value = 0.1398924537
GR5 : Test Statistic = -3.95334 P-Value = 0.0002088523
Total:  : Test Statistic = -2.64137 P-Value = 0.009972226


In [39]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_targets[column], last_40_t[column],nan_policy='omit')
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

male : Test Statistic = -1.69109 P-Value = 0.0948112714
female : Test Statistic = -3.30149 P-Value = 0.0014524086
All_GR : Test Statistic = -2.44215 P-Value = 0.0176157187
All_BT : Test Statistic = -3.25035 P-Value = 0.0019211677
All_SS : Test Statistic = -1.8115 P-Value = 0.0752424523


BT2, BT4, BT5, GR2 (marginally), GR5 as well as female, total, All_GR (marginally), and All_BT subsets. Now we will look at the non-light training subset vs. the light training subset as a whole:

In [159]:
# Perform T-Test for each individual Shark
for column in All_Sharks.drop(columns = ["BT2"]):
    twosample_results = st.ttest_ind(no_light_targets[column], light_targets[column],nan_policy='omit',equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))
twosample_results = st.ttest_ind(no_light_targets["BT2"], light_targets["BT2"],nan_policy='omit')
print("BT2", ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

Ross : Test Statistic = 7.79666 P-Value = 1.33e-08
Chandler : Test Statistic = 7.58439 P-Value = 2.29e-08
BT1 : Test Statistic = 6.01249 P-Value = 1.5046e-06
BT3 : Test Statistic = 8.46227 P-Value = 2.3e-09
BT4 : Test Statistic = 7.46314 P-Value = 2.86e-08
BT5 : Test Statistic = 5.9307 P-Value = 1.8298e-06
GR1 : Test Statistic = 6.48769 P-Value = 3.53e-07
GR2 : Test Statistic = 5.07688 P-Value = 1.80169e-05
GR3 : Test Statistic = 7.06752 P-Value = 7.26e-08
GR4 : Test Statistic = 5.28318 P-Value = 1.03196e-05
GR5 : Test Statistic = 3.13772 P-Value = 0.0037926659
Total:  : Test Statistic = 10.43655 P-Value = 0.0
BT2 : Test Statistic = 11.31421 P-Value = 0.0


In [160]:
# Perform T-Test for species and genders
for column in gender_and_species:
    twosample_results = st.ttest_ind(no_light_targets[column], light_targets[column],nan_policy='omit', equal_var = False)
    print(column, ": Test Statistic =" ,round(twosample_results[0], 5), "P-Value =", round(twosample_results[1], 10))

male : Test Statistic = 8.04001 P-Value = 8e-10
female : Test Statistic = 10.81423 P-Value = 0.0
All_GR : Test Statistic = 8.8951 P-Value = 6e-10
All_BT : Test Statistic = 9.61757 P-Value = 1e-10
All_SS : Test Statistic = 8.32902 P-Value = 3.5e-09


All of the P-values are significant, so it seems that targeting was significantly effected by light training.

# Water Temperature Linear Regression to see if temperature had any affect on how much they ate, dropped, and targeted.

In [294]:
from sklearn.linear_model import LinearRegression

In [314]:
model = LinearRegression()
model.fit(np.array(other_factors.Temperature[eat.Chandler.notna().reset_index(drop = True)]).reshape((-1, 1)),eat["Chandler"].dropna())

LinearRegression()

In [320]:
print('intercept:', model.intercept_)
print('slope:', model.coef_)
print("y =",round(model.coef_[0],4),"x -",round(-1*model.intercept_,2))

intercept: -3.4890945546992467
slope: [0.08529267]
y = 0.0853 x - 3.49


In [321]:
model.score(np.array(other_factors.Temperature[eat.Chandler.notna().reset_index(drop = True)]).reshape((-1, 1)),eat["Chandler"].dropna())

0.0019882898707391794

In [328]:
rsq = []
for column in eat.drop(columns = "Etc. Comments").iloc[:,1:]:
    model = LinearRegression()
    model.fit(np.array(other_factors.Temperature[eat[column].notna().reset_index(drop = True)]).reshape((-1, 1)),eat[column].dropna())
    rsq.append(round(model.score(np.array(other_factors.Temperature[eat[column].notna().reset_index(drop = True)]).reshape((-1, 1)),eat[column].dropna()),5))
rsq

[0.00019,
 0.00199,
 0.00105,
 0.00495,
 0.00217,
 0.02497,
 0.00151,
 0.01629,
 0.00262,
 0.00016,
 6e-05,
 0.01364,
 0.00844,
 0.01081,
 0.01838,
 0.00168,
 0.00271,
 0.00995]

In [329]:
rsq = []
for column in drops.iloc[:,1:]:
    model = LinearRegression()
    model.fit(np.array(other_factors.Temperature[drops[column].notna().reset_index(drop = True)]).reshape((-1, 1)),drops[column].dropna())
    rsq.append(round(model.score(np.array(other_factors.Temperature[drops[column].notna().reset_index(drop = True)]).reshape((-1, 1)),drops[column].dropna()),5))
rsq

[0.00282,
 0.00754,
 0.0136,
 0.0157,
 0.01122,
 0.01249,
 0.00187,
 0.00406,
 0.00669,
 0.00203,
 0.01977,
 0.00106,
 0.03424,
 0.01834,
 0.03942,
 0.00912,
 0.01424,
 0.0309]

In [330]:
rsq = []
for column in targets.iloc[:,1:]:
    model = LinearRegression()
    model.fit(np.array(other_factors.Temperature[targets[column].notna().reset_index(drop = True)]).reshape((-1, 1)),targets[column].dropna())
    rsq.append(round(model.score(np.array(other_factors.Temperature[targets[column].notna().reset_index(drop = True)]).reshape((-1, 1)),targets[column].dropna()),5))
rsq

[0.00145,
 0.00146,
 0.00816,
 2e-05,
 3e-05,
 2e-05,
 0.01044,
 0.006,
 0.01467,
 0.0011,
 0.00677,
 0.02721,
 0.00083,
 0.02154,
 1e-05,
 0.00208,
 0.00015,
 0.00203]

# Time of Year Affect on Sharks

In [None]:
#Split the dataframes into different seasons

In [350]:
def season_of_date(date):
    year = str(date.year)
    seasons = {'spring': pd.date_range(start=year+'/03/21', end=year+'/06/20'),
               'summer': pd.date_range(start=year+'/06/21', end=year+'/09/22'),
               'fall': pd.date_range(start=year+'/09/23', end=year+'/12/20')}
    if date in seasons['spring']:
        return 'spring'
    if date in seasons['summer']:
        return 'summer'
    if date in seasons['fall']:
        return 'fall'
    else:
        return 'winter'

# Assuming df has a date column of type `datetime`
#df['season'] = df.date.map(season_of_date)

In [351]:
eat.Date.map(season_of_date)

1        fall
2        fall
3        fall
4        fall
5        fall
        ...  
578    summer
579      fall
580      fall
581      fall
582      fall
Name: Date, Length: 582, dtype: object

In [355]:
eat_summer = eat[eat.Date.map(season_of_date)=="summer"]
eat_fall = eat[eat.Date.map(season_of_date)=="fall"]
eat_winter = eat[eat.Date.map(season_of_date)=="winter"]
eat_spring = eat[eat.Date.map(season_of_date)=="spring"]

In [356]:
targets_summer = targets[targets.Date.map(season_of_date)=="summer"]
targets_fall = targets[targets.Date.map(season_of_date)=="fall"]
targets_winter = targets[targets.Date.map(season_of_date)=="winter"]
targets_spring = targets[targets.Date.map(season_of_date)=="spring"]

In [357]:
drops_summer = drops[eat.Date.map(season_of_date)=="summer"]
drops_fall = drops[eat.Date.map(season_of_date)=="fall"]
drops_winter = drops[eat.Date.map(season_of_date)=="winter"]
drops_spring = drops[eat.Date.map(season_of_date)=="spring"]

# Confidence Intervals

## Corona Vs. Non-Corona

In [163]:
print("Corona Eating Mean CI for BT2",st.t.interval(alpha=0.95, df=len(corona_eat["BT2"])-1, loc=np.mean(corona_eat["BT2"]), scale=st.sem(corona_eat["BT2"],nan_policy='omit')))
print("Open Eating Mean CI for BT2",st.t.interval(alpha=0.95, df=len(open_eat["BT2"])-1, loc=np.mean(open_eat["BT2"]), scale=st.sem(open_eat["BT2"],nan_policy='omit')))

Corona Eating Mean CI for BT2 (3.1422010241574245, 5.466494628016488)
Open Eating Mean CI for BT2 (2.540346838548647, 3.016279667475449)


In [167]:
print("Corona Dropping Mean CI for BT2",st.t.interval(alpha=0.95, df=len(corona_drop["BT2"])-1, loc=np.mean(corona_drop["BT2"]), scale=st.sem(corona_drop["BT2"],nan_policy='omit')))
print("Open Dropping Mean CI for BT2",st.t.interval(alpha=0.95, df=len(open_drop["BT2"])-1, loc=np.mean(open_drop["BT2"]), scale=st.sem(open_drop["BT2"],nan_policy='omit')))

Corona Dropping Mean CI for BT2 (1.0788439446594555, 2.486373446644892)
Open Dropping Mean CI for BT2 (0.7028123803003375, 0.9212840052418311)


In [221]:
def overlap(start1, end1, start2, end2):
    """Does the range (start1, end1) overlap with (start2, end2)?"""
    return (
        start1 <= start2 <= end1 or
        start1 <= end2 <= end1 or
        start2 <= start1 <= end2 or
        start2 <= end1 <= end2
    )

True

In [224]:
test = pd.DataFrame()
sig = ["BT2", "BT5", "female", "All_BT", "Total: "]
for column in sig:
    x = st.t.interval(alpha=0.95, df=len(corona_drop[column])-1, loc=np.mean(corona_drop[column]), scale=st.sem(corona_drop[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(open_drop[column])-1, loc=np.mean(open_drop[column]), scale=st.sem(open_drop[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        #Graph it
        corona_mean = corona_drop[column].mean()
        open_mean = open_drop[column].mean()
        corona_int = st.t.interval(alpha=0.95, df=len(corona_drop[column])-1, loc=np.mean(corona_drop[column]), scale=st.sem(corona_drop[column],nan_policy='omit'))
        open_int = st.t.interval(alpha=0.95, df=len(open_drop[column])-1, loc=np.mean(open_drop[column]), scale=st.sem(open_drop[column],nan_policy='omit'))
        corona_error = (corona_int[1] - corona_int[0])/2
        open_error = (open_int[1] - open_int[0])/2
        pd.Series()
        data = {'Variable':['Corona','Open'], 'Mean':[corona_mean,open_mean],'error':[corona_error,open_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Corona",x)
        print("Open",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Dropping during Corona shutdown vs. non shutdown")
  





BT5 Has Confidence Intervals that overlap
Here are the confidence intervals:
Corona (0.6508513025502022, 1.7301010784021789)
Open (0.48878682997915057, 0.6582011218280784)


In [183]:
test = pd.DataFrame()
sig = ["All_SS"]
for column in sig:
    corona_mean = corona_target[column].mean()
    open_mean = open_target[column].mean()
    corona_int = st.t.interval(alpha=0.95, df=len(corona_target[column])-1, loc=np.mean(corona_target[column]), scale=st.sem(corona_target[column],nan_policy='omit'))
    open_int = st.t.interval(alpha=0.95, df=len(open_target[column])-1, loc=np.mean(open_target[column]), scale=st.sem(open_target[column],nan_policy='omit'))
    corona_error = (corona_int[1] - corona_int[0])/2
    open_error = (open_int[1] - open_int[0])/2
    pd.Series()
    data = {'Variable':['Corona','Open'], 'Mean':[corona_mean,open_mean],'error':[corona_error,open_error],'column':[column,column]} 
    data = pd.DataFrame(data)
    test = pd.concat([test, data])
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Targeting during Corona shutdown vs. non shutdown")
  





In [225]:
test = pd.DataFrame()
sig = ["BT2"]
for column in sig:
    x = st.t.interval(alpha=0.95, df=len(corona_eat[column])-1, loc=np.mean(corona_eat[column]), scale=st.sem(corona_eat[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(open_eat[column])-1, loc=np.mean(open_eat[column]), scale=st.sem(open_eat[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        corona_mean = corona_eat[column].mean()
        open_mean = open_eat[column].mean()
        corona_int = st.t.interval(alpha=0.95, df=len(corona_eat[column])-1, loc=np.mean(corona_eat[column]), scale=st.sem(corona_eat[column],nan_policy='omit'))
        open_int = st.t.interval(alpha=0.95, df=len(open_eat[column])-1, loc=np.mean(open_eat[column]), scale=st.sem(open_eat[column],nan_policy='omit'))
        corona_error = (corona_int[1] - corona_int[0])/2
        open_error = (open_int[1] - open_int[0])/2
        pd.Series()
        data = {'Variable':['Corona','Open'], 'Mean':[corona_mean,open_mean],'error':[corona_error,open_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Corona",x)
        print("Open",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Eating during Corona shutdown vs. non shutdown")
  





## Light Training of first 40 days vs. non-light training

In [234]:
test = pd.DataFrame()
for column in light_targets.iloc[:,1:]:
    x = st.t.interval(alpha=0.95, df=len(first_40_targets[column])-1, loc=np.mean(first_40_targets[column]), scale=st.sem(first_40_targets[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        light_mean = first_40_targets[column].mean()
        no_light_mean = no_light_targets[column].mean()
        light_int = st.t.interval(alpha=0.95, df=len(first_40_targets[column])-1, loc=np.mean(first_40_targets[column]), scale=st.sem(first_40_targets[column],nan_policy='omit'))
        no_light_int = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
        light_error = (light_int[1] - light_int[0])/2
        no_light_error = (no_light_int[1] - no_light_int[0])/2
        pd.Series()
        data = {'Variable':['First 40 Days Of Light Training','Non-Light Training'], 'Mean':[light_mean,no_light_mean],'error':[light_error,no_light_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Light Training:",x)
        print("Non-Light Training:",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", category_orders = {"column":test.sort_values(by = "Mean", ascending = False)["column"]},
       title = "Subsets that were significantly different in Targeting going from no light training to light training for first 40 days")
  





## Light Training of 40 most recent days vs. non-light training

In [235]:
test = pd.DataFrame()
sig = ["Chandler", "BT2", "BT4","BT5", "GR1", "GR5", "male", "female", "All_BT" ,"All_GR", "All_SS","Total: "]
for column in sig:
    x = st.t.interval(alpha=0.95, df=len(last_40[column])-1, loc=np.mean(last_40[column]), scale=st.sem(last_40[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(no_light_eat[column])-1, loc=np.mean(no_light_eat[column]), scale=st.sem(no_light_eat[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        light_mean = last_40[column].mean()
        no_light_mean = no_light_eat[column].mean()
        light_int = st.t.interval(alpha=0.95, df=len(last_40[column])-1, loc=np.mean(last_40[column]), scale=st.sem(last_40[column],nan_policy='omit'))
        no_light_int = st.t.interval(alpha=0.95, df=len(no_light_eat[column])-1, loc=np.mean(no_light_eat[column]), scale=st.sem(no_light_eat[column],nan_policy='omit'))
        light_error = (light_int[1] - light_int[0])/2
        no_light_error = (no_light_int[1] - no_light_int[0])/2
        pd.Series()
        data = {'Variable':['Most Recent 40 Days Of Light Training','Non-Light Training'], 'Mean':[light_mean,no_light_mean],'error':[light_error,no_light_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Light Training:",x)
        print("Non-Light Training:",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Eating going from no light training to light training for most recent 40 days",category_orders = {"column":test.sort_values(by = "Mean", ascending = False)["column"]})
#BT2 is marginal 





BT2 Has Confidence Intervals that overlap
Here are the confidence intervals:
Light Training: (2.3589882272303697, 4.24101177276963)
Non-Light Training: (1.147497719584826, 2.4008893771893676)


In [236]:
test = pd.DataFrame()
sig = ["BT2", "BT4", "BT5", "GR2", "GR5", "female", "All_GR", "All_BT", "Total: "]
for column in sig:
    x = st.t.interval(alpha=0.95, df=len(last_40_t[column])-1, loc=np.mean(last_40_t[column]), scale=st.sem(last_40_t[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        light_mean = last_40_t[column].mean()
        no_light_mean = no_light_targets[column].mean()
        light_int = st.t.interval(alpha=0.95, df=len(last_40_t[column])-1, loc=np.mean(last_40_t[column]), scale=st.sem(last_40_t[column],nan_policy='omit'))
        no_light_int = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
        light_error = (light_int[1] - light_int[0])/2
        no_light_error = (no_light_int[1] - no_light_int[0])/2
        pd.Series()
        data = {'Variable':['Most Recent 40 Days Of Light Training','Non-Light Training'], 'Mean':[light_mean,no_light_mean],'error':[light_error,no_light_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Light Training:",x)
        print("Non-Light Training:",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Targeting going from no light training to light training for most recent 40 days",category_orders = {"column":test.sort_values(by = "Mean", ascending = False)["column"]})
#GR2 and all_gr are marginal 

All_GR Has Confidence Intervals that overlap
Here are the confidence intervals:
Light Training: (15.552234074615262, 20.847765925384735)
Non-Light Training: (10.62047810925428, 16.347263826229593)
Total:  Has Confidence Intervals that overlap
Here are the confidence intervals:
Light Training: (42.436126613591796, 57.51387338640821)
Non-Light Training: (30.47356851581103, 43.47643148418897)






## Light Training all-time vs. non-light training

In [237]:
test = pd.DataFrame()
sig = ["BT4", "male", "female","All_BT","Total: "]
for column in sig:
    x = st.t.interval(alpha=0.95, df=len(light_eat[column])-1, loc=np.mean(light_eat[column]), scale=st.sem(light_eat[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(no_light_eat[column])-1, loc=np.mean(no_light_eat[column]), scale=st.sem(no_light_eat[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        light_mean = light_eat[column].mean()
        no_light_mean = no_light_eat[column].mean()
        light_int = st.t.interval(alpha=0.95, df=len(light_eat[column])-1, loc=np.mean(light_eat[column]), scale=st.sem(light_eat[column],nan_policy='omit'))
        no_light_int = st.t.interval(alpha=0.95, df=len(no_light_eat[column])-1, loc=np.mean(no_light_eat[column]), scale=st.sem(no_light_eat[column],nan_policy='omit'))
        light_error = (light_int[1] - light_int[0])/2
        no_light_error = (no_light_int[1] - no_light_int[0])/2
        pd.Series()
        data = {'Variable':['Light Training','Non-Light Training'], 'Mean':[light_mean,no_light_mean],'error':[light_error,no_light_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Light Training:",x)
        print("Non-Light Training:",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Eating going from no light training to light training",category_orders = {"column":test.sort_values(by = "Mean", ascending = False)["column"]})
  





In [238]:
test = pd.DataFrame()
for column in light_targets.iloc[:,1:]:
    x = st.t.interval(alpha=0.95, df=len(light_targets[column])-1, loc=np.mean(light_targets[column]), scale=st.sem(light_targets[column],nan_policy='omit'))
    y = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
    if overlap(x[0],x[1],y[0],y[1])==False:
        light_mean = light_targets[column].mean()
        no_light_mean = no_light_targets[column].mean()
        light_int = st.t.interval(alpha=0.95, df=len(light_targets[column])-1, loc=np.mean(light_targets[column]), scale=st.sem(light_targets[column],nan_policy='omit'))
        no_light_int = st.t.interval(alpha=0.95, df=len(no_light_targets[column])-1, loc=np.mean(no_light_targets[column]), scale=st.sem(no_light_targets[column],nan_policy='omit'))
        light_error = (light_int[1] - light_int[0])/2
        no_light_error = (no_light_int[1] - no_light_int[0])/2
        pd.Series()
        data = {'Variable':['Light Training','Non-Light Training'], 'Mean':[light_mean,no_light_mean],'error':[light_error,no_light_error],'column':[column,column]} 
        data = pd.DataFrame(data)
        test = pd.concat([test, data])
    else:
        print(column,"Has Confidence Intervals that overlap")
        print("Here are the confidence intervals:")
        print("Light Training:",x)
        print("Non-Light Training:",y)
test

px.bar(test, x = "column", y = "Mean", color = "Variable", error_y = "error", barmode = "group", title = "Subsets that were significantly different in Targeting going from no light training to light training",category_orders = {"column":test.sort_values(by = "Mean", ascending = False)["column"]})






# Checking Assumptions

## 1. Check if errors of dist. are normal through qq plots

In [245]:
df = pd.DataFrame()
for column in corona_eat.drop(columns = "Etc. Comments").iloc[:,1:].columns:
    resid = []
    corona_mean = corona_eat[column].mean()
    corona_std = np.std(corona_eat[column])
    open_mean = open_eat[column].mean()
    open_std = np.std(open_eat[column])
    for date in eat.Date:
        if (date >= pd.to_datetime('03-14-2020')) & (date < pd.to_datetime('05-09-2020')):
            resid.append((corona_mean - eat[eat["Date"]==date][column].values[0])/corona_std)
        else:
            resid.append((open_mean - eat[eat["Date"]==date][column].values[0])/open_std)
    df = pd.concat([df, pd.Series(resid,name = column)], axis = 1)
df
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,1.264555,-0.752995,-0.434675,-1.306980,-1.110323,-0.543469,-0.371444,0.994478,0.991042,-0.082193,-0.906527,0.468374,-1.272287,0.485810,-1.468629,0.373629,-0.348729,-1.738560
1,0.913710,1.034939,0.945861,0.315748,1.024914,1.260847,1.287002,0.994478,0.991042,1.094011,1.059721,1.085437,1.829300,1.964620,1.848537,1.366789,1.377323,1.546290
2,,,,,,,,-0.466437,0.991042,-0.670295,0.404305,1.085437,1.519141,0.485810,,,1.220409,1.200516
3,-0.489668,0.677352,0.485683,-0.089934,0.170819,0.358689,1.287002,,,,,,0.692051,,0.853387,0.125339,0.435840,0.681856
4,0.562866,0.319765,0.945861,-0.901298,0.170819,-0.543469,-0.786055,0.994478,-0.452054,-0.670295,-0.251111,1.085437,-0.651970,0.239342,-0.473479,0.621919,0.122013,-1.219899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,-0.840513,1.034939,0.485683,0.315748,-1.537371,1.260847,-0.371444,0.264021,0.991042,1.094011,-2.217359,0.468374,-0.548584,0.239342,0.024096,0.125339,-0.348729,-0.528352
578,1.264555,-2.183343,0.945861,-1.712662,-1.110323,-0.994548,-1.615278,0.994478,0.269494,-0.082193,-0.251111,-2.616945,-2.409536,-0.746532,-1.800346,-0.619532,-0.976384,-2.948768
579,,,,,,,,-1.927351,0.991042,-0.082193,-0.906527,-0.765754,0.898823,-0.993000,,,0.906582,0.508969
580,-0.489668,-1.110582,-0.434675,-1.306980,-0.683276,-1.445627,0.043168,,,,,,-1.272287,,-1.468629,-1.116112,-1.133298,-0.874126


In [262]:
px.histogram(df["female"])

In [263]:
df2 = pd.DataFrame()
for column in light_eat.drop(columns = "Etc. Comments").iloc[:,1:].columns:
    resid = []
    light_mean = light_eat[column].mean()
    light_std = np.std(light_eat[column])
    no_light_mean = no_light_eat[column].mean()
    no_light_std = np.std(no_light_eat[column])
    for date in eat.Date:
        if  (date < pd.to_datetime('02-15-2018')):
            resid.append((light_mean - eat[eat["Date"]==date][column].values[0])/light_std)
        else:
            resid.append((no_light_mean - eat[eat["Date"]==date][column].values[0])/no_light_std)
    df2 = pd.concat([df2, pd.Series(resid,name = column)], axis = 1)
df2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,1.244786,-0.726181,-0.420426,-1.208749,-1.138389,-0.490154,-0.353893,1.023944,0.991227,-0.086965,-0.901766,0.480332,-1.244907,0.496762,-1.440297,0.373878,-0.329781,-1.724244
1,0.896314,1.039504,0.952142,0.371848,1.024657,1.255953,1.297606,1.023944,0.991227,1.070669,1.066605,1.091663,1.864002,1.979742,1.910766,1.356438,1.376764,1.598498
2,,,,,,,,-0.468771,0.991227,-0.665782,0.410481,1.091663,1.553111,0.496762,,,1.221624,1.248736
3,-0.497574,0.686367,0.494619,-0.023301,0.159438,0.382900,1.297606,,,,,,0.724069,,0.905447,0.128238,0.445921,0.724092
4,0.547842,0.333230,0.952142,-0.813600,0.159438,-0.490154,-0.766767,1.023944,-0.448623,-0.665782,-0.245642,1.091663,-0.623125,0.249598,-0.434978,0.619518,0.135640,-1.199601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,-0.894630,1.138869,0.382627,-0.132954,-1.770247,1.029935,-1.057497,0.000000,0.857075,1.319247,-2.969287,0.059976,-1.395427,-0.157966,-0.698360,-0.199840,-1.020285,-1.046462
578,1.086337,-3.825889,1.080358,-3.076936,-1.316713,-2.100260,-2.787946,0.622495,0.026784,-0.021627,-0.568038,-4.588165,-3.853243,-1.137354,-2.832239,-1.129097,-1.939662,-3.580932
579,,,,,,,,-1.867485,0.857075,-0.021627,-1.368454,-1.799280,0.516208,-1.382201,,,0.818470,0.039739
580,-0.564469,-2.170970,-1.012836,-2.488139,-0.863178,-2.726299,-0.480680,,,,,,-2.351244,,-2.444261,-1.748602,-2.169506,-1.408529


In [273]:
px.histogram(df2["Total: "])

In [277]:
df3 = pd.DataFrame()
for column in light_targets.iloc[:,1:].columns:
    resid = []
    light_mean = light_targets[column].mean()
    light_std = np.std(light_targets[column])
    no_light_mean = no_light_targets[column].mean()
    no_light_std = np.std(no_light_targets[column])
    for date in eat.Date:
        if  (date < pd.to_datetime('02-15-2018')):
            resid.append((light_mean - eat[eat["Date"]==date][column].values[0])/light_std)
        else:
            resid.append((no_light_mean - eat[eat["Date"]==date][column].values[0])/no_light_std)
    df3 = pd.concat([df3, pd.Series(resid,name = column)], axis = 1)
df3
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,0.335534,-8.398823,-2.379000,-4.280038,-4.654275,-4.296091,-3.861487,0.277639,0.456800,-3.131960,-6.551693,-2.532455,-9.841429,-4.597596,-7.566229,-6.579148,-7.141049,-8.825239
1,-2.348735,0.442043,0.775817,-0.942713,0.617641,0.586374,0.672898,0.277639,0.456800,0.463831,0.338589,0.346621,0.133738,0.792384,0.509189,-0.889074,0.235066,0.024061
2,,,,,,,,-5.830416,0.456800,-4.929856,-1.958172,0.346621,-0.863778,-4.597596,,,-0.435490,-0.907444
3,-13.085810,-1.326130,-0.275789,-1.777044,-1.491125,-1.854858,0.672898,,,,,,-3.523823,,-1.913436,-8.001666,-3.788270,-2.304702
4,-5.033004,-3.094303,0.775817,-3.445707,-1.491125,-4.296091,-4.995083,0.277639,-2.986115,-4.929856,-4.254933,0.346621,-7.846395,-5.495926,-5.143603,-5.156629,-5.129381,-7.427981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577,-0.068863,1.462354,1.062722,0.789542,-0.148701,1.590943,-0.081258,0.722770,1.078328,1.358855,-1.097023,0.212000,0.546755,0.836223,0.895890,0.835242,0.496259,0.459139
578,1.480560,-0.454324,1.273858,-1.677777,0.169944,-0.484200,-1.126006,1.231994,0.521192,0.835569,0.178266,-1.841746,-0.349973,0.320344,-0.113133,0.455587,0.198875,-1.035731
579,,,,,,,,-0.804903,1.078328,0.835569,-0.246830,-0.609499,1.244210,0.191375,,,1.091027,1.099797
580,0.189374,0.184569,0.640448,-1.184313,0.488588,-0.899229,0.266991,,,,,,0.198027,,0.070326,0.202483,0.124530,0.245586


In [278]:
px.histogram(df["Total: "])

Each of the total histogram of residuals look like they are approximately normal. When we look at the individual sharks they do look like they are left skewed but we are going to say that this assumption passes and we can move on.

## 2. Mean of Residuals needs to be 0

In [276]:
df.mean()

Ross       -1.140502e-16
Chandler    1.918391e-16
BT1         1.138105e-16
BT2        -6.641060e-17
BT3         1.120362e-16
BT4         9.479987e-17
BT5         2.846856e-16
GR1         4.705110e-16
GR2         1.107682e-16
GR3        -5.385979e-17
GR4         1.582766e-16
GR5        -2.342393e-16
Total:      2.025536e-17
All_GR     -6.605446e-18
All_BT      2.495455e-17
All_SS      4.552167e-17
male       -1.442145e-16
female      2.703069e-16
dtype: float64

In [275]:
df2.mean()

Ross       -0.079567
Chandler   -0.428786
BT1        -0.328122
BT2        -0.605772
BT3        -0.106215
BT4        -0.678622
BT5        -0.496606
GR1        -0.195404
GR2        -0.248215
GR3         0.068648
GR4        -0.233730
GR5        -0.591610
Total:     -0.622686
All_GR     -0.347461
All_BT     -0.677115
All_SS     -0.315772
male       -0.468966
female     -0.455556
dtype: float64

In [279]:
df3.mean()

Ross       -0.079840
Chandler    0.549911
BT1         0.715070
BT2         0.248774
BT3         0.824309
BT4         0.268131
BT5         0.079594
GR1         0.298590
GR2         0.195506
GR3         0.594756
GR4         0.127269
GR5        -0.295701
Total:      0.449838
All_GR      0.307144
All_BT      0.629189
All_SS      0.214629
male        0.314832
female      0.508242
dtype: float64

All of the residuals are pretty close to having residuals of 0 so we are good on this assumption

## 3. Do the samples have equal variances?

### Check Corona Vs. Non-Corona

In [105]:
df = pd.DataFrame()
for column in corona_eat.drop(columns = "Etc. Comments").iloc[:,1:]:
    numbers = []
    corona = corona_eat[column]
    opened = open_eat[column]
    if np.std(corona)>np.std(opened):
        num = np.std(corona)/np.std(opened)
    else:
        num = np.std(opened)/np.std(corona)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df >2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [107]:
df = pd.DataFrame()
for column in corona_drop.iloc[:,1:]:
    numbers = []
    corona = corona_drop[column]
    opened = open_drop[column]
    if np.std(corona)>np.std(opened):
        num = np.std(corona)/np.std(opened)
    else:
        num = np.std(opened)/np.std(corona)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df >2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [110]:

df = pd.DataFrame()
for column in corona_target.iloc[:,1:]:
    numbers = []
    corona = corona_target[column]
    opened = open_target[column]
    if np.std(corona)>np.std(opened):
        num = np.std(corona)/np.std(opened)
    else:
        num = np.std(opened)/np.std(corona)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df > 2
    


Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Check non-light training vs. all light-training

In [116]:
df = pd.DataFrame()
for column in light_eat.drop(columns = "Etc. Comments").iloc[:,1:]:
    numbers = []
    light = light_eat[column]
    no_light = no_light_eat[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [122]:
df = pd.DataFrame()
for column in light_drops.iloc[:,1:]:
    numbers = []
    light = light_drops[column]
    no_light = no_light_drops[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df >2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,True,False


Looks like BT1, GR1, GR3, All_GR, and male are all unequal variances. We will need to make sure that we are doing the Welch T-test for these tests.

In [125]:
df = pd.DataFrame()
for column in light_targets.iloc[:,1:]:
    numbers = []
    light = light_targets[column]
    no_light = no_light_targets[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True


Looks like all of them except BT2 have unequal variances. We will need to make sure we do the Welch T-test.

### Check non light training vs. first 40 days of light training variances

In [139]:
df = pd.DataFrame()
for column in no_light_eat.drop(columns = "Etc. Comments").iloc[:,1:]:
    numbers = []
    light = first_40_eat[column]
    no_light = no_light_eat[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [127]:
df = pd.DataFrame()
for column in no_light_drops.iloc[:,1:]:
    numbers = []
    light = first_40_drops[column]
    no_light = no_light_drops[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df >2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False


Looks like GR1, GR2, and All_GR have unequal variances in these tests

In [130]:
df = pd.DataFrame()
for column in no_light_targets.iloc[:,1:]:
    numbers = []
    light = first_40_targets[column]
    no_light = no_light_targets[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True


All but BT2 have unequal variances here

### Check non light training vs. last 40 days

In [133]:
df = pd.DataFrame()
for column in no_light_eat.drop(columns = "Etc. Comments").iloc[:,1:]:
    numbers = []
    light = last_40[column]
    no_light = no_light_eat[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


Looks like Chandler has unequal variance

In [134]:
df = pd.DataFrame()
for column in no_light_drops.iloc[:,1:]:
    numbers = []
    light = last_40_d[column]
    no_light = no_light_drops[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df >2
    


divide by zero encountered in double_scalars



Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,True,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True,False


Looks like Ross, BT1, GR3, GR4, All_GR, and male all have unequal variances

In [135]:
df = pd.DataFrame()
for column in no_light_targets.iloc[:,1:]:
    numbers = []
    light = last_40_t[column]
    no_light = no_light_targets[column]
    if np.std(light)>np.std(no_light):
        num = np.std(light)/np.std(no_light)
    else:
        num = np.std(no_light)/np.std(light)
    numbers.append(num)
    df = pd.concat([df,pd.Series(numbers,name = column)], axis = 1)
df>2
    

Unnamed: 0,Ross,Chandler,BT1,BT2,BT3,BT4,BT5,GR1,GR2,GR3,GR4,GR5,Total:,All_GR,All_BT,All_SS,male,female
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


There was some definite variances that were not equal as you looked at the two groups. So making note of all of those, I went to the t tests that we made and changed the test to include a parameter that said that the variances were not equal which made some that we found significant, not significant anymore.

## 4. Independence Assumption

In [287]:
px.line(df["Ross"].dropna())

In [288]:
px.line(df["Total: "])

In [289]:
px.line(df2["Chandler"].dropna())

In [291]:
px.line(df2["Total: "])

These graphs are looking independent and random so we can say that this assumption also checks out