## The OS numbers of Cpfl1 are checked for significant differences

In [11]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.multicomp as mc

from scipy.stats import ttest_ind

In [12]:
# Folder holding the Cpfl measurement tables, relative to this notebook.
path = "../../measurements/cpfl/"

# Load the per-object measurement table (percentile-filtered, judging by the file name).
measurements = pd.read_csv(
    path + "05-measurements-filtered-5-and-95-percentiles.csv"
)
measurements

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
0,1,8,0,cpfl,250.0,189.793103,184.144531,157.0,20.305487,5504.0,...,14.744113,0.615610,1.480172,2.957153,0.976660,0.904212,3.881768,1.516422,1.078345,0.977250
1,2,8,0,cpfl,334.0,207.273684,202.871094,133.0,42.706679,19691.0,...,32.521386,0.914283,2.624064,4.973873,2.127172,1.027014,7.095704,4.852550,3.572016,3.201335
2,4,8,0,cpfl,262.0,214.904762,215.355469,175.0,27.285353,4513.0,...,11.889571,0.552814,1.370373,2.957153,0.976660,0.904212,5.174290,1.516422,0.943551,0.707664
3,5,8,0,cpfl,311.0,195.171429,190.386719,118.0,36.997206,6831.0,...,16.713421,0.655434,1.678357,3.390218,1.255098,0.902928,7.379101,2.426275,1.381629,1.179439
4,6,8,0,cpfl,355.0,219.354167,215.355469,147.0,47.335750,21058.0,...,32.749208,0.917480,2.068209,5.481241,1.910113,1.514661,11.022045,6.065688,3.774206,3.235034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,176,70,39,wt,437.0,228.476298,219.855469,118.0,59.914239,101215.0,...,90.772814,1.527475,5.732681,13.307188,6.005873,1.731845,45.719796,55.197761,20.994020,14.928332
5227,177,70,39,wt,1542.0,485.858939,412.605469,137.0,265.029227,1043625.0,...,260.041601,2.585340,9.158631,32.454379,8.011704,7.033195,160.768595,300.992921,132.366793,72.383878
5228,178,70,39,wt,416.0,224.548673,213.832031,127.0,62.839505,25374.0,...,36.509477,0.968722,2.261000,5.734924,1.901773,1.715924,15.687029,8.491963,4.414473,3.807904
5229,180,70,39,wt,481.0,251.469466,237.925781,145.0,64.019970,65885.0,...,63.956915,1.282154,4.579315,13.771030,4.629315,2.203200,35.317111,35.383180,15.063125,8.828946


### Preparing the dataset

In [16]:
# Count the labeled objects in every image. groupby(...).size() names the count
# column "size"; it is renamed to "label" so it holds the number of labels per image.
number_of_labels = (
    measurements.groupby("image_id", as_index=False)["label"]
    .size()
    .rename(columns={"size": "label"})
)

# Attach age and genotype to each image. The columns are selected with a list
# ([[...]]): indexing a groupby with a bare tuple of keys is deprecated and
# fails on current pandas versions.
# NOTE(review): assumes age and genotype are constant within an image, so max()
# just picks that single value - confirm.
age = measurements.groupby("image_id", as_index=False)[["age", "genotype"]].max()

# Merge explicitly on the image identifier instead of relying on the implicit
# "all shared columns" default.
number_of_labels = pd.merge(number_of_labels, age, on="image_id")
number_of_labels

  age = measurements.groupby("image_id", as_index=False)["age", "genotype"].max()


Unnamed: 0,image_id,label,age,genotype
0,0,99,8,cpfl
1,1,93,8,cpfl
2,2,62,8,cpfl
3,3,290,14,cpfl
4,4,216,14,cpfl
5,5,135,14,cpfl
6,6,145,20,cpfl
7,7,164,20,cpfl
8,8,155,20,cpfl
9,9,33,245,cpfl


### Test for significance

- different genotypes are compared within their respective age
- Welch's t-test is used (unequal variances)
- based on the plot:
    - Alternative hypothesis: OS number (Cpfl1) < OS number (WT)
    - Null hypothesis: OS number (Cpfl1) >= OS number (WT)

In [1]:
# Postnatal ages at which the two genotypes are compared.
ages = [8, 14, 20, 30, 70, 245]

test_results = []

for postnatal_age in ages:

    # Use the per-image counts in `number_of_labels` (one value per image).
    # The raw `measurements["label"]` column holds the label IDs of individual
    # objects, so testing on it would compare IDs instead of OS numbers.
    age_filtered = number_of_labels[number_of_labels["age"] == postnatal_age]

    number_of_labels_cpfl = age_filtered.loc[age_filtered["genotype"] == "cpfl", "label"]
    number_of_labels_wt = age_filtered.loc[age_filtered["genotype"] == "wt", "label"]

    # Welch's t-test (equal_var=False), one-sided: the alternative is
    # mean(WT) > mean(Cpfl1), i.e. OS number (Cpfl1) < OS number (WT).
    t_statistic, p_value = ttest_ind(number_of_labels_wt,
                                     number_of_labels_cpfl,
                                     equal_var=False,
                                     alternative="greater")

    test_results.append({"age": postnatal_age, "p_value": p_value})

# One row per age with the one-sided p-value.
test_results = pd.DataFrame(test_results)
test_results

NameError: name 'number_of_labels' is not defined

### ANOVA test for Cpfl1 OS number alone

In [19]:
#import custom functions
import sys
sys.path.append("../../../../../../01_python_definitions")
from fluorescent_microscopy_analysis import tukey_test

In [21]:
# Keep only the images of the Cpfl genotype for the within-genotype comparison.
number_of_labels_cpfl = number_of_labels.loc[
    number_of_labels["genotype"] == "cpfl"
]
number_of_labels_cpfl

Unnamed: 0,image_id,label,age,genotype
0,0,99,8,cpfl
1,1,93,8,cpfl
2,2,62,8,cpfl
3,3,290,14,cpfl
4,4,216,14,cpfl
5,5,135,14,cpfl
6,6,145,20,cpfl
7,7,164,20,cpfl
8,8,155,20,cpfl
9,9,33,245,cpfl


In [22]:
# Run the project's tukey_test helper on the per-image label counts of Cpfl.
# NOTE(review): presumably returns the ANOVA F statistic, its p-value and the
# Tukey HSD pairwise table (grouped by age) - confirm against the helper's source.
f, p, tukey = tukey_test(
    number_of_labels_cpfl,
    feature="label",
)
tukey

group1,group2,meandiff,p-adj,lower,upper,reject
8,14,129.0,0.0063,33.7069,224.2931,True
8,20,70.0,0.2132,-25.2931,165.2931,False
8,30,37.3333,0.7806,-57.9598,132.6264,False
8,70,-33.0,0.8526,-128.2931,62.2931,False
8,245,-67.6667,0.1883,-156.8052,21.4719,False
14,20,-59.0,0.3662,-154.2931,36.2931,False
14,30,-91.6667,0.0622,-186.9598,3.6264,False
14,70,-162.0,0.0009,-257.2931,-66.7069,True
14,245,-196.6667,0.0001,-285.8052,-107.5281,True
20,30,-32.6667,0.8576,-127.9598,62.6264,False
