# Statistical Analysis of the Data Table
This notebook takes the created data table containing BBB permeability and computes statistics such as median and p-value.

## Import
Import the python libraries and python file needed to compute statistics relating to the data table.

In [17]:
from data_analysis import *
import statistics
import numpy as np, matplotlib.pyplot as plt 
from scipy.stats import ttest_ind
from scipy import stats

## Determine the P-Value for TPSA between BBB+ and BBB-
Use ttest_ind to perform an independent two sample t-test to determine whether the difference between the distribution of TPSA for BBB+ and the distribution of TPSA for BBB- are statistically different/significant from each other or not

In [18]:
'''Find P-value between TPSA of BBB+ and BBB- molecules'''
tpsa_positive_array = np.array(tpsa_positive)
tpsa_negative_array = np.array(tpsa_negative)
t_stat, p_value = ttest_ind(tpsa_positive_array, tpsa_negative_array)
print(f'The P-value between TPSA of BBB+ and BBB- molecules is {p_value:.10f}')

The P-value between TPSA of BBB+ and BBB- molecules is 0.0000000000


## Determine the P-Value for logP between BBB+ and BBB-
Use ttest_ind to perform an independent two sample t-test to determine whether the difference between the distribution of logP for BBB+ and the distribution of logP for BBB- are statistically different/significant from each other or not

In [19]:
'''Find P-value between logP of BBB+ and BBB- molecules'''
logP_positive_array = np.array(logP_positive)
logP_negative_array = np.array(logP_negative)
t_stat, p_value = ttest_ind(logP_positive_array, logP_negative_array)
print(f'The P-value between logP of BBB+ and BBB- molecules is {p_value:.10f}')

The P-value between logP of BBB+ and BBB- molecules is 0.0000000000


## Determine the Median of TPSA for BBB+ and BBB- 
Determine the median of the distribution of TPSA for the BBB+ group and the BBB- group

In [20]:
'''Find the Median of TPSA for BBB+ and BBB- molecules'''
median_tpsa_positive = statistics.median(tpsa_positive_array)
median_tpsa_negative = statistics.median(tpsa_negative_array)
print(f'Median TPSA BBB+: {median_tpsa_positive}')
print(f'Median TPSA BBB-: {median_tpsa_negative}')

Median TPSA BBB+: 46.25
Median TPSA BBB-: 107.61


## Determine the Median of logP for BBB+ and BBB- 
Determine the median of the distribution of logP for the BBB+ group and the BBB- group

In [21]:
'''Find the Median of logP for BBB+ and BBB- molecules'''
median_logP_positive = statistics.median(logP_positive_array)
median_logP_negative = statistics.median(logP_negative_array)
print(f'Median logP BBB+: {median_logP_positive.round(3)}')
print(f'Median logP BBB-: {median_logP_negative.round(3)}') 

Median logP BBB+: 2.924
Median logP BBB-: 1.41


## Determine the Interquartile Range of TPSA for BBB+ and BBB- 
Determine the interquartile range (Q1 and Q3) of the distribution of TPSA for the BBB+ group and the BBB- group

In [22]:
'''Find the Interquartile Range of TPSA for BBB+ and BBB- molecules'''
Q1_tpsa_positive = np.percentile(tpsa_positive_array, 25)
Q3_tpsa_positive = np.percentile(tpsa_positive_array, 75)
print(f"TPSA BBB+ Q1: {Q1_tpsa_positive.round(3)}")
print(f"TPSA BBB+ Q3: {Q3_tpsa_positive.round(3)}")

Q1_tpsa_negative = np.percentile(tpsa_negative_array, 25)
Q3_tpsa_negative = np.percentile(tpsa_negative_array, 75)
print(f"TPSA BBB- Q1: {Q1_tpsa_negative.round(3)}")
print(f"TPSA BBB- Q3: {Q3_tpsa_negative.round(3)}")

TPSA BBB+ Q1: 29.02
TPSA BBB+ Q3: 72.83
TPSA BBB- Q1: 67.45
TPSA BBB- Q3: 173.5


## Determine the Interquartile Range of logP for BBB+ and BBB- 
Determine the interquartile range (Q1 and Q3) of the distribution of logP for the BBB+ group and the BBB- group

In [25]:
'''Find the Interquartile Range of logP for BBB+ and BBB- molecules'''
Q1_logP_positive = np.percentile(logP_positive_array, 25)
Q3_logP_positive = np.percentile(logP_positive_array, 75)
print(f"logP BBB+ Q1: {Q1_logP_positive.round(3)}")
print(f"logP BBB+ Q3: {Q3_logP_positive.round(3)}")

Q1_logP_negative = np.percentile(logP_negative_array, 25)
Q3_logP_negative = np.percentile(logP_negative_array, 75)
print(f"logP BBB- Q1: {Q1_logP_negative.round(3)}")
print(f"logP BBB- Q3: {Q3_logP_negative.round(3)}")

logP BBB+ Q1: 1.831
logP BBB+ Q3: 3.949
logP BBB- Q1: 0.122
logP BBB- Q3: 2.996


## Determine the Confidence Interval for the difference between TPSA of BBB+ and BBB-
Using the confidence interval formula for a mean using t-distribution, calculate the lower bound and upper bound of the TPSA range with 95% confidence.

In [26]:
'''Find Confidence Interval of TPSA difference of BBB+ and BBB- molecules'''
n1 = len(tpsa_positive) # Size of the BBB+ dataset
n2 = len(tpsa_negative) # Size of the BBB- dataset
mean1 = np.mean(tpsa_positive_array)
mean2 = np.mean(tpsa_negative_array)
sd1 = np.std(tpsa_positive_array, ddof = 1) # Sample Standard Deviation of TPSA of BBB+ dataset
sd2 = np.std(tpsa_negative_array, ddof = 1) # Sample Standard Deviation of TPSA of BBB- dataset
pooled_standard_deviation = np.sqrt(((n1-1)*(sd1**2)+(n2-1)*(sd2**2))/(n1+n2-2))
standard_error = pooled_standard_deviation * np.sqrt((1/n1)+(1/n2))
s1 = np.var(tpsa_positive_array, ddof = 1) # Sample variance of TPSA of BBB+ dataset
s2 = np.var(tpsa_negative_array, ddof = 1) # Sample variance of TPSA of BBB- dataset
numerator = ((s1/n1)+ (s2/n2))**2
denominator = (((s1/n1)**2)/(n1-1))+ (((s2/n2)**2)/(n2-1))
degrees_of_freedom = numerator/denominator # Calculate degrees of freedom using the Welch-Satterthwaite formula for unequal variance
t_critical_value = stats.t.ppf(0.975, degrees_of_freedom) # Critical Value for 95% confidence 
critical_value_right = round(((mean1-mean2) + t_critical_value * np.sqrt((s1/n1)+(s2/n2))), 5) # + Critical Value
critical_value_left = round(((mean1-mean2) - t_critical_value * np.sqrt((s1/n1)+(s2/n2))), 5) # - Critical Value
print(f'We can state that the TPSA difference of BBB+ and BBB- lies between {critical_value_left.round(3)} and {critical_value_right.round(3)} with 95% confidence.')

We can state that the TPSA difference of BBB+ and BBB- lies between -75.308 and -70.127 with 95% confidence.


## Determine the Confidence Interval for the difference between logP of BBB+ and BBB-
Using the confidence interval formula for a mean using t-distribution, calculate the lower bound and upper bound of the logP range with 95% confidence.

In [27]:
'''Find Confidence Interval of logP difference of BBB+ and BBB- molecules'''
n1 = len(logP_positive) # Size of the BBB+ dataset
n2 = len(logP_negative) # Size of the BBB- dataset
mean1 = np.mean(logP_positive_array)
mean2 = np.mean(logP_negative_array)
sd1 = np.std(logP_positive_array, ddof = 1) # Sample Standard Deviation of TPSA of BBB+ dataset
sd2 = np.std(logP_negative_array, ddof = 1) # Sample Standard Deviation of TPSA of BBB- dataset
pooled_standard_deviation = np.sqrt(((n1-1)*(sd1**2)+(n2-1)*(sd2**2))/(n1+n2-2))
standard_error = pooled_standard_deviation * np.sqrt((1/n1)+(1/n2))
s1 = np.var(logP_positive_array, ddof = 1) # Sample variance of TPSA of BBB+ dataset
s2 = np.var(logP_negative_array, ddof = 1) # Sample variance of TPSA of BBB- dataset
numerator = ((s1/n1)+ (s2/n2))**2
denominator = (((s1/n1)**2)/(n1-1))+ (((s2/n2)**2)/(n2-1))
degrees_of_freedom = numerator/denominator # Calculate degrees of freedom using the Welch-Satterthwaite formula for unequal variance
t_critical_value = stats.t.ppf(0.975, degrees_of_freedom) # Critical Value for 95% confidence 
critical_value_right = round(((mean1-mean2) + t_critical_value * np.sqrt((s1/n1)+(s2/n2))), 5) # + Critical Value
critical_value_left = round(((mean1-mean2) - t_critical_value * np.sqrt((s1/n1)+(s2/n2))), 5) # - Critical Value
print(f'We can state that the logP difference of BBB+ and BBB- lies between {critical_value_left.round(3)} and {critical_value_right.round(3)} with 95% confidence.')

We can state that the logP difference of BBB+ and BBB- lies between 1.353 and 1.531 with 95% confidence.


## Determine the Effect Size for the difference between TPSA of BBB+ and BBB- 
Quantify the magnitude of the difference between TPSA for BBB+ and BBB- by using the formula for Cohen's d 

In [28]:
'''Find Effect Size of TPSA difference of BBB+ and BBB- molecules'''
effective_size = (mean1 - mean2)/pooled_standard_deviation # Cohen's d for effective size
print(f'The effective size for the difference between TPSA of BBB+ and BBB- is {effective_size.round(3)}.')

The effective size for the difference between TPSA of BBB+ and BBB- is 0.762.


## Determine the Effect Size for the difference between logP of BBB+ and BBB- 
Quantify the magnitude of the difference between logP for BBB+ and BBB- by using the formula for Cohen's d 

In [29]:
'''Find Effect Size of logP difference of BBB+ and BBB- molecules'''
effective_size = (mean1 - mean2)/pooled_standard_deviation # Cohen's d for effective size
print(f'The effective size for the difference between logP of BBB+ and BBB- is {effective_size.round(3)}.')

The effective size for the difference between logP of BBB+ and BBB- is 0.762.
