T-test compares two averages (means) and tells you if they are different from each other. The t test also tells you how significant the differences are

![%D0%91%D0%B5%D0%B7%D1%8B%D0%BC%D1%8F%D0%BD%D0%BD%D1%8B%D0%B9.png](attachment:%D0%91%D0%B5%D0%B7%D1%8B%D0%BC%D1%8F%D0%BD%D0%BD%D1%8B%D0%B9.png)

![1.png](attachment:1.png)

![2.png](attachment:2.png)

In [1]:
import numpy as np
from scipy import stats


In [2]:
## Define 2 random distributions
#Sample Size
N = 10
#Gaussian distributed data with mean = 2 and var = 1
a = np.random.randn(N) + 2
#Gaussian distributed data with with mean = 0 and var = 1
b = np.random.randn(N)


In [3]:
## Calculate the Standard Deviation
#Calculate the variance to get the standard deviation

#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

In [4]:
#std deviation
s = np.sqrt((var_a + var_b)/2)
s

1.0070952081319295

In [8]:
## Calculate the t-statistics
t = (a.mean() - b.mean())/(s*np.sqrt(2/N))

In [9]:
## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

In [10]:
#p-value after comparison with the t 
p = 1 - stats.t.cdf(t,df=df)

In [12]:
print("t = " + str(t))
print("p = " + str(2*p))
### You can see that after comparing the t statistic with the critical t value (computed internally) we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean of the two distributions are different and statistically significant.


t = 5.317848935045986
p = 4.6906956899572805e-05


In [13]:
## Cross Checking with the internal scipy function
t2, p2 = stats.ttest_ind(a,b)
print("t = " + str(t2))
print("p = " + str(p2))

t = 5.317848935045986
p = 4.690695689956707e-05


![image.png](attachment:image.png)

### PRACTICE

In [1]:
## Import the packages
import numpy as np
from scipy import stats


## Define 2 random distributions
#Sample Size
N = 10
#Gaussian distributed data with mean = 2 and var = 1
a = np.random.randn(N) + 2
#Gaussian distributed data with with mean = 0 and var = 1
b = np.random.randn(N)


## Calculate the Standard Deviation
#Calculate the variance to get the standard deviation

#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

#std deviation
s = np.sqrt((var_a + var_b)/2)
s



## Calculate the t-statistics
t = (a.mean() - b.mean())/(s*np.sqrt(2/N))



## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

#p-value after comparison with the t 
p = 1 - stats.t.cdf(t,df=df)


print("t = " + str(t))
print("p = " + str(2*p))

### You can see that after comparing the t statistic with 
### the critical t value (computed internally) we get a good p value of 0.0005 
### and thus we reject the null hypothesis and thus it proves that the mean of 
### the two distributions are different and statistically significant.


## Cross Checking with the internal scipy function
t2, p2 = stats.ttest_ind(a,b)
print("t = " + str(t2))
print("p = " + str(p2))

t = 3.5481013913200146
p = 0.0022976151186209215
t = 3.5481013913200146
p = 0.002297615118620956


In [2]:
Nx = 70
Ny = 40
x = np.random.randn(Nx) 
y = np.random.randn(Ny)+2
print("Nx: {}, Ny: {}".format(len(x), len(y)))
t3, p3 = stats.ttest_ind(y,x)
print("t = " + str(t3))
print("p = " + str(p3))

Nx: 70, Ny: 40
t = 11.155390312968938
p = 1.0973714870043545e-19


In [None]:
#find mean of X, Y
print(np.mean())

In [None]:
from scipy import stats
#find mode (Мода) of X and Y

In [None]:
import pandas as pd

In [3]:
# CREATE FOLLOWING DATASET
import pandas as pd

#Create a sample DataFrame
data = {'id': [1,2,3,4,5,6,7],
        'height': [188,175,173,182,190,177,169],
        'sex': ['m','m','f','f','m','m','f']
        }
df = pd.DataFrame(data)

![image.png](attachment:image.png)

In [5]:
# Divide dataset on Males and Females
# Find mean, median and mode of their height

mean_height_by_sex = df.groupby('sex')['height'].mean()
median_height = df.groupby('sex')['height'].median()
mode_height = df.groupby ('sex' )['height'].apply(lambda x: x.mode().iloc[0])

print(f"Mean of height: {mean_height_by_sex}")
print(f"Median of height: {median_height}")
print(f"Mode of height: {mode_height}")

Mean of height: sex
f    174.666667
m    182.500000
Name: height, dtype: float64
Median of height: sex
f    173.0
m    182.5
Name: height, dtype: float64
Mode of height: sex
f    169
m    175
Name: height, dtype: int64


In [7]:
#Calculate T and P
male = df[df["sex"].str.startswith("m")]
female = df[df["sex"].str.startswith("m")]
t4, p4 = stats.ttest_ind(male['height'],female['height'])
print("t4 = " + str(t4))
print("p4 = " + str(p4))

t4 = 0.0
p4 = 1.0


In [None]:
#Calculate T-crit
dffm = female.shape[0] + male.shape[0] -2
t_crit = stats.t.cdf(t4,df=dffm)