In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_1samp


In [2]:
df=sns.load_dataset('titanic')

In [3]:
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df1=df[["sex","age"]]

In [5]:
df1

Unnamed: 0,sex,age
0,male,22.0
1,female,38.0
2,female,26.0
3,female,35.0
4,male,35.0
...,...,...
886,male,27.0
887,female,19.0
888,female,
889,male,26.0


In [6]:
df1.describe()

Unnamed: 0,age
count,714.0
mean,29.699118
std,14.526497
min,0.42
25%,20.125
50%,28.0
75%,38.0
max,80.0


**One sample t test**

A one-sample t-test is a statistical test used to determine whether the mean of a single sample is significantly different from a known or hypothesized population mean. It is a parametric test commonly employed when the data are approximately normally distributed.
Hypotheses in a One-Sample T-Test:
Null Hypothesis (H0​): The population mean (μ) is equal to a specified value.
Alternative Hypothesis (H1​): The population mean (μ) is not equal to the specified value.


In [7]:
ttest_1samp(df1["age"],45)

TtestResult(statistic=nan, pvalue=nan, df=nan)

In [8]:
from scipy.stats import ttest_1samp
ttest_1samp(df1["age"],45)

state, p_value=ttest_1samp(df1["age"],45)
print("state=",state,"p_value=",p_value)
if p_value> 0.05:
    print("There is no significance difference")
else:
    print("There is significance difference")

state= nan p_value= nan
There is significance difference


In [9]:
df2=df[["fare"]]

In [10]:
df2.describe()

Unnamed: 0,fare
count,891.0
mean,32.204208
std,49.693429
min,0.0
25%,7.9104
50%,14.4542
75%,31.0
max,512.3292


In [11]:
from scipy.stats import ttest_1samp
ttest_1samp(df2["fare"],32)

state, p_value=ttest_1samp(df2["fare"],32)
print("state=",state,"p_value=",p_value)
if p_value> 0.05:
    print("There is no significance difference")
else:
    print("There is significance difference")

state= 0.12266271558913089 p_value= 0.9024018818220448
There is no significance difference


# Two sample t test


A two-sample t-test is a statistical test used to compare the means of two independent samples to determine if there is a significant difference between them. It is a parametric test that assumes the data in each sample are approximately normally distributed.


In [12]:
male=df1[df1["sex"]=="male"]
male.head()

Unnamed: 0,sex,age
0,male,22.0
4,male,35.0
5,male,
6,male,54.0
7,male,2.0


In [13]:
female=df1[df1["sex"]=="female"]
female.head()

Unnamed: 0,sex,age
1,female,38.0
2,female,26.0
3,female,35.0
8,female,27.0
9,female,14.0


**Independent Samples T-Test**

The independent samples t-test is used to compare the means of two independent groups to determine if there is a significant difference between them. 
It assumes that the data in each group are independent and approximately normally distributed.
Hypotheses:

Null Hypothesis (H0​): The means of the two groups are equal.

Alternative Hypothesis (H1​): The means of the two groups are not equal


In [14]:
from scipy.stats import ttest_ind
state, p_value=ttest_ind(male["age"], female["age"])
print("state=",state,"p_value=",p_value)
if p_value> 0.05:
    print("There is no significance difference")
else:
    print("There is significance difference")


state= nan p_value= nan
There is significance difference


In [15]:
male.describe()

Unnamed: 0,age
count,453.0
mean,30.726645
std,14.678201
min,0.42
25%,21.0
50%,29.0
75%,39.0
max,80.0


In [16]:
female.describe()

Unnamed: 0,age
count,261.0
mean,27.915709
std,14.110146
min,0.75
25%,18.0
50%,27.0
75%,37.0
max,63.0


# Paired t test
The paired samples t-test is used when the observations are paired or matched, such as when the measurements are taken on the same subjects at different points in time or under different conditions.
Null Hypothesis (H0​): The mean difference between paired observations is zero.
Alternative Hypothesis (H1​): The mean difference between paired observations is not equal to zero.


In [17]:
df1=df[["sex","class", 'age']]

In [18]:
df1.head()

Unnamed: 0,sex,class,age
0,male,Third,22.0
1,female,First,38.0
2,female,Third,26.0
3,female,First,35.0
4,male,Third,35.0


In [19]:
male.head()

Unnamed: 0,sex,age
0,male,22.0
4,male,35.0
5,male,
6,male,54.0
7,male,2.0


In [20]:
male=df[df["sex"]=="male"]

In [21]:
male.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [22]:
female=df[df1["sex"]=="female"]

In [23]:
male_first=male[male['class']=="First"]
male_second=male[male['class']=="Second"]
male_third=male[male['class']=="Third"]

In [24]:
first=male_first.sample(n=100)
second=male_second.sample(n=100)


In [25]:
first.shape

(100, 15)

In [26]:
from scipy.stats import ttest_rel # rel for related data
ttest_rel(first["age"], second['age'])

TtestResult(statistic=nan, pvalue=nan, df=nan)