[Reference](https://python.plainenglish.io/how-to-perform-the-5-most-popular-statistical-analysis-with-python-ce9012c3be30)

In [1]:
import seaborn as sns

# Fetch seaborn's "penguins" example dataset and preview its first five rows.
penguins = sns.load_dataset(name="penguins")
penguins.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [2]:
import pandas as pd

# Keep only complete observations: remove every row that has at least one
# missing value (these are pandas' defaults, spelled out for the reader).
penguins = penguins.dropna(axis="index", how="any")
penguins.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


# Correlation

In [3]:
from scipy.stats import pearsonr

# Pearson correlation (coefficient, p-value) between flipper length and body mass.
flipper_length = penguins["flipper_length_mm"]
body_mass = penguins["body_mass_g"]
pearsonr(flipper_length, body_mass)

(0.8729788985653614, 3.132836250971883e-105)

# Ordinary Least Squares (OLS) Regression

In [4]:
from statsmodels.formula.api import ols

# Simple linear regression: body mass explained by flipper length alone.
formula = "body_mass_g ~ flipper_length_mm"
simple_model = ols(formula=formula, data=penguins)
lm = simple_model.fit()
# print() keeps the plain-text summary layout.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:            body_mass_g   R-squared:                       0.762
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     1060.
Date:                Thu, 12 Jan 2023   Prob (F-statistic):          3.13e-105
Time:                        11:18:22   Log-Likelihood:                -2461.1
No. Observations:                 333   AIC:                             4926.
Df Residuals:                     331   BIC:                             4934.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept         -5872.0927    310.28

In [5]:
# Multiple linear regression: add both bill measurements as extra predictors.
formula = "body_mass_g ~ flipper_length_mm + bill_length_mm + bill_depth_mm"
multiple_model = ols(formula=formula, data=penguins)
lm = multiple_model.fit()
# print() keeps the plain-text summary layout.
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:            body_mass_g   R-squared:                       0.764
Model:                            OLS   Adj. R-squared:                  0.762
Method:                 Least Squares   F-statistic:                     354.9
Date:                Thu, 12 Jan 2023   Prob (F-statistic):          9.26e-103
Time:                        11:18:36   Log-Likelihood:                -2459.8
No. Observations:                 333   AIC:                             4928.
Df Residuals:                     329   BIC:                             4943.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept         -6445.4760    566.13

# Independent Samples t-test

In [6]:
from scipy.stats import ttest_ind

# Build the two independent samples: body mass of male and of female penguins.
sex_column = penguins["sex"]
Male_Penguins = penguins.loc[sex_column.eq("Male"), "body_mass_g"]
Female_Penguins = penguins.loc[sex_column.eq("Female"), "body_mass_g"]

# Preview both samples side by side as a tuple.
(Male_Penguins.head(), Female_Penguins.head())

(0     3750.0
 5     3650.0
 7     4675.0
 13    3800.0
 14    4400.0
 Name: body_mass_g, dtype: float64, 1     3800.0
 2     3250.0
 4     3450.0
 6     3625.0
 12    3200.0
 Name: body_mass_g, dtype: float64)

In [7]:
# Two-sided independent-samples t-test on body mass, male vs. female.
# equal_var=True is SciPy's default (pooled-variance Student's t-test);
# it is written out so the assumption is visible to the reader.
ttest_ind(Male_Penguins, Female_Penguins, equal_var=True)

Ttest_indResult(statistic=8.541720337994516, pvalue=4.897246751596224e-16)

# One-Way Analysis of Variance (ANOVA)

In [8]:
from scipy.stats import f_oneway

# One body-mass sample per species, used as the groups of the one-way ANOVA.
species_of = penguins["species"]
mass_g = penguins["body_mass_g"]

Adelie = mass_g[species_of == "Adelie"]
Chinstrap = mass_g[species_of == "Chinstrap"]
Gentoo = mass_g[species_of == "Gentoo"]

In [9]:
# One-way ANOVA: compare mean body mass across the three species samples.
species_samples = (Adelie, Chinstrap, Gentoo)
f_oneway(*species_samples)

F_onewayResult(statistic=341.8948949481461, pvalue=3.74450512630046e-81)

# Two-Way ANOVA

In [10]:
from statsmodels.stats.anova import anova_lm

# Two-way model: main effects of species and sex plus their interaction term.
formula = "body_mass_g ~ species + sex + species:sex"
lm = ols(formula=formula, data=penguins).fit()

In [11]:
# Type II ANOVA table for the fitted two-way model
# (statsmodels accepts "II" as the equivalent of typ=2).
anova_table = anova_lm(lm, typ="II")
print(anova_table)

                   sum_sq     df           F         PR(>F)
species      1.434016e+08    2.0  749.015666  8.144406e-123
sex          3.709026e+07    1.0  387.459976   1.902273e-57
species:sex  1.676557e+06    2.0    8.756997   1.973489e-04
Residual     3.130263e+07  327.0         NaN            NaN


# Chi-Square Analysis

In [15]:
from scipy.stats import chi2_contingency
import numpy as np

# Contingency table of counts: species as rows, sex as columns.
cross_tab = pd.crosstab(penguins["species"], penguins["sex"])
# The same counts as a NumPy array, for SciPy's chi-square routine.
cross_tab_matrix = cross_tab.to_numpy()
cross_tab

sex,Female,Male
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,73,73
Chinstrap,34,34
Gentoo,58,61


In [16]:
# Chi-square test of independence between species and sex.
# The result unpacks as (statistic, p-value, degrees of freedom, expected counts);
# only the p-value is displayed.
chi2_stat, p_value, dof, expected = chi2_contingency(cross_tab_matrix)
p_value

0.9759893689765846