# ***Essential Statistical Tests Every Data Scientist Should Know!***

# 1. Parametric Statistical Tests

## 1.1. Regression Tests

### 1.1.1) Linear Regression

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import fetch_california_housing

california = fetch_california_housing()
X = pd.DataFrame(california.data, columns=california.feature_names)
y = california.target
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     3970.
Date:                Fri, 14 Jun 2024   Prob (F-statistic):               0.00
Time:                        18:41:17   Log-Likelihood:                -22624.
No. Observations:               20640   AIC:                         4.527e+04
Df Residuals:                   20631   BIC:                         4.534e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9419      0.659    -56.067      0.0

## 1.2. Comparison Tests

### 1.2.1)T-test

### 1.2.1.1) Independent T-test

In [7]:
from scipy.stats import ttest_ind

# Example data
group1 = np.random.normal(10, 2, 30)
group2 = np.random.normal(12, 2, 30)

# Perform the independent t-test
t_stat, p_value = ttest_ind(group1, group2)
print(f"t-statistic: {t_stat}, p-value: {p_value}")

t-statistic: -4.5676405093057335, p-value: 2.6234849583706998e-05


### 1.2.1.2) Paired T-test

In [8]:
from scipy.stats import ttest_rel

# Example data
before = np.random.normal(10, 2, 30)
after = before + np.random.normal(1, 1, 30)

# Perform the paired t-test
t_stat, p_value = ttest_rel(before, after)
print(f"t-statistic: {t_stat}, p-value: {p_value}")


t-statistic: -6.364795749240651, p-value: 5.884756787533127e-07


### 1.2.1.3) One Sample T-test

In [9]:
from scipy.stats import ttest_1samp

# Example data
data = np.random.normal(10, 2, 30)

# Perform the one-sample t-test
t_stat, p_value = ttest_1samp(data, 10)
print(f"t-statistic: {t_stat}, p-value: {p_value}")


t-statistic: 0.5813442795359124, p-value: 0.5655000191402095


### 1.2.2)  ANOVA (Analysis of Variance)

In [10]:
from scipy.stats import f_oneway

# Example data
group1 = np.random.normal(10, 2, 30)
group2 = np.random.normal(12, 2, 30)
group3 = np.random.normal(11, 2, 30)

# Perform the ANOVA test
f_stat, p_value = f_oneway(group1, group2, group3)
print(f"F-statistic: {f_stat}, p-value: {p_value}")


F-statistic: 8.835127567704138, p-value: 0.0003211993727262113


### 1.2.3) Z-test

In [11]:
import statsmodels.api as sm

# Example data
data = np.random.normal(10, 2, 100)

# Perform the one-sample z-test
z_stat, p_value = sm.stats.ztest(data, value=10)
print(f"z-statistic: {z_stat}, p-value: {p_value}")


z-statistic: 0.5641495350982042, p-value: 0.5726523688339221


## 1.3 Correlation Tests

### 1.3.1)Pearson Correlation Coefficient

In [12]:
from scipy.stats import pearsonr

# Example data
x = np.random.normal(10, 2, 30)
y = x + np.random.normal(1, 1, 30)

# Calculate the Pearson correlation coefficient
corr, p_value = pearsonr(x, y)
print(f"Pearson correlation coefficient: {corr}, p-value: {p_value}")


Pearson correlation coefficient: 0.8996467473399045, p-value: 1.3801382955121948e-11


# 2. Non-parametric Statistical Tests

### 2.1 Chi-square Test

In [13]:
import pandas as pd
from scipy.stats import chi2_contingency

# Example data
data = pd.DataFrame({
    'A': [10, 20, 30],
    'B': [6, 9, 17]
})

# Create a contingency table
contingency_table = pd.crosstab(index=data['A'], columns=data['B'])

# Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2}, p-value: {p}")


Chi-square statistic: 6.000000000000001, p-value: 0.19914827347145564
