In [46]:
# 1. Problem – Defective Items in a Factory
from numpy.random import randint as ri
import pandas as pd

#Generate random data for 1000 days (defective items between 0 and 20)
defects = ri(0, 21, 1000)
defects = pd.Series(defects)

In [47]:
defects.head()

0    12
1    19
2     0
3     7
4    13
dtype: int32

In [48]:
defects_mean = defects.mean()
defects_mean

9.899

In [49]:
defects_std = defects.std()
defects_std

6.159767994148945

In [50]:
import scipy.stats as st
exact_defect = 5
probability = st.norm.cdf(exact_defect, loc=defects_mean, scale=defects_std)
probability

0.21321306250917843

In [51]:
# 2 Problem – Testing the Claim About Delivery Time
import numpy as np

# Known values
population_mean = 30        # Claimed average delivery time
sample_mean = 31.2          # Observed sample mean
std_dev = 4                 # Known population standard deviation
n = 40                      # Sample size

z=(sample_mean-population_mean)/(std_dev/np.sqrt(40))
z

1.8973665961010264

In [52]:
# From z table we got the value 0.9678 which is greater than 0.05 so we accept the null hypothesis

In [76]:
import scipy.stats as st
pvalues=st.norm.cdf(sample_mean, loc=population_mean, scale=std_dev/np.sqrt(40))
pvalues

0.9711102144382013

In [74]:
#3. Problem – Fitness Program Impact Analysis
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(100)

# Sample size
n = 150

# Gender (0 = Female, 1 = Male)
gender = np.random.choice([0, 1], size=n)

# Initial scores (mean slightly < 65 to create realistic test)
initial_scores = np.random.normal(loc=64, scale=6, size=n)

# Final scores (showing average improvement)
final_scores = initial_scores + np.random.normal(loc=5, scale=3, size=n)

# Create DataFrame
df = pd.DataFrame({
    'Gender': gender,
    'Initial_Score': initial_scores,
    'Final_Score': final_scores
})

df.head()


Unnamed: 0,Gender,Initial_Score,Final_Score
0,0,73.167718,76.049901
1,0,67.883235,75.156484
2,1,59.93598,65.727168
3,1,62.409887,68.352951
4,1,68.476639,70.330144


In [80]:
### 1. One-Sample T-Test
#Test whether the **average initial fitness score** is at least **65**.
#**Null Hypothesis** H₀: μ ≥ 65 (Average initial score is at least 65)
#**Alternate Hypothesis** H₁: μ < 65 (Average initial score is less than 65)

from scipy.stats import ttest_1samp
alpha=0.05
mu=65
st.ttest_1samp(df.Initial_Score, mu)

TtestResult(statistic=-2.298969249023647, pvalue=0.02289660650128741, df=149)

In [82]:
# pvalue=0.02289660650128741, Alternate Hypothesis*H₁: μ < 65 (Average initial score is less than 65)

In [84]:
### 2. Two-Sample Independent T-Test
#**Null Hypothesis** H₀: μ₁ = μ₂ (No difference in average initial scores between males and females)
#**Alternate Hypothesis** H₁: μ₁ ≠ μ₂ (There is a difference in average initial scores)

from scipy.stats import ttest_ind  # For independent t-test
from scipy.stats import ttest_rel  # For paired sample t-test (used later)

In [88]:
t,p=st.ttest_ind(df[df['Gender']==1].Initial_Score,df[df['Gender']==0].Initial_Score)

In [90]:
t

-0.44707225764576697

In [92]:
p

0.6554764604792216

In [122]:
#3️Paired Sample t-Test:
#**Null Hypothesis** H₀: μ_diff = 0 (No change in scores before and after the program)
#**Alternate Hypothesis** H₁: μ_diff < 0 (Final scores are higher than initial scores)
t,p=st.ttest_rel(df.Initial_Score,df.Initial_Score)

In [124]:
t

nan

In [126]:
p

nan

In [128]:
#4 Problem – ANOVA Analysis of Customer Satisfaction Across Store Branches
import numpy as np
import pandas as pd

# Set seed for reproducibility
np.random.seed(42)

# Sample size per branch
n = 70

# Create satisfaction scores for three branches
branch_a = np.random.normal(loc=420, scale=30, size=n)
branch_b = np.random.normal(loc=400, scale=35, size=n)
branch_c = np.random.normal(loc=430, scale=25, size=n)

# Combine into a DataFrame
data = pd.DataFrame({
    'Customer_ID': range(1, n*3 + 1),
    'Branch': ['A'] * n + ['B'] * n + ['C'] * n,
    'Satisfaction_Score': np.concatenate([branch_a, branch_b, branch_c])
})

data.head()

Unnamed: 0,Customer_ID,Branch,Satisfaction_Score
0,1,A,434.901425
1,2,A,415.852071
2,3,A,439.430656
3,4,A,465.690896
4,5,A,412.975399


In [130]:
#- **H₀ (Null Hypothesis)**: The average satisfaction scores across all three branches are **equal**.
#- **H₁ (Alternative Hypothesis)**: At least one branch has a **different average** satisfaction score.

In [132]:
import numpy as np
import statsmodels.formula.api as sm

In [134]:
mod=sm.ols('Satisfaction_Score ~ Branch',data=data).fit()

In [136]:
import statsmodels.api as k

In [138]:
anov_table=k.stats.anova_lm(mod,typ=1)
anov_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
Branch,2.0,42018.567984,21009.283992,24.799904,2.212591e-10
Residual,207.0,175360.431682,847.151844,,


In [143]:
## 5 Problem – Evaluate Forecast Accuracy Using the Chi-Square Goodness of Fit Test
import numpy as np
from scipy.stats import chi2

# Data
expected = np.array([95, 110, 100, 130, 160, 210, 230])
observed = np.array([90, 105, 98, 135, 165, 205, 225])

In [147]:
#1. **Perform the Chi-Square Goodness of Fit Test** using the given data.
chi_statistic = 0
for i, obs in enumerate(observed):
    exp = expected[i]
    chi_statistic += ((obs - exp) ** 2) / exp
chi_statistic

1.1067315855387938

In [151]:
## Step 2: Degrees of freedom = number of categories - 1
DF=len(observed)-1
print('Degrees of Freedom: ', DF)

Degrees of Freedom:  6


In [153]:
## Step 3: Critical value at α = 0.10
# From Chi-Square Distribution table critical score = 10.645

In [155]:
## Step 4: Compare and conclude
#Critical Score (in the table 10.645 )is greater than test score 1.1067315855387938 , 
#we cannot reject the null hypothesis 

In [163]:
### Problem – Manual Covariance Calculation Between Study Hours and Exam Scores
##  Objective
#Manually compute the **covariance** between `Hours_Studied` and `Exam_Score` **without using built-in functions** like `.cov()` or NumPy methods.

# Dataset
hours = [2, 4, 6, 8, 10]
scores = [65, 70, 75, 85, 95]
mean_hours = sum(hours) / len(hours)
mean_hours

6.0

In [165]:
mean_scores = sum(scores) / len(scores)
mean_scores

78.0

In [167]:
n=len(hours)

In [173]:
h_s= [(2-mean_hours)*(65-mean_scores), 
 (4-mean_hours)*(70-mean_scores),
 (6-mean_hours)*(75-mean_scores),
 (8-mean_hours)*(85-mean_scores),
 (10-mean_hours)*(95-mean_scores)]
sum_h_s=np.sum(h_s)
print("Covariance: ",(sum_h_s/(n-1)))

Covariance:  37.5


In [175]:
###  Problem – Manual Correlation Calculation Between Exercise Hours and Stress Level
###  Objective: Manually compute the **Pearson correlation coefficient** between `Exercise_Hours` and `Stress_Level` without using built-in correlation functions.

# Data
exercise = [1, 3, 5, 7, 9]
stress = [85, 75, 60, 55, 40]

In [177]:
mean_exercise = sum(exercise) / len(exercise)
mean_stress = sum(stress) / len(stress)
n = len(exercise)

In [179]:
covariance = sum((exercise[i] - mean_exercise) * (stress[i] - mean_stress)for i in range(n)) / (n - 1)
covariance

-55.0

In [181]:
std_exercise = (sum((x - mean_exercise) ** 2 for x in exercise) / (n - 1)) ** 0.5
std_stress = (sum((y - mean_stress) ** 2 for y in stress) / (n - 1)) ** 0.5
correlation = covariance / (std_exercise * std_stress)
print("Correlation: ",correlation)

Correlation:  -0.9918365981341756
