# Statistics & Probability Assignment

Complete solutions (code + brief explanations) for Q1–Q24. Run each code cell in Google Colab.

## Question 1
Generate a list of 100 integers between 90 and 130 stored in `int_list`. Then implement:

(i) mean function

(ii) mode of a list

(iii) weighted mean

(iv) geometric mean

(v) harmonic mean

(vi) midrange

(vii) trimmed mean


In [None]:

import random
from collections import Counter
import math
import statistics
from typing import List, Iterable, Tuple

# Seed the stdlib generator so the sample is reproducible across runs.
random.seed(0)
int_list = [
    random.randint(90, 130)  # inclusive on both ends
    for _ in range(100)
]
print("int_list sample (first 20):", int_list[0:20])

# (i) Mean
def mean(lst: Iterable[float]) -> float:
    """Return the arithmetic mean of ``lst``.

    The input is materialised into a list first, so one-shot iterables are
    accepted. An empty input yields ``float('nan')`` instead of raising.
    """
    values = list(lst)
    if not values:
        return float('nan')
    return sum(values) / len(values)

# Report the arithmetic mean of the generated sample.
print("Mean:", mean(int_list))

# (ii) Mode (may be multimodal) - return list of modes
def mode_list(lst: Iterable[int]) -> List[int]:
    """Return every value that occurs with the highest frequency.

    Args:
        lst: Any iterable of hashable values (lists, generators, arrays).

    Returns:
        The modes in order of first appearance, or ``[]`` for empty input.
    """
    # Count first and test the counter: a generator is always truthy and a
    # numpy array has no unambiguous truth value, so ``if not lst`` cannot
    # be used to detect emptiness on arbitrary iterables.
    counts = Counter(lst)
    if not counts:
        return []
    max_count = max(counts.values())
    return [val for val, count in counts.items() if count == max_count]

# Report all modes of the generated sample (there may be more than one).
print("Mode(s):", mode_list(int_list))

# (iii) Weighted mean
def weighted_mean(values: Iterable[float], weights: Iterable[float]) -> float:
    """Return sum(value * weight) / sum(weight).

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    vals = list(values)
    wts = list(weights)
    if not vals or len(vals) != len(wts):
        raise ValueError("values and weights must be same non-zero length")
    weighted_total = 0
    for value, weight in zip(vals, wts):
        weighted_total += value * weight
    return weighted_total / sum(wts)

# Example weights (positive): one uniform draw in [0.5, 2.0] per observation.
weights = [random.uniform(0.5, 2.0) for _ in int_list]
print("Weighted mean (example):", weighted_mean(int_list, weights))

# (iv) Geometric mean (only positive values)
def geometric_mean(lst: Iterable[float]) -> float:
    """Return the geometric mean of strictly positive numbers.

    Computed as exp(mean(log(x))) so large products cannot overflow.

    Returns:
        The geometric mean, or ``float('nan')`` for empty input (the same
        convention ``mean`` uses) rather than a ZeroDivisionError.

    Raises:
        ValueError: if any value is zero or negative.
    """
    values = list(lst)
    if not values:
        return float('nan')
    if any(x <= 0 for x in values):
        raise ValueError("geometric mean defined for positive numbers only")
    # fsum keeps the accumulated log-sum accurate for long inputs.
    log_sum = math.fsum(math.log(x) for x in values)
    return math.exp(log_sum / len(values))

# Report the geometric mean of the generated sample.
print("Geometric mean:", geometric_mean(int_list))

# (v) Harmonic mean
def harmonic_mean(lst: Iterable[float]) -> float:
    """Return n / sum(1/x) for the values in ``lst``.

    Returns:
        The harmonic mean, or ``float('nan')`` for empty input (the same
        convention ``mean`` uses) rather than a ZeroDivisionError.

    Raises:
        ValueError: if any value equals zero.
        ZeroDivisionError: if mixed-sign reciprocals cancel to exactly zero.
    """
    values = list(lst)
    if not values:
        return float('nan')
    if any(x == 0 for x in values):
        raise ValueError("harmonic mean not defined when any value is 0")
    return len(values) / sum(1.0 / x for x in values)

# Report the harmonic mean of the generated sample.
print("Harmonic mean:", harmonic_mean(int_list))

# (vi) Midrange
def midrange(lst: Iterable[float]) -> float:
    """Return the midpoint between the smallest and largest value."""
    values = list(lst)
    lowest, highest = min(values), max(values)
    return (lowest + highest) / 2.0

# Report the midrange of the generated sample.
print("Midrange:", midrange(int_list))

# (vii) Trimmed mean: remove given percent from both ends
def trimmed_mean(lst: Iterable[float], proportion_to_cut: float) -> float:
    """Return the mean after dropping a fraction of values from each tail.

    ``int(n * proportion_to_cut)`` values are removed from both the low and
    the high end of the sorted data. NaN is returned when nothing remains.

    Raises:
        ValueError: if ``proportion_to_cut`` is outside [0, 0.5).
    """
    if not (0 <= proportion_to_cut < 0.5):
        raise ValueError("proportion_to_cut must be in [0,0.5)")
    ordered = sorted(lst)
    total = len(ordered)
    cut = int(total * proportion_to_cut)
    if total - 2 * cut <= 0:
        return float('nan')
    return mean(ordered[cut:total - cut])

# Report the mean after trimming 10% of the values from each tail.
print("Trimmed mean (10%):", trimmed_mean(int_list, 0.10))


## Question 2
Generate `int_list2` with 500 integers between 200 and 300. Then:

(i) Visual comparisons: frequency histogram + Gaussian fit, KDE, overlay

(ii) range

(iii) variance & std

(iv) IQR

(v) coefficient of variation

(vi) MAD

(vii) quartile deviation

(viii) range-based coefficient of dispersion

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

np.random.seed(0)
# randint's upper bound is exclusive, so 301 makes 300 attainable.
int_list2 = list(np.random.randint(200,301,size=500))
print("int_list2 sample (first 20):", int_list2[:20])

# (i) Visual comparisons
data = np.array(int_list2)
n_bins = 20

plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
# Frequency histogram + Gaussian fit
sns.histplot(data, bins=n_bins, kde=False)
mu, std = data.mean(), data.std(ddof=0)
xmin, xmax = data.min(), data.max()
x = np.linspace(xmin, xmax, 200)
# A density is turned into expected counts per bin by multiplying with
# (sample size * histogram bin width). The bin width is the data range
# divided by the number of bins -- NOT the spacing of the plotting grid
# ``x``, which would draw the curve roughly ten times too low.
bin_width = (xmax - xmin) / n_bins
plt.plot(x, data.size * bin_width * stats.norm.pdf(x, mu, std), label=f'Normal fit (mu={mu:.1f}, std={std:.1f})')
plt.title("Frequency + Gaussian fit")
plt.legend()

plt.subplot(1,3,2)
# Frequency smoothened KDE plot
sns.kdeplot(data, bw_method='scott')
plt.title("KDE (smoothed)")

plt.subplot(1,3,3)
# Gaussian distribution curve + smooth KDE overlay
# stat='density' puts the histogram on the same scale as the PDF and KDE.
sns.histplot(data, bins=n_bins, kde=False, stat='density', alpha=0.4)
sns.kdeplot(data, bw_method='scott', label='KDE')
plt.plot(x, stats.norm.pdf(x, mu, std), label='Normal PDF')
plt.title("Gaussian PDF & KDE")
plt.legend()
plt.tight_layout()
plt.show()

# (ii) Range
def data_range(lst: Iterable[float]) -> float:
    """Return the spread of the data: largest value minus smallest value."""
    values = list(lst)
    highest = max(values)
    lowest = min(values)
    return highest - lowest

# Report the range (max - min) of the second sample.
print("Range:", data_range(int_list2))

# (iii) Variance & std (sample & population)
def variance_and_std(lst: Iterable[float], sample: bool=True) -> Tuple[float,float]:
    """Return ``(variance, standard deviation)`` of the data.

    ``sample=True`` divides by n-1 (ddof=1); ``sample=False`` divides by n
    (ddof=0).
    """
    values = np.array(lst)
    ddof = 1 if sample else 0
    return values.var(ddof=ddof), values.std(ddof=ddof)

# Report the (variance, std) pair using the n-1 (sample) denominator.
print("Sample variance & std:", variance_and_std(int_list2, sample=True))

# (iv) IQR
def iqr(lst: Iterable[float]) -> float:
    """Return the interquartile range: 75th percentile minus 25th percentile."""
    lower_quartile, upper_quartile = np.percentile(lst, [25, 75])
    return upper_quartile - lower_quartile

# Report the interquartile range of the second sample.
print("IQR:", iqr(int_list2))

# (v) Coefficient of variation (CV = sd/mean)
def coefficient_of_variation(lst: Iterable[float], sample: bool=True) -> float:
    """Return the standard deviation divided by the mean.

    ``sample`` selects the n-1 (True) or n (False) denominator for the
    standard deviation.
    """
    values = np.array(lst)
    spread = values.std(ddof=1 if sample else 0)
    return spread / values.mean()

# Report the coefficient of variation (default: sample standard deviation).
print("Coefficient of Variation:", coefficient_of_variation(int_list2))

# (vi) Mean absolute deviation (MAD) about the mean
def mad(lst: Iterable[float]) -> float:
    """Return the average absolute distance of the values from their mean."""
    values = np.array(lst)
    centre = values.mean()
    abs_deviations = np.abs(values - centre)
    return np.mean(abs_deviations)

# Report the mean absolute deviation about the mean.
print("MAD:", mad(int_list2))

# (vii) Quartile deviation (semi-interquartile range)
def quartile_deviation(lst: Iterable[float]) -> float:
    """Return half of the interquartile range computed by ``iqr``."""
    interquartile_range = iqr(lst)
    return interquartile_range / 2.0

# Report the quartile deviation (half the IQR).
print("Quartile Deviation:", quartile_deviation(int_list2))

# (viii) Range-based coefficient of dispersion = (max-min)/(max+min)
def range_based_coefficient_of_dispersion(lst: Iterable[float]) -> float:
    """Return (max - min) / (max + min) for the data."""
    values = np.array(lst)
    lowest, highest = values.min(), values.max()
    return (highest - lowest) / (highest + lowest)

# Report the range-based coefficient of dispersion.
print("Range-based Coefficient of Dispersion:", range_based_coefficient_of_dispersion(int_list2))


## Question 3
Create a Python class for a discrete random variable with methods to compute expected value and variance.

In [None]:

class DiscreteRV:
    """Discrete random variable given by outcomes and their probabilities.

    Args:
        outcomes: Numeric values the variable can take.
        probs: Probability of each outcome, in the same order.

    Raises:
        ValueError: if the two inputs differ in length, a probability is
            negative, or the probabilities do not sum to 1.
    """

    def __init__(self, outcomes: Iterable[float], probs: Iterable[float]):
        self.outcomes = list(outcomes)
        self.probs = list(probs)
        # zip() silently truncates to the shorter input, which would drop
        # outcomes or probabilities without warning.
        if len(self.outcomes) != len(self.probs):
            raise ValueError("outcomes and probs must have the same length")
        # A sum of 1 alone does not make a distribution valid: [1.5, -0.5]
        # also sums to 1.
        if any(p < 0 for p in self.probs):
            raise ValueError("Probabilities must be non-negative")
        if not math.isclose(sum(self.probs), 1.0, rel_tol=1e-6):
            raise ValueError("Probabilities must sum to 1")

    def expected_value(self):
        """Return E[X] = sum of x * P(x)."""
        return sum(x * p for x, p in zip(self.outcomes, self.probs))

    def variance(self):
        """Return Var[X] = sum of (x - E[X])^2 * P(x)."""
        mu = self.expected_value()
        return sum(((x - mu) ** 2) * p for x, p in zip(self.outcomes, self.probs))

# Example: a two-outcome variable taking value 1 with probability 0.4.
rv = DiscreteRV([0,1], [0.6,0.4])
print("Expected value:", rv.expected_value())
print("Variance:", rv.variance())


## Question 4
Simulate rolling a fair six-sided die many times and calculate expected value and variance.

In [None]:

import random
import numpy as np

def simulate_die_rolls(n=10000):
    """Roll a fair six-sided die ``n`` times.

    Returns the pair (mean of the rolls, population variance of the rolls).
    """
    rolls = []
    for _ in range(n):
        rolls.append(random.randint(1, 6))
    average = np.mean(rolls)
    spread = np.var(rolls, ddof=0)
    return average, spread

# Run the simulation with 100000 rolls and print the estimates.
mean_sim, var_sim = simulate_die_rolls(100000)
print("Simulated mean:", mean_sim)
print("Simulated variance:", var_sim)

# Theoretical values: mean = 3.5, variance = 35/12 ≈ 2.9167
print("Theoretical mean:", 3.5, "Theoretical variance:", 35/12)


## Question 5
Generate random samples from distributions (binomial, Poisson) and compute mean & variance.

In [None]:

import numpy as np

def sample_binomial(n_trials=10, p=0.3, size=1000):
    """Draw ``size`` binomial(n_trials, p) samples.

    Returns the tuple (samples, sample mean, population variance).
    """
    draws = np.random.binomial(n_trials, p, size=size)
    average = draws.mean()
    spread = draws.var(ddof=0)
    return draws, average, spread

def sample_poisson(lam=3, size=1000):
    """Draw ``size`` Poisson(lam) samples.

    Returns the tuple (samples, sample mean, population variance).
    """
    draws = np.random.poisson(lam, size=size)
    average = draws.mean()
    spread = draws.var(ddof=0)
    return draws, average, spread

# Draw 10000 samples from each distribution and print mean and variance.
b_s, b_mean, b_var = sample_binomial(10, 0.3, 10000)
p_s, p_mean, p_var = sample_poisson(3, 10000)
print("Binomial mean/var:", b_mean, b_var)
print("Poisson mean/var:", p_mean, p_var)


## Question 6
Generate random numbers from Gaussian distribution and compute mean, variance, std.

In [None]:

# Draw 10000 values from a normal distribution with mean 50 and std 5.
s = np.random.normal(50, 5, 10000)
print("Sample mean:", np.mean(s))
print("Sample variance:", np.var(s, ddof=0))
print("Sample std:", np.std(s, ddof=0))

# Quick histogram (density scale) of the drawn values.
plt.figure(figsize=(6, 4))
sns.histplot(s, bins=30, stat='density')
plt.title("Normal samples histogram")
plt.show()


## Question 7
Load seaborn's `tips` dataset and analyze the `total_bill` and `tip` columns:

(i) skewness of each column

(ii) classify each distribution as positively skewed, negatively skewed, or approximately symmetric

(iii) covariance

(iv) Pearson correlation

(v) scatter plot visualization

In [None]:

# Load seaborn's bundled 'tips' example dataset.
tips = sns.load_dataset('tips')
# NOTE(review): `cols` is not referenced again in the visible code.
cols = ['total_bill','tip']

# (i) skewness
def skewness(arr):
    """Return the skewness of ``arr`` (scipy.stats.skew) as a plain float."""
    raw_skew = stats.skew(arr)
    return float(raw_skew)

# Report the skewness of both columns.
print("Skewness total_bill:", skewness(tips['total_bill']))
print("Skewness tip:", skewness(tips['tip']))

# (ii) decide type
def skew_type(arr):
    """Classify the skew of ``arr`` using a +/-0.5 threshold on ``skewness``."""
    value = skewness(arr)
    if value > 0.5:
        return "Positive skew"
    if value < -0.5:
        return "Negative skew"
    return "Approximately symmetric"

# Report the skew classification of both columns.
print("total_bill:", skew_type(tips['total_bill']))
print("tip:", skew_type(tips['tip']))

# (iii) covariance
def covariance(x,y):
    """Return the sample covariance (n-1 denominator) of ``x`` and ``y``."""
    xs = np.array(x)
    ys = np.array(y)
    x_dev = xs - xs.mean()
    y_dev = ys - ys.mean()
    return (x_dev * y_dev).sum() / (len(xs) - 1)

# Report the sample covariance between bill amount and tip.
print("Covariance:", covariance(tips['total_bill'], tips['tip']))

# (iv) Pearson correlation
# pearsonr returns the correlation coefficient and its p-value.
r, pval = stats.pearsonr(tips['total_bill'], tips['tip'])
print("Pearson r:", r, "p-value:", pval)

# (v) Scatter plot
# Bill amount on the x-axis against tip on the y-axis.
plt.figure(figsize=(6,4))
sns.scatterplot(x='total_bill', y='tip', data=tips)
plt.title("Scatter: total_bill vs tip")
plt.show()


## Question 8
Function for PDF of normal distribution (continuous RV).

In [None]:

def normal_pdf(x, mu=0, sigma=1):
    """Return the normal density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation; must be strictly positive.

    Raises:
        ValueError: if ``sigma`` is zero or negative (the formula would
            otherwise divide by zero or return a negative "density").
    """
    if sigma <= 0:
        raise ValueError("sigma must be positive")
    z = (x - mu) / sigma
    # z * z saturates to inf for extreme z (giving a density of 0.0), whereas
    # z ** 2 raises OverflowError on floats.
    return math.exp(-0.5 * z * z) / (sigma * math.sqrt(2 * math.pi))

# Evaluate the standard normal density at its peak.
print("Normal PDF at 0 (mu=0,sigma=1):", normal_pdf(0))


## Question 9
CDF of exponential distribution.

In [None]:

def exponential_cdf(x, lambd=1.0):
    """Return P(X <= x) for an exponential distribution with rate ``lambd``.

    Args:
        x: Evaluation point; any negative ``x`` gives 0.0.
        lambd: Rate parameter; must be strictly positive.

    Raises:
        ValueError: if ``lambd`` is zero or negative (the formula would
            otherwise produce values outside [0, 1)).
    """
    if lambd <= 0:
        raise ValueError("lambd must be positive")
    if x < 0:
        return 0.0
    # -expm1(-t) equals 1 - exp(-t) but keeps full precision for tiny t,
    # where the subtraction would round to exactly 0.
    return -math.expm1(-lambd * x)

# Evaluate the CDF at x=1 for rate 1.
print("Exponential CDF at x=1, lambda=1:", exponential_cdf(1,1))


## Question 10
PMF of Poisson distribution.

In [None]:

def poisson_pmf(k, lam):
    """Return P(X = k) for a Poisson distribution with mean ``lam``.

    The value exp(-lam) * lam**k / k! is evaluated in log space so that
    large ``k`` cannot overflow ``lam**k`` or the int-to-float conversion of
    ``k!``.

    Args:
        k: Non-negative integer count.
        lam: Mean of the distribution; must be non-negative.

    Raises:
        ValueError: if ``lam`` is negative or ``k`` is not a non-negative
            integer.
    """
    if lam < 0:
        raise ValueError("lam must be non-negative")
    if k < 0 or k != int(k):
        raise ValueError("k must be a non-negative integer")
    if lam == 0:
        # log(0) is undefined; with zero mean all mass sits on k == 0.
        return 1.0 if k == 0 else 0.0
    # lgamma(k + 1) == log(k!)
    return math.exp(k * math.log(lam) - lam - math.lgamma(k + 1))

# Evaluate the PMF at k=2 for mean 3.
print("Poisson PMF k=2, lambda=3:", poisson_pmf(2,3))


## Question 11
Z-test for proportions: compare conversion rates of old and new layouts.

In [None]:

import numpy as np
from math import sqrt

# Binary conversion outcomes per visitor (1 = converted): 50 of 1000 for the
# old layout, 70 of 1000 for the new one. Ones come first, then zeros.
old_layout = np.repeat([1, 0], [50, 950])
new_layout = np.repeat([1, 0], [70, 930])

def z_test_proportions(a, b):
    """Two-sample z-test for equal proportions using a pooled estimate.

    Args:
        a: 0/1 outcomes of the first group (any array-like).
        b: 0/1 outcomes of the second group (any array-like).

    Returns:
        ``(z, p, pa, pb)``: the z statistic for ``pb - pa``, the two-sided
        p-value, and the observed proportions of each group.
    """
    # asarray lets plain lists be passed as well as numpy arrays.
    a = np.asarray(a)
    b = np.asarray(b)
    pa = a.mean(); pb = b.mean()
    n1 = len(a); n2 = len(b)
    p_pool = (a.sum() + b.sum())/(n1 + n2)
    se = math.sqrt(p_pool*(1-p_pool)*(1/n1 + 1/n2))
    z = (pb - pa)/se
    # two-sided p-value; the survival function keeps precision in the far
    # tail where 1 - cdf(|z|) would round to exactly 0.
    p = 2*stats.norm.sf(abs(z))
    return z, p, pa, pb

# Run the test; z is computed as (new - old), so z > 0 means the new layout
# converted at a higher observed rate.
z, p, pa, pb = z_test_proportions(old_layout, new_layout)
print("old p:", pa, "new p:", pb, "z:", z, "p-value:", p)
# Improvement is claimed only when the two-sided p-value is below 0.05 AND
# the difference points in the new layout's favour.
if p < 0.05 and z>0:
    print("New layout has significantly higher conversion rate (reject H0).")
else:
    print("No significant improvement detected at alpha=0.05.")


## Question 12
Z-test for before/after program scores (paired).

In [None]:

before_program = np.array([75,80,85,70,90,78,92,88,82,87])
after_program = np.array([80,85,90,80,92,80,95,90,85,88])

# For paired samples, compute differences and use z-test approx (n small, better use t-test but we use z as requested)
diff = after_program - before_program
# The spread is estimated from the sample itself, so use the unbiased sample
# standard deviation (ddof=1); ddof=0 understates the standard error and
# inflates the statistic.
dz = diff.mean() / (diff.std(ddof=1)/math.sqrt(len(diff)))
# Survival function instead of 1 - cdf: avoids rounding to 0 for large |z|.
p_val = 2*stats.norm.sf(abs(dz))
print("z-stat:", dz, "p-value:", p_val)
# The test is two-sided, so also require a positive mean difference before
# describing the result as an improvement.
if p_val < 0.05 and dz > 0:
    print("Significant improvement after program (reject H0).")
else:
    print("No significant improvement detected.")


## Question 13
Z-test for blood pressure before/after drug.

In [None]:

before_drug = np.array([145,150,140,135,155,160,152,148,130,138])
after_drug = np.array([130,140,132,128,145,148,138,136,125,130])

diff = before_drug - after_drug  # reduction
# The spread is estimated from the sample itself, so use the unbiased sample
# standard deviation (ddof=1); ddof=0 understates the standard error.
z_stat = diff.mean() / (diff.std(ddof=1)/math.sqrt(len(diff)))
# Survival function instead of 1 - cdf: with |z| this large, 1 - cdf rounds
# to exactly 0 and the reported p-value would be a meaningless 0.0.
p_val = 2*stats.norm.sf(abs(z_stat))
print("z-stat:", z_stat, "p-value:", p_val)
# z_stat > 0 means the mean reduction is positive (pressure went down).
if p_val < 0.05 and z_stat>0:
    print("Drug appears effective (significant reduction).")
else:
    print("No significant evidence of effect at alpha=0.05.")


## Question 14
Z-test: response times claim (mean < 5). One-sided test.

In [None]:

response_times = np.array([4.3,3.8,5.1,4.9,4.7,4.2,5.2,4.5,4.6,4.4])
# H0: mu = 5, H1: mu < 5
mu0 = 5.0
n = len(response_times)
# The spread is estimated from the sample itself, so use the unbiased sample
# standard deviation (ddof=1); ddof=0 understates the standard error.
z = (response_times.mean() - mu0) / (response_times.std(ddof=1)/math.sqrt(n))
p_one_sided = stats.norm.cdf(z)  # lower tail, matching H1: mu < 5
print("z:", z, "one-sided p:", p_one_sided)
if p_one_sided < 0.05:
    print("Claim (mean < 5) supported at alpha=0.05.")
else:
    print("Insufficient evidence to support claim.")


## Question 15
A/B test: two samples of clicks. Compute t-statistic, df, p-value (Welch's t-test).

In [None]:

layout_a_clicks = np.array([28, 32, 33, 29, 31, 34, 30, 35, 36, 37])
layout_b_clicks = np.array([40, 41, 38, 42, 39, 44, 43, 41, 45, 47])

# Welch's t-test (unequal variances); B is passed first, so a positive
# statistic means layout B has the larger sample mean.
t_stat, p_val = stats.ttest_ind(
    layout_b_clicks,
    layout_a_clicks,
    equal_var=False,
)
# Degrees of freedom for Welch's test are computed by hand here with the
# Welch-Satterthwaite approximation.
def welch_df(a,b):
    """Return the Welch-Satterthwaite degrees of freedom for two samples.

    Both arguments must expose ``.var(ddof=1)`` and ``len()`` (numpy arrays).
    """
    var_a = a.var(ddof=1)
    var_b = b.var(ddof=1)
    size_a = len(a)
    size_b = len(b)
    numerator = (var_a / size_a + var_b / size_b) ** 2
    term_a = (var_a ** 2) / ((size_a ** 2) * (size_a - 1))
    term_b = (var_b ** 2) / ((size_b ** 2) * (size_b - 1))
    return numerator / (term_a + term_b)

# Report the statistic, p-value and the hand-computed degrees of freedom.
df = welch_df(layout_b_clicks, layout_a_clicks)
print("t-stat:", t_stat, "p-value:", p_val, "df (approx):", df)


## Question 16
Compare existing vs new drug cholesterol levels (two-sample t-test).

In [None]:

existing_drug_levels = np.array([180, 182, 175, 185, 178, 176, 172, 184, 179, 183])
new_drug_levels = np.array([170, 172, 165, 168, 175, 173, 170, 178, 172, 176])

# Welch's two-sample t-test (no equal-variance assumption).
t_stat, p_val = stats.ttest_ind(
    existing_drug_levels,
    new_drug_levels,
    equal_var=False,
)
df = welch_df(existing_drug_levels, new_drug_levels)
print("t-stat:", t_stat, "p-value:", p_val, "df approx:", df)
# Decision at the 5% significance level.
print(
    "Significant difference between drugs."
    if p_val < 0.05
    else "No significant difference detected."
)


## Question 17
Pre/post intervention paired t-test.

In [None]:

pre_intervention_scores = np.array([80, 85, 90, 75, 88, 82, 92, 78, 85, 87])
post_intervention_scores = np.array([90, 92, 88, 92, 95, 91, 96, 93, 89, 93])

# Paired t-test; "post" is passed first, so a positive statistic means the
# scores were higher after the intervention.
t_stat, p_val = stats.ttest_rel(
    post_intervention_scores,
    pre_intervention_scores,
)
print("paired t-stat:", t_stat, "p-value:", p_val)
# Decision at the 5% significance level.
print(
    "Intervention had a significant impact."
    if p_val < 0.05
    else "No significant impact found."
)


## Question 18
Two-sample t-test for male vs female salaries (synthetic).

In [None]:

# Seed first so the synthetic salaries are reproducible.
np.random.seed(0)
male_salaries = np.random.normal(50000, 10000, 20)
female_salaries = np.random.normal(55000, 9000, 20)

# Welch's two-sample t-test (no equal-variance assumption).
t_stat, p_val = stats.ttest_ind(
    male_salaries,
    female_salaries,
    equal_var=False,
)
df = welch_df(male_salaries, female_salaries)
print("t-stat:", t_stat, "p-value:", p_val, "df approx:", df)
# Decision at the 5% significance level.
print(
    "Significant salary difference detected."
    if p_val < 0.05
    else "No significant difference detected at alpha=0.05."
)


## Question 19
Compare version1 vs version2 quality scores (two-sample t-test).

In [None]:

version1_scores = np.array([
    85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87,
    84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85,
])
version2_scores = np.array([
    80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80,
    79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82,
])

# Welch's two-sample t-test (no equal-variance assumption).
t_stat, p_val = stats.ttest_ind(
    version1_scores,
    version2_scores,
    equal_var=False,
)
df = welch_df(version1_scores, version2_scores)
print("t-stat:", t_stat, "p-value:", p_val, "df approx:", df)
# Decision at the 5% significance level.
print(
    "Significant difference in quality."
    if p_val < 0.05
    else "No significant difference found."
)


## Question 20
Compare branch A vs B customer satisfaction (two-sample t-test).

In [None]:

branch_a_scores = np.array([
    4, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3,
    4, 5, 4, 3, 5, 4, 4, 5, 4, 3, 5, 4, 4,
])
branch_b_scores = np.array([
    3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2,
    4, 3, 4, 2, 3, 4, 3, 3, 4, 2, 3, 4, 3,
])

# Welch's two-sample t-test (no equal-variance assumption).
t_stat, p_val = stats.ttest_ind(
    branch_a_scores,
    branch_b_scores,
    equal_var=False,
)
df = welch_df(branch_a_scores, branch_b_scores)
print("t-stat:", t_stat, "p-value:", p_val, "df approx:", df)
# Decision at the 5% significance level.
print(
    "Significant difference in satisfaction."
    if p_val < 0.05
    else "No significant difference found."
)


## Question 21
Chi-Square test for association between age groups and voter preferences.

In [None]:

# Seed first so the synthetic survey is reproducible.
np.random.seed(0)
age_groups = np.random.choice(['18-30', '31-50', '51+'], 30)
voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], 30)

# Build contingency table
import pandas as pd
ct = pd.crosstab(age_groups, voter_preferences)
print("Contingency table:\n", ct)

# Chi-square test of independence on the observed counts.
chi2, p, dof, expected = stats.chi2_contingency(ct)
print("chi2:", chi2, "p-value:", p, "dof:", dof)
# Decision at the 5% significance level.
print(
    "Significant association between age group and preference."
    if p < 0.05
    else "No significant association detected."
)


## Question 22
Chi-Square test on product satisfaction vs region (provided contingency table).

In [None]:

# Observed counts from the question (3 rows x 4 columns).
data = np.array([
    [50, 30, 40, 20],
    [30, 40, 30, 50],
    [20, 30, 40, 30],
])
chi2, p, dof, expected = stats.chi2_contingency(data)
print("chi2:", chi2, "p-value:", p, "dof:", dof)
print("Expected counts:\n", expected)
# Decision at the 5% significance level.
print(
    "Significant relationship between satisfaction and region."
    if p < 0.05
    else "No significant relationship detected."
)


## Question 23
Chi-Square test for job performance before vs after training (provided table).

In [None]:

data = np.array([[50,30,20], [30,40,30], [20,30,40]])
chi2, p, dof, expected = stats.chi2_contingency(data)
print("chi2:", chi2, "p-value:", p, "dof:", dof)
if p < 0.05:
    print("Significant change in job performance distribution.")
else:
    print("No significant change detected.")


## Question 24
ANOVA to test difference among Standard, Premium, Deluxe satisfaction scores.

In [None]:

standard_scores = np.array([80, 85, 90, 78, 88, 82, 92, 78, 85, 87])
premium_scores = np.array([90, 92, 88, 92, 95, 91, 96, 93, 89, 93])
deluxe_scores = np.array([95, 98, 92, 97, 96, 94, 98, 97, 92, 99])

# One-way ANOVA across the three groups.
f_stat, p_val = stats.f_oneway(
    standard_scores,
    premium_scores,
    deluxe_scores,
)
print("ANOVA F-stat:", f_stat, "p-value:", p_val)
# Decision at the 5% significance level.
print(
    "At least one group mean differs significantly."
    if p_val < 0.05
    else "No evidence of difference among group means."
)
