In [None]:
#libraries and data

In [None]:
# NOTE(review): hard-coded absolute Google Drive path; presumably this only works in Colab with Drive mounted - confirm before running elsewhere.
%cd /content/drive/MyDrive/Statistics with Python/Inferential Statistics/Confidence Intervals

/content/drive/MyDrive/Statistics with Python/Inferential Statistics/Confidence Intervals


In [None]:
import pandas as pd
import scipy.stats as st
import math as m
import statsmodels.stats.api as sm
import numpy as np


In [None]:
#load data
# The CSV is read through a relative path, so the current working directory
# must contain "pizza_restaurant.csv".
df = pd.read_csv("pizza_restaurant.csv")
df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4
2,Calzone,Cheese_and_Garlic,3,20,31.5,7
3,Margherita,Cheese,4,23,20.8,7
4,Calzone,Cheese_and_Garlic,4,19,27.7,8


In [None]:
#summary statistics
# describe() reports count, mean, std, min, quartiles and max for the numeric columns
df.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,1000.0,1000.0,1000.0,1000.0
mean,3.965,19.342,25.0611,5.659
std,1.021185,3.345479,2.490397,2.459831
min,1.0,12.0,17.8,0.0
25%,3.0,17.0,23.3,4.0
50%,4.0,19.0,25.1,5.0
75%,5.0,21.0,26.7,7.0
max,7.0,33.0,32.4,15.0


Standard error of the sample mean

In [None]:
# Standard error of the mean for Price, computed two ways.
# 1) By hand: standard deviation divided by sqrt(n).
print(df["Price"].std() / m.sqrt(df["Price"].count()))
# 2) With SciPy's helper; the recorded output shows both lines print the same value.
print(st.sem(df["Price"]))

0.105793327900337
0.105793327900337


In [None]:
# Standardization and Z-score

In [None]:
#using the formula for delivery time
# z = (x - mean) / std, applied to the whole column at once

df['Delivery_Time_Standardized'] = (
    df['Delivery Time']
    .sub(df['Delivery Time'].mean())
    .div(df['Delivery Time'].std())
)

In [None]:
#using SKlearn
# NOTE: scikit-learn documents `preprocessing.scale` as dividing by the population
# standard deviation (ddof=0), while pandas' `.std()` defaults to the sample
# standard deviation (ddof=1). That is why this column differs slightly from
# 'Delivery_Time_Standardized' in the df.head() output further down.
from sklearn import preprocessing

df['Delivery_Time_Standardized2'] = preprocessing.scale(df['Delivery Time'])

In [None]:
#look at the data
# Compare the two standardized columns side by side; the output shows they
# agree closely but not exactly.

df.head()

Unnamed: 0,Product Name,Crust,Toppings,Price,Delivery Time,# pizzas the customer ordered before,Delivery_Time_Standardized,Delivery_Time_Standardized2
0,Pepperoni,Cheese_and_Garlic,4,17,26.3,4,0.497471,0.49772
1,Hawaiian,Cheese_and_Chili,4,17,27.8,4,1.099784,1.100335
2,Calzone,Cheese_and_Garlic,3,20,31.5,7,2.585491,2.586785
3,Margherita,Cheese,4,23,20.8,7,-1.711012,-1.711868
4,Calzone,Cheese_and_Garlic,4,19,27.7,8,1.05963,1.06016


#confidence level

In [None]:

# Cumulative probabilities for which the standard-normal quantile (z-value) is printed
cl = [0.005, 0.025, 0.05, 0.95, 0.975, 0.995]

for alpha in cl:
    # ppf is the inverse CDF; round to two decimals for display
    z_value = round(st.norm.ppf(alpha), 2)
    print(f"The corresponding normal distribution value for {alpha} is {z_value}")

The corresponding normal distribution value for 0.005 is -2.58
The corresponding normal distribution value for 0.025 is -1.96
The corresponding normal distribution value for 0.05 is -1.64
The corresponding normal distribution value for 0.95 is 1.64
The corresponding normal distribution value for 0.975 is 1.96
The corresponding normal distribution value for 0.995 is 2.58


In [None]:
#confidence interval for the price mean
# 95% normal-based interval: centred on the sample mean, scaled by the standard error

print(f"The mean is {df['Price'].mean()}")
st.norm.interval(0.95, loc=df['Price'].mean(), scale=st.sem(df['Price']))

The mean is 19.342


(19.134648887510703, 19.549351112489294)

In [None]:
# create a python function that for each numerical variable in a pandas
#dataframe computes the confidence interval for the mean of each
#variable that has a sample size bigger than 30



def calculate_confidence_intervals(df, confidence=0.95):
    """Return normal-based confidence intervals for the mean of large numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose numeric columns (selected via ``np.number``) are examined.
    confidence : float, default 0.95
        Confidence level passed to ``st.norm.interval``.

    Returns
    -------
    dict
        Maps column name -> ``(lower, upper)`` as returned by
        ``st.norm.interval``. Columns with 30 or fewer non-missing values
        are left out.
    """
    results = {}
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    for name in numeric_columns:
        values = df[name].dropna()
        size = len(values)

        # Only columns with more than 30 observations get a normal-based interval.
        if size <= 30:
            continue

        center = np.mean(values)
        # Standard error: sample standard deviation (ddof=1) over sqrt(n).
        standard_error = np.std(values, ddof=1) / np.sqrt(size)
        results[name] = st.norm.interval(confidence, loc=center, scale=standard_error)

    return results

# usage: print the default (95%) intervals for the numeric columns of the full data set
print(calculate_confidence_intervals(df))




{'Toppings': (3.9017074909279676, 4.028292509072032), 'Price': (19.134648887510703, 19.549351112489294), 'Delivery Time': (24.9067464105456, 25.2154535894544), '# pizzas the customer ordered before': (5.5065408812039385, 5.811459118796061), 'Delivery_Time_Standardized': (-0.061979503230456014, 0.06197950323045629), 'Delivery_Time_Standardized2': (-0.06201051624377051, 0.06201051624377079)}


In [None]:
# take a sample from the data
# A fixed random_state makes the 20-row draw, and the t-interval computed from it,
# reproducible when the notebook is restarted and re-run.
sample = df.sample(20, random_state=42)
sample.describe()

Unnamed: 0,Toppings,Price,Delivery Time,# pizzas the customer ordered before
count,20.0,20.0,20.0,20.0
mean,4.05,19.5,24.425,5.65
std,1.190975,3.236307,2.175673,3.013566
min,1.0,15.0,20.6,1.0
25%,3.75,17.0,23.175,3.75
50%,4.0,18.5,23.8,5.5
75%,5.0,22.25,26.125,7.25
max,6.0,25.0,28.8,12.0


In [None]:
# Small sample (20 rows), so use Student's t with n - 1 degrees of freedom.
# Note: the `df=` keyword below is the t-distribution's degrees of freedom, not the DataFrame.
print(f"The mean is {sample['Price'].mean()}")
st.t.interval(0.95,
              df=len(sample) - 1,
              loc=sample['Price'].mean(),
              scale=st.sem(sample['Price']))


The mean is 19.5


(17.985361612746438, 21.014638387253562)

In [None]:
# Adapt the formula to calculate the confidence interval
# when the sample size is less than 30



def calculate_confidence_intervals(df, confidence=0.95):
    """Return a confidence interval for the mean of every numeric column.

    Columns with fewer than 30 non-missing observations use Student's
    t-distribution (``n - 1`` degrees of freedom); larger columns use the
    normal distribution. In both cases the scale is the standard error,
    i.e. the sample standard deviation (``ddof=1``) divided by ``sqrt(n)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose numeric columns (selected via ``np.number``) are examined.
    confidence : float, default 0.95
        Confidence level passed to ``st.t.interval`` / ``st.norm.interval``.

    Returns
    -------
    dict
        Maps column name -> ``(lower, upper)``. Columns with fewer than two
        non-missing values are omitted, because their sample standard
        deviation is undefined.
    """
    intervals = {}

    for column in df.select_dtypes(include=[np.number]).columns:
        data = df[column].dropna()
        n = len(data)

        # At least two observations are required: with n == 1 the ddof=1
        # standard deviation is NaN and the t-distribution would have 0
        # degrees of freedom, which yields a meaningless (nan, nan) interval.
        if n < 2:
            continue

        mean = np.mean(data)
        std_error = np.std(data, ddof=1) / np.sqrt(n)  # standard error of the mean

        if n < 30:
            # Small sample: use the t-distribution
            interval = st.t.interval(confidence, df=n - 1, loc=mean, scale=std_error)
        else:
            # Large sample: use the normal distribution
            interval = st.norm.interval(confidence, loc=mean, scale=std_error)
        intervals[column] = interval

    return intervals

# usage: print the default (95%) intervals for the numeric columns of the full data set
print(calculate_confidence_intervals(df))


{'Toppings': (3.9017074909279676, 4.028292509072032), 'Price': (19.134648887510703, 19.549351112489294), 'Delivery Time': (24.9067464105456, 25.2154535894544), '# pizzas the customer ordered before': (5.5065408812039385, 5.811459118796061)}
