# A reference guide for statistical data collection and Analysis

In [1]:
import pandas as pd
import numpy as np
# A comprehensive Jupyter Notebook as a reference guide for survey preparation, 
# sample size determination, power analysis, statistical tests, and related concepts is a great idea. 
data = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  },
  "Maxpulse":{
    "0":130,
    "1":145,
    "2":135,
    "3":175,
    "4":148,
    "5":127
  },
  "Calories":{
    "0":409,
    "1":479,
    "2":340,
    "3":282,
    "4":406,
    "5":300
  }
}

df = pd.DataFrame(data)

## Table of Contents

In [2]:

# Introduction to Survey Research and Statistical Analysis

# Overview of Survey Research
    # Importance of Sample Size and Power Analysis
    # Types of Statistical Tests
    # Sample Size Determination

# Basics of Sample Size
    # Calculating Sample Size for Different Scenarios
    # Sample Size for Infinite and Finite Populations
    # Power Analysis

# Understanding Statistical Power
    # Power Analysis for Different Statistical Tests
    # Example Power Calculations
    # Finite Population Correction (FPC)

# Importance of FPC
    # Applying FPC to Adjust Sample Size
    # Example Calculations with FPC
    # Statistical Tests and Their Application

# Overview of Common Statistical Tests
    # Hypothesis Testing: Null and Alternative Hypotheses
    # Chi-Square Test
    # T-Tests (One-sample, Independent, and Paired)
    # ANOVA (Analysis of Variance)
    # Non-Parametric Tests
    # Directionalities in Statistical Tests

# Understanding Test Directions: One-tailed vs. Two-tailed Tests
    # Examples and Scenarios for Choosing the Test Direction
    # Interpreting Results

# Understanding p-values and Statistical Significance
    # Practical vs. Statistical Significance
    # Reporting and Communicating Results
    # Practical Guide to Conducting Surveys

# Designing Effective Surveys
    # Common Pitfalls and How to Avoid Them
    # Ensuring Validity and Reliability
    # Python Code Examples

# Sample Size Calculation
    # Power Analysis
    # Statistical Tests (Chi-Square, T-Tests, ANOVA, etc.)
    # Adjusting Sample Size with FPC
    # Additional Resources

# Recommended Reading and References
    # Useful Python Libraries for Statistical Analysis


## Section 1: Introduction to Survey Research and Statistical Analysis

In [3]:
# Overview of Survey Research

# Survey research involves collecting data from a predefined group to gather insights, measure opinions, or identify patterns. Key considerations include:

# Purpose of the Survey: Define what you aim to achieve.
# Target Population: Identify the group from which you’ll collect data.
# Sample Size: Determine how many participants you need to make your findings statistically significant.
# Importance of Sample Size and Power Analysis

# Sample Size: Ensures the collected data is representative of the population.
# Power Analysis: Determines the likelihood that your study will detect an effect if there is one.
# Types of Statistical Tests

# Chi-Square Test: Used for categorical data to assess how likely it is that an observed distribution is due to chance.
# T-Tests: Compare means between two groups.
# ANOVA: Compares means among three or more groups.
# Non-Parametric Tests: Used when data doesn’t fit normal distribution assumptions.
# Section 2: Sample Size Determination
# Basics of Sample Size

# Sample size refers to the number of observations or participants needed to ensure reliable and valid results. Factors influencing sample size include:

# Desired Confidence Level: Commonly 95% or 99%.
# Margin of Error: Acceptable range of error in results.
# Population Variability: How diverse or varied the population is.


### Calculating Sample Size for Different Scenarios

#### For Proportions:

In [4]:
# from statsmodels.stats.proportion import proportion_effectsize, samplesize_proportions_2indep
# # statsmodels.stats.proportion.test_proportions_2indep
# # Example: Comparing two proportions (50% and 60%)
# effect_size = proportion_effectsize(0.5, 0.6)  # Cohen's h
# alpha = 0.05
# power = 0.8

# sample_size = samplesize_proportions_2indep(
#     effect_size=effect_size, alpha=alpha, power=power, ratio=1
# )
# print(f"Sample size needed per group: {sample_size:.2f}")


#### For Means:

In [5]:
from statsmodels.stats.power import tt_ind_solve_power

# Example: Comparing means with small effect size (Cohen's d = 0.2)
effect_size = 0.2
alpha = 0.05
power = 0.8

sample_size = tt_ind_solve_power(effect_size, alpha=alpha, power=power, ratio=1)
print(f"Sample size needed per group: {sample_size:.2f}")


Sample size needed per group: 393.41


### Sample Size for Infinite and Finite Populations

#### Infinite Population:

In [6]:
from statsmodels.stats.power import zt_ind_solve_power

# Example: Two independent sample z-test for proportions
effect_size = 0.3  # Cohen's h for small to medium effect
alpha = 0.05
power = 0.8

sample_size = zt_ind_solve_power(effect_size, alpha=alpha, power=power)
print(f"Sample size needed per group (infinite population): {sample_size:.2f}")


Sample size needed per group (infinite population): 174.42


#### Finite Population: Adjust the infinite population sample size using the FPC.

In [7]:
# Finite Population Size
N = 3000

# Initial sample size from infinite population calculation
n0 = 100  # Example value

# Apply finite population correction (FPC)
def adjusted_sample_size(n0, N):
    return n0 / (1 + ((n0 - 1) / N))

n_prime = adjusted_sample_size(n0, N)
print(f"Adjusted sample size for finite population of {N}: {n_prime:.2f}")


Adjusted sample size for finite population of 3000: 96.81


## Section 3: Power Analysis

In [8]:
# Understanding Statistical Power

# Statistical power is the probability of detecting a true effect when it exists. It is influenced by:

# Sample Size: Larger samples increase power.
# Effect Size: Larger effects are easier to detect.
# Significance Level (α): Higher significance levels reduce power.
# Variability in Data: Less variability increases power.
# Power Analysis for Different Statistical Tests

### Power Analysis for Different Statistical Tests 

#### Chi-Square Test:

In [9]:
from statsmodels.stats.power import GofChisquarePower

# Parameters
effect_size = 0.3  # Cohen's w
alpha = 0.05
power = 0.8

power_analysis = GofChisquarePower()
sample_size = power_analysis.solve_power(effect_size, power=power, alpha=alpha)
print(f"Sample size needed for chi-square test: {sample_size:.2f}")


Sample size needed for chi-square test: 87.21


#### T-Tests:

In [10]:
from statsmodels.stats.power import TTestIndPower

# Parameters
effect_size = 0.5  # Medium effect size (Cohen's d)
alpha = 0.05
power = 0.8

power_analysis = TTestIndPower()
sample_size = power_analysis.solve_power(effect_size, power=power, alpha=alpha)
print(f"Sample size needed for t-test: {sample_size:.2f}")


Sample size needed for t-test: 63.77


#### ANOVA:

In [11]:
from statsmodels.stats.power import FTestAnovaPower

# Parameters
effect_size = 0.25  # Small to medium effect size
alpha = 0.05
power = 0.8
groups = 3  # Number of groups

power_analysis = FTestAnovaPower()
sample_size = power_analysis.solve_power(effect_size, power=power, alpha=alpha, k_groups=groups)
print(f"Sample size needed per group for ANOVA: {sample_size:.2f}")


Sample size needed per group for ANOVA: 157.19


## Section 4: Finite Population Correction (FPC)

In [12]:
# Importance of FPC

# FPC adjusts the sample size for the finite population. 
# This is important when the population is small and the initial sample size is a significant fraction of the total population.

#### Applying FPC to Adjust Sample Size

In [13]:
# Use FPC to refine the sample size from the power analysis.

# Example with initial sample size and finite population size
initial_sample_size = 120  # From previous power analysis
population_size = 3000

def adjusted_sample_size(n0, N):
    return n0 / (1 + ((n0 - 1) / N))

adjusted_size = adjusted_sample_size(initial_sample_size, population_size)
print(f"Adjusted sample size for finite population: {adjusted_size:.2f}")


Adjusted sample size for finite population: 115.42


## Section 5: Statistical Tests and Their Application

In [14]:
# Overview of Common Statistical Tests

# Explain different tests, when to use them, and their assumptions.

# Chi-Square Test: For categorical data.
# T-Tests: For comparing means.
# ANOVA: For comparing means among multiple groups.
# Non-Parametric Tests: When data doesn’t meet parametric test assumptions.

In [15]:
# Define how to formulate hypotheses for different tests.


# Example hypothesis for Chi-Square Test
# H0: There is no difference in willingness to collaborate between groups.
# H1: There is a difference in willingness to collaborate between groups.

#### Chi-Square Test

In [16]:
from scipy.stats import chi2_contingency
import numpy as np

# Example contingency table
contingency_table = np.array([[82, 35], [27, 16]])

# Perform Chi-Square Test
chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2_stat}, p-value: {p_value}")


Chi-square statistic: 0.4712298410763503, p-value: 0.4924219276038485


#### T-Tests (One-sample, Independent, and Paired)

In [17]:
from scipy.stats import ttest_ind, ttest_rel

# Example data
group1 = np.random.normal(50, 10, size=30)
group2 = np.random.normal(55, 10, size=30)

# Independent t-test
t_stat, p_value = ttest_ind(group1, group2)
print(f"Independent t-test: t-statistic = {t_stat}, p-value = {p_value}")


Independent t-test: t-statistic = -2.1356194508957467, p-value = 0.03694423309900591


#### ANOVA (Analysis of Variance)

In [18]:
from scipy.stats import f_oneway

# Example data
group1 = np.random.normal(50, 10, size=30)
group2 = np.random.normal(55, 10, size=30)
group3 = np.random.normal(60, 10, size=30)

# ANOVA
f_stat, p_value = f_oneway(group1, group2, group3)
print(f"ANOVA: F-statistic = {f_stat}, p-value = {p_value}")


ANOVA: F-statistic = 14.883539982840723, p-value = 2.758349085704979e-06


### Non-Parametric Tests

In [19]:
# Provide examples for tests like the Mann-Whitney U test, 
# Wilcoxon signed-rank test, and Kruskal-Wallis test.

## Section 6: Directionalities in Statistical Tests

### Understanding Test Directions: One-tailed vs. Two-tailed Tests

In [20]:
# When conducting statistical tests, it's crucial to decide whether to use a one-tailed or two-tailed test. This decision is based on the nature of your hypothesis.

# One-tailed Test: Used when you have a directional hypothesis (e.g., Group A is greater than Group B).
# Two-tailed Test: Used when you are testing for any difference without a specified direction (e.g., Group A is different from Group B).



# Examples and Scenarios for Choosing the Test Direction
# Scenario 1: Testing if a new drug is more effective than an existing drug.

# Hypothesis: The new drug is more effective (one-tailed test).
# Decision: Use a one-tailed test if the expectation is strictly directional.
# Scenario 2: Investigating if there is any difference in test scores between two teaching methods.

# Hypothesis: There is a difference in test scores (two-tailed test).
# Decision: Use a two-tailed test since any difference is of interest, not just a specific direction.

In [21]:
import numpy as np
from scipy.stats import ttest_ind

# Example data: Two groups with normally distributed values
group1 = np.random.normal(50, 10, size=30)
group2 = np.random.normal(55, 10, size=30)

# Two-tailed t-test
t_stat, p_value_two_tailed = ttest_ind(group1, group2)
print(f"Two-tailed t-test: t-statistic = {t_stat}, p-value = {p_value_two_tailed}")

# One-tailed t-test
# Since we're interested in testing if group1 mean is less than group2 mean
# We divide the two-tailed p-value by 2 to get the one-tailed p-value
p_value_one_tailed = p_value_two_tailed / 2 if t_stat < 0 else (1 - p_value_two_tailed / 2)
print(f"One-tailed t-test: t-statistic = {t_stat}, p-value = {p_value_one_tailed}")



Two-tailed t-test: t-statistic = -2.6786530715335637, p-value = 0.009601493286261961
One-tailed t-test: t-statistic = -2.6786530715335637, p-value = 0.004800746643130981


## Section 7: Interpreting Results

### Understanding p-values and Statistical Significance

In [22]:
# Explain how to interpret p-values and the concept of statistical significance.

# p-value: The probability of observing your data, or something more extreme, under the assumption that the null hypothesis is true.

# Statistical Significance: Typically, if the p-value is less than a predefined threshold (commonly 0.05), 
#     we reject the null hypothesis. This indicates that the observed effect is statistically significant.

# Statistical Significance: Typically, p < 0.05 indicates significant results.
# Practical vs. Statistical Significance

# Distinguish between statistical significance and practical (or clinical) significance.

# Reporting and Communicating Results

# Guide on how to report statistical results clearly and effectively.

### Intepretating p-values and Statistical Significance

In [23]:
# Assuming we have the results of a statistical test
p_value = 0.03  # Example p-value

# Define significance level
alpha = 0.05

# Interpretation
if p_value < alpha:
    print(f"p-value = {p_value}: Statistically significant (reject H0)")
else:
    print(f"p-value = {p_value}: Not statistically significant (fail to reject H0)")


p-value = 0.03: Statistically significant (reject H0)


### Practical vs. Statistical Significance

In [24]:
# Statistical Significance: 
#     A result is statistically significant if it is unlikely to have occurred by chance, given the null hypothesis.
    
# Practical Significance: 
#     Refers to the real-world relevance or importance of the result. 
#     A statistically significant result may not always be practically significant.

### Reporting and Communicating Results

In [25]:
# When reporting statistical results, include the following:

# Test Used: Specify the statistical test performed.
# Test Statistic: Report the value of the test statistic.
# p-value: Include the p-value and interpret it.
# Effect Size: Report the effect size to understand the magnitude of the difference or relationship.
# Context: Explain the practical implications of the results.
# Example Report:

# "In an independent t-test comparing the means of Group A and Group B, 
# the t-statistic was 2.35 with a p-value of 0.02. Since the p-value is less than 0.05, 
# we reject the null hypothesis and conclude that there is a statistically significant difference between the two groups. 
# The effect size was medium (Cohen's d = 0.6), indicating a meaningful difference in the context of our study."



## Section 8: Practical Guide to Conducting Surveys

#### Designing Effective Surveys

In [26]:
# Surveys are tools for collecting data from respondents. 
# Effective survey design is crucial for obtaining reliable and valid data.


#### Key Tips for Survey Design:

In [27]:
# Define Clear Objectives: Know what information you need and why.
# Choose the Right Question Types:
# Open-ended questions: Allow for detailed responses.
# Closed-ended questions: Provide specific response options (e.g., multiple choice, Likert scale).
# Avoid Leading Questions: Ensure questions are neutral and do not bias the respondent.
# Pilot Test the Survey: Test the survey with a small group to identify any issues.

#### Example Questions:

In [28]:

# **Question 1**: How satisfied are you with our service? (Likert Scale)
# - Very Dissatisfied
# - Dissatisfied
# - Neutral
# - Satisfied
# - Very Satisfied

# **Question 2**: What improvements would you suggest for our service? (Open-ended)
# Common Pitfalls and How to Avoid Them
# Sampling Bias: Ensure your sample is representative of the population.
# Response Bias: Questions should be worded to avoid influencing responses.
# Survey Length: Keep surveys concise to maintain respondent engagement.



### Ensuring Validity and Reliability

In [29]:
# Ensuring Validity and Reliability
# Validity: Ensure your survey measures what it is intended to measure.
# Reliability: Ensure your survey produces consistent results over repeated administrations.
# Reliability: Reliability, assessed through Cronbach's Alpha, evaluates the internal consistency of survey items,
# ensuring that they reliably measure the intended constructs. Both surveys conducted with daladala operators and 
# commuters demonstrated Cronbach's Alpha values greater than 0.7, which signifies acceptable reliability. 
# This means the responses are stable and reproducible, reflecting consistent attitudes and experiences 
# rather than random variations. Reliable data is crucial for the integrity of the study, as it ensures that the insights 
# and trends identified are genuine and not influenced by inconsistent responses. This consistency is foundational 
# for making credible and dependable conclusions from the data.
# Validity: Validity, particularly through Principal Component Analysis (PCA), ensures that the survey measures what it is intended to measure by capturing the core constructs effectively. Our PCA results indicated that three components could explain 95% of the variance in the responses for both groups. This implies that the survey data comprehensively covers the key aspects of the stakeholders' perspectives. For instance, these components likely encapsulate critical factors such as operational impacts, future demand, and integration potential for daladala operators, and travel time, cost, and convenience for commuters. High validity ensures that our survey questions are meaningful and accurately represent the underlying issues, providing a robust basis for the subsequent analysis.


In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Assuming survey data is in a DataFrame df
# Standardizing the data
scaler = StandardScaler()
data_standardized = scaler.fit_transform(df.select_dtypes(include=[float, int]))

# PCA for data validity
pca = PCA(n_components=0.95)  # Retain 95% of the variance
pca_fit = pca.fit(data_standardized)

explained_variance = pca.explained_variance_ratio_
print(f"Number of components to retain 95% variance: {pca.n_components_}")
print(f"Explained variance by each component: {explained_variance}")


Number of components to retain 95% variance: 3
Explained variance by each component: [0.51677906 0.42996663 0.05097807]


## Section 9: Python Code Examples

#### Sample Size Calculation

In [31]:
from statsmodels.stats.power import TTestIndPower

# Parameters for sample size calculation
effect_size = 0.5  # Medium effect size
alpha = 0.05       # Significance level
power = 0.8        # Desired power

# Calculate the required sample size
sample_size = TTestIndPower().solve_power(effect_size, alpha=alpha, power=power)
print(f"Required sample size per group: {sample_size}")

Required sample size per group: 63.765611775409525


#### Power Analysis

In [32]:

from statsmodels.stats.power import TTestIndPower

# Parameters for power analysis
effect_size = 0.5  # Medium effect size
alpha = 0.05       # Significance level
n = 30             # Sample size per group

# Calculate the power of the test
power = TTestIndPower().power(effect_size, nobs1=n, alpha=alpha)
print(f"Power of the test: {power}")

Power of the test: 0.4778965200281735


#### Chi-Square Test

In [33]:
from scipy.stats import chi2_contingency

# Example contingency table
contingency_table = np.array([[82, 35], [27, 16]])

# Perform Chi-Square Test
chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
print(f"Chi-square statistic: {chi2_stat}, p-value: {p_value}")


Chi-square statistic: 0.4712298410763503, p-value: 0.4924219276038485


#### Adjusting Sample Size with FPC

In [34]:
def adjusted_sample_size(n, N):
    # n = initial sample size
    # N = population size
    return n / (1 + ((n - 1) / N))

# Example
initial_sample_size = 100
population_size = 3000

adjusted_n = adjusted_sample_size(initial_sample_size, population_size)
print(f"Adjusted sample size: {adjusted_n}")


Adjusted sample size: 96.8054211035818


## Section 10: Additional Resources

#### Recommended Reading and References

In [35]:
# Books:
# "Survey Research Methods" by Floyd J. Fowler
# "Statistical Methods for the Social Sciences" by Alan Agresti and Barbara Finlay

# Online Resources:
# Khan Academy Statistics and Probability
# Coursera Statistical Learning

#### Useful Python Libraries for Statistical Analysis

In [36]:

# numpy: Essential for numerical operations and array manipulations.

# Numpy Documentation
# pandas: Excellent for data manipulation and analysis.

# Pandas Documentation
# scipy: Comprehensive library for statistical tests and scientific computations.

# Scipy Documentation
# statsmodels: Great for conducting statistical modeling and hypothesis testing.

# Statsmodels Documentation
# scikit-learn: Useful for machine learning and data preprocessing.

# Scikit-learn Documentation