In [1]:
#Q1.

In [3]:
# Assumptions for using ANOVA:

#1. Normality of Sampling Distribution of mean
#2. Absence of Outliers
#3. Homogenity of variance
#4. Samples are independent and random

# Examples of Violations:

#1. Non-normality: Residuals not normally distributed.
#2. Dependence: Non-independent observations within groups.
#3. Outliers: Extreme values or outliers.
#4. Unequal Sample Sizes: Significantly different group sizes.
#5. Non-Additivity: Violation of additivity assumption.

In [4]:
#Q2.

# Three types of ANOVA:

#1. One-way ANOVA: For comparing means of three or more independent groups.
#2. Two-way ANOVA: When you have two independent factors to analyze.
#3. Repeated Measures ANOVA: When measuring the same subjects over multiple time points.

# Situations for Each:

#1. One-way ANOVA: Comparing class performance in multiple classes
#2. Two-way ANOVA: Analyzing the combined impact of gender and age on student test scores.
#3. Repeated Measures ANOVA: Assessing how an exercise program affects weight over time with the same participants.

In [5]:
#Q3.

# Partitioning of Variance in ANOVA:

# It breaks down total variance into components.
# Components include between-group, within-group, and error variances.
# It helps identify sources of variation in the data.
# Assesses model fit and its explanatory power.
# Essential for making inferences and drawing conclusions.
# Useful for optimizing processes in experimental design.

In [6]:
#Q4.

import numpy as np

In [7]:
group1 = np.array([23, 25, 27, 24, 22])
group2 = np.array([30, 31, 32, 29, 28])
group3 = np.array([35, 36, 34, 33, 37])

In [8]:
# Combine all data points into one array
all_data = np.concatenate((group1, group2, group3))
print(all_data)

[23 25 27 24 22 30 31 32 29 28 35 36 34 33 37]


In [10]:
# Calculate the Grand Mean
GM = np.mean(all_data)
GM

29.733333333333334

In [11]:
SST = np.sum((all_data - GM)**2)

In [12]:
group1_mean = np.mean(group1)
group2_mean = np.mean(group2)
group3_mean = np.mean(group3)

In [13]:
SSE1 = np.sum((group1 - group1_mean)**2)
SSE2 = np.sum((group2 - group2_mean)**2)
SSE3 = np.sum((group3 - group3_mean)**2)

In [14]:
SSE = SSE1 + SSE2 + SSE3

In [15]:
SSR = SST - SSE

In [16]:
print(f"SST: {SST}")
print(f"SSE: {SSE}")
print(f"SSR: {SSR}")

SST: 326.93333333333334
SSE: 34.8
SSR: 292.1333333333333


In [26]:
#Q5.

# In a two-way ANOVA, we can calculate the main effects as follows:

# Calculate the main effect for each independent variable separately.
# For each factor, compute the difference in means between the levels of that factor.
# We can use libraries like 'scipy.stats' or 'pingouin' to perform test for each factor.

# Interaction Effects:
# Calculate the interaction effects between two or more factors.
# Perform the two-way ANOVA using libraries like 'statsmodels' or 'pingouin'.
# Examine the interaction term (e.g., "FactorA:Factor B") in the ANOVA table.
# A significant interaction term indicates that the effect of one factor depends on the level of another factor.

In [18]:
#Q6.

# F-statistic: The F-statistic of 5.23 indicates that there is some degree of variation between the group means.

# P-value: The p-value of 0.02 is less than the significance level (i.e. making an assumption of 0.05), suggesting an evidence to reject the null hypothesis.

# Interpretation about the results:

# There is evidence to suggest that at least one group mean is different from the others.

# All in all, ANOVA results indicate that there are statistically significant differences among the groups, but further tests are needed to determine which specific groups differ from each other.

In [19]:
#Q7.

# Handling missing data in repeated meassures ANOVA:

#1. Importance: Addressing missing data is crucial to maintain data integrity and statistical power.
#2. Methods: Various methods exist, including complete case analysis, mean imputation, regression imputation, and multiple imputation.
#3. Choice: Method choice depends on data nature and missing data mechanism.
#4. Multiple Imputation: Often considered the gold standard due to its accuracy and ability to address uncertainty.
#5. Consequences: Using different methods can lead to bias, power loss, invalid inferences, and increased complexity.
#6. Transparency: Transparently report the chosen method in research for reproducibility and trustworthiness of results.

In [27]:
#Q8.

# Tukey's HSD: For comparing all pairs of group means; suitable for equal or unequal group sizes.
# Bonferroni Correction: Controls familywise erroe rate; useful for multiple comparisons.
# Duncan's Multiple Range Test: Identifies subsets of groups with no significant differences.
# Scheffé's Test: Compares all possible group combinations; robust for unequal group sizes.
# Holm-Bonferroni Method: Step-down procedure to control familywise error rate.
# Fisher's LSD: Compares two groups at a time; less stringent but can increase Type I errors.

In [50]:
#Q9.

import scipy.stats as stats
import numpy as np

In [22]:
np.random.seed(0)
sample_size = 50

In [23]:
diet_A = np.random.normal(loc=2.0, scale=0.5, size=sample_size)
diet_B = np.random.normal(loc=1.8, scale=0.6, size=sample_size)
diet_C = np.random.normal(loc=2.2, scale=0.7, size=sample_size)

In [25]:
f_statistic, p_value = stats.f_oneway(diet_A, diet_B, diet_C)

In [26]:
print(f"F-statistic: {f_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

F-statistic: 11.6230
P-value: 0.0000


In [27]:
significance_level = 0.05 # Assumption

In [28]:
if p_value < significance_level:
    print("There is a significant difference in mean weight loss between the diets.")
else:
    print("There is no significant difference in mean weight loss between the diets.")

There is a significant difference in mean weight loss between the diets.


In [51]:
# Q10.

In [30]:
pip install numpy pandas statsmodels

Note: you may need to restart the kernel to use updated packages.


In [31]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [32]:
np.random.seed(42)

In [45]:
n_employees = 90

data = {
    'Software': np.repeat(['A', 'B', 'C'], n_employees // 3),
    'Experience': np.random.choice(['Novice', 'Experienced'], size=n_employees),
    'Time': np.random.normal(loc=20, scale=5, size=n_employees)
}

df = pd.DataFrame(data)

In [46]:
df

Unnamed: 0,Software,Experience,Time
0,A,Novice,16.140905
1,A,Novice,28.306509
2,A,Novice,18.328765
3,A,Experienced,18.410765
4,A,Experienced,21.588835
...,...,...,...
85,C,Experienced,24.019065
86,C,Novice,23.069207
87,C,Novice,26.789404
88,C,Experienced,17.833483


In [48]:
# Perform two-way ANOVA test using the formula.

formula = 'Time ~ Software + Experience + Software:Experience'
model = ols(formula, data=df).fit()

In [49]:
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                          sum_sq    df         F    PR(>F)
Software                3.404554   2.0  0.066478  0.935733
Experience              0.689127   1.0  0.026912  0.870087
Software:Experience   206.939945   2.0  4.040737  0.021110
Residual             2150.963553  84.0       NaN       NaN


In [52]:
# Summary and interpretation of the results.
# The high p-value (0.935733 > 0.05) for Software factor indicates that it is not statistically significant. In other words, there is no strong evidence to suggest that the choice of software significantly affects completion time.

# The high p-value (0.870087 > 0.05) for Experience factor suggests that it is not statistically significant. In other words, there is no strong evidence to suggest that employee experience significantly affects completion time.

# The p-value (0.021110) is less than the typical significance level of 0.05, indicating that the interaction effect is statistically significant. This means that the combined effect of software choice and employee experience level significantly affects completion time.

In [16]:
#Q11.

In [17]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [18]:
# Assuming data for the control and experimental groups
control_scores = np.random.normal(85, 5, 50)
experimental_scores = np.random.normal(90, 5, 50)

In [19]:
data = pd.DataFrame({'Group':['Control'] * 50 + ['Experimental'] * 50,
                    'Score':np.concatenate([control_scores, experimental_scores])})

In [20]:
t_stat, p_value = stats.ttest_ind(control_scores, experimental_scores)

In [21]:
if p_value < 0.05:
    print("The two-sample t-test shows significant differences between the groups.")
    print(f"T-statistic: {t_stat:.2f}, p-value: {p_value:.4f}")
    
    posthoc = pairwise_tukeyhsd(data['Score'], data['Group'], alpha=0.05)
    
    print("\nPost-Hoc Tukey's HSD Test:")
    print(posthoc)
else:
    print("The two-sample t-test does not show significant differences between the groups.")

The two-sample t-test shows significant differences between the groups.
T-statistic: -4.24, p-value: 0.0000

Post-Hoc Tukey's HSD Test:
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1    group2    meandiff p-adj lower  upper  reject
--------------------------------------------------------
Control Experimental   4.2729   0.0 2.2751 6.2707   True
--------------------------------------------------------


In [4]:
#Q12

In [5]:
pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.3-py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pandas-flavor>=0.2.0
  Downloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting outdated
  Downloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Collecting xarray
  Downloading xarray-2023.9.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25ldone
[?25h  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7028 sha256=520a3d4f4ace9659b07b52

In [6]:
import pandas as pd
import pingouin as pg
from statsmodels.stats.multicomp import MultiComparison

In [19]:
data = {
    
    'Store': ['A'] * 30 + ['B'] * 30 + ['C'] * 30,
    'Sales': [
        # Sales data for Store A for 30 days
        100, 110, 95, 105, 115, 100, 105, 110, 98, 112,
        92, 116, 94, 102, 111, 98, 105, 109, 97, 110,
        85, 92, 88, 86, 89, 84, 91, 87, 88, 93,
        # Sales data for Store B for 30 days
        90, 85, 82, 88, 92, 87, 86, 95, 91, 84,
        98, 100, 94, 96, 99, 85, 90, 89, 92, 88,
        93, 97, 85, 89, 86, 91, 83, 87, 88, 85,
        # Sales data for Store C for 30 days
        75, 78, 82, 79, 81, 84, 76, 80, 85, 83,
        80, 82, 86, 79, 77, 78, 81, 85, 80, 79,
        84, 76, 82, 78, 85, 83, 77, 80, 81, 79,
    ],
    'Day':list(range(1,31)) * 3

}

In [20]:
df = pd.DataFrame(data)

In [21]:
df

Unnamed: 0,Store,Sales,Day
0,A,100,1
1,A,110,2
2,A,95,3
3,A,105,4
4,A,115,5
...,...,...,...
85,C,83,26
86,C,77,27
87,C,80,28
88,C,81,29


In [22]:
aov = pg.rm_anova(data=df, dv='Sales', within='Day', subject='Store')

In [23]:
print(aov)

  Source  ddof1  ddof2         F     p-unc       ng2       eps
0    Day     29     58  1.148441  0.320344  0.154275  0.051071


In [24]:
posthoc = MultiComparison(df['Sales'], df['Store'])
posthoc_results = posthoc.tukeyhsd()

In [25]:
print(posthoc_results)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     A      B  -9.0667   0.0 -13.0938  -5.0395   True
     A      C    -18.4   0.0 -22.4272 -14.3728   True
     B      C  -9.3333   0.0 -13.3605  -5.3062   True
-----------------------------------------------------


In [28]:
# End

In [30]:
# End

In [31]:
# End