## Anova

In [None]:
#ANOVA tells you if the dependent variable changes according to the level of the independent variable.
#For example: Your independent variable is social media use, 
# and you assign groups to low, medium, 
# and high levels of social media use to find out if there is a difference in hours of sleep per night.

In [3]:
from scipy.stats import f_oneway
group1 = [85, 86, 88, 75, 78, 94, 98, 79, 71, 80]
group2 = [91, 92, 93, 85, 87, 84, 82, 88, 95, 96]
group3 = [79, 78, 88, 94, 92, 85, 83, 85, 82, 81]

# Perform one way anova
f_oneway(group1,group2,group3)
# pvalue is greater than fvalue,we fail to reject the null hypothsis

F_onewayResult(statistic=2.3575322551335636, pvalue=0.11384795345837218)

## Ancova

In [None]:
# Ancova removes any effect of covariates, which are variables you don't want to study.
# For-example For example, you might want to study how different levels of teaching skills affect student performance in math; 
# It may not be possible to randomly


In [10]:
import pingouin import ancova
import numpy as np
import pandas as pd
df = pd.DataFrame({'technique': np.repeat(['A', 'B', 'C'], 5),
                   'current_grade': [67, 88, 75, 77, 85,
                                     92, 69, 77, 74, 88, 
                                     96, 91, 88, 82, 80],
                   'exam_score': [77, 89, 72, 74, 69,
                                  78, 88, 93, 94, 90,
                                  85, 81, 83, 88, 79]})
                                  
                                  

SyntaxError: unmatched '}' (68915666.py, line 10)

In [12]:
import numpy as np
from pingouin import ancova
import pandas as pd
df = pd.DataFrame({'technique': np.repeat(['A', 'B', 'C'], 5),
                   'current_grade': [67, 88, 75, 77, 85,
                                     92, 69, 77, 74, 88, 
                                     96, 91, 88, 82, 80],
                   'exam_score': [77, 89, 72, 74, 69,
                                  78, 88, 93, 94, 90,
                                  85, 81, 83, 88, 79]})
                                  #perform ANCOVA
ancova(data=df, dv='exam_score', covar='current_grade', between='technique')
# p_value 0.03 is less than 0.05,we can reject the null hypothesis.


Unnamed: 0,Source,SS,DF,F,p-unc,np2
0,technique,390.57513,2,4.809973,0.031556,0.466536
1,current_grade,4.193886,1,0.103296,0.753934,0.009303
2,Residual,446.606114,11,,,


## Manova

In [None]:
# Manova stands for multivariate analysis of variance. 
# It tests if there is a significant difference between the means of multiple groups.
#  The dependent variables are continuous and the independent variables are categorical.

In [25]:
import pandas as pd
import boto

# import the csv file directly from an s3 bucket
data = pd.read_csv('s3://articledatas3/manova_data_plant_growth.csv')
data


Unnamed: 0,Treatment,Height,Width,Weight
0,1,15.8,3.9,29.4
1,1,15.1,3.8,29.9
2,1,14.8,4.1,30.2
3,1,14.4,4.7,30.1
4,1,15.1,3.7,30.9
5,2,15.7,4.8,31.2
6,2,15.9,4.3,31.4
7,2,15.4,4.5,31.8
8,2,16.7,5.4,32.4
9,2,16.9,5.8,32.6


In [26]:
from statsmodels.multivariate.manova import MANOVA

# add the intercept
data['Treatment'] = ['trt ' + str(x) for x in data['Treatment']]

# fit manova
manova_result = MANOVA.from_formula('Height + Width + Weight ~ Treatment', data)
print(manova_result.mv_test())

#The p-values are shown in the right column and are all inferior to 0.05, 
#which confirms that treatment has an impact on plant growth.

                    Multivariate linear model
                                                                 
-----------------------------------------------------------------
       Intercept          Value   Num DF  Den DF  F Value  Pr > F
-----------------------------------------------------------------
          Wilks' lambda    0.0005 3.0000 10.0000 6374.4483 0.0000
         Pillai's trace    0.9995 3.0000 10.0000 6374.4483 0.0000
 Hotelling-Lawley trace 1912.3345 3.0000 10.0000 6374.4483 0.0000
    Roy's greatest root 1912.3345 3.0000 10.0000 6374.4483 0.0000
-----------------------------------------------------------------
                                                                 
------------------------------------------------------------------
        Treatment         Value   Num DF   Den DF  F Value  Pr > F
------------------------------------------------------------------
           Wilks' lambda  0.0922  6.0000  20.0000   7.6444  0.0002
          Pillai's trace  

## Variance

In [None]:
#The variance is a measure of variability. It is calculated by taking the average of squared deviations from the mean. 
#Variance tells you the degree of spread in your data set.

In [27]:
import numpy as np

dataset= [21, 11, 19, 18, 29, 46, 20]

variance= np.var(dataset)

print(variance)

108.81632653061224


## Co-Variance

In [None]:
#Covariance is a measure of how much two random variables vary together.
# It's similar to variance, but where variance tells you how a single variable varies, co variance tells you how two variables vary together.
# Types
#Positive Covariance.
#Negative Covariance.

In [28]:
import numpy as np

math = [84, 82, 81, 89, 73, 94, 92, 70, 88, 95]
english = [85, 82, 72, 77, 75, 89, 95, 84, 77, 94]
bio = [97, 94, 93, 95, 88, 82, 78, 84, 69, 78]

data = np.array([math, english, bio])
np.cov(data,bias=True)
#The variance of the math scores is 64.96
#The variance of the english scores is 56.4
#The variance of the bio scores is 75.56
#The covariance between the math and english is 33.2
#The covariance between the math and bio score is -24.44
#The covariance between the english and bio score is -24.1


array([[ 64.96,  33.2 , -24.44],
       [ 33.2 ,  56.4 , -24.1 ],
       [-24.44, -24.1 ,  75.56]])

## Co-variate

In [None]:
#any of two or more random variables exhibiting correlated variation
#Variables that affect a response variable, but are not of interest in a study. 

## Standard devition

In [None]:
#The standard deviation measures the spread of the data about the mean value. 
# It is useful in comparing sets of data which may have the same mean but a different range.

In [30]:
import statistics
# Python code to demonstrate stdev() function
 
 
# creating a simple data - set
sample = [1, 2, 3, 4, 5]
 
# Prints standard deviation
# xbar is set to default value of 1
print("Standard Deviation of sample is % s "
                % (statistics.stdev(sample)))

Standard Deviation of sample is 1.5811388300841898 


## Standard error of the mean vs Standard deviation


In [None]:
#The standard deviation (SD) measures the amount of variability, or dispersion,
# from the individual data values to the mean,

In [None]:
#the standard error of the mean (SEM) measures how far the sample mean 
# (average) of the data is likely to be from the true population mean. 
# The SEM is always smaller than the SD.

## Confidence Interval

In [None]:
#a range of values so defined that there is a specified probability 
# that the value of a parameter lies within it.
#Formula
#CI = \bar{x} \pm z \frac{s}{\sqrt{n}}
#CI	=	confidence interval
#\bar{x}	=	sample mean
#z	=	confidence level value
#{s}	=	sample standard deviation
#{n}	=	sample size


In [33]:
import scipy.stats as st
import numpy as np

data = [12, 12, 13, 13, 15, 16, 17, 22, 23, 25, 26, 27, 28, 28, 29]

#create 95% confidence interval for population mean weight
st.t.interval(alpha=0.95, df=len(data)-1, loc=np.mean(data), scale=st.sem(data)) 

(16.75776979778498, 24.042230202215016)

## Alpha

In [None]:
# Alpha is a threshold value used to judge whether
# a test statistic is statistically significant. 
# it can range from 0 to 1.

In [1]:
import pandas as pd
import pingouin as pg
df = pd.DataFrame({'Q1': [1, 2, 2, 3, 2, 2, 3, 3, 2, 3],
                   'Q2': [1, 1, 1, 2, 3, 3, 2, 3, 3, 3],
                   'Q3': [1, 1, 2, 1, 2, 3, 3, 3, 2, 3]})
                   pg.cronbach_alpha(data=df)

IndentationError: unexpected indent (Temp/ipykernel_11364/4025897672.py, line 6)