In [1]:
# 1. An experiment was conducted to compare the effectiveness of ammonium chloride and urea on the grain yield of paddy
#    and data of the same is collected. X1 values are the ammonium chloride levels and X2 column denote urea.
#    What can you conclude about the effectiveness of the two?

In [2]:
# Hypothesis
# H0: The effect of ammonium chloride and urea on grain yield of paddy are equal i.e., μ1 = μ2
# H1: The effect of ammonium chloride and urea on grain yield of paddy is not equal i.e., μ1 ≠ μ2

In [3]:
import pandas as pd
import numpy as np

paddy_df = pd.read_csv("Paddy_yield_data.csv")
paddy_df.head()

Unnamed: 0,X1,X2
0,13.4,12.0
1,10.9,11.7
2,11.2,10.7
3,11.8,11.2
4,14.0,14.8


In [4]:
from scipy.stats import ttest_ind
tstat, pval = ttest_ind(paddy_df['X1'], paddy_df['X2'])
print("Test statistic: ", tstat)
print("p-value: ", pval)

Test statistic:  0.1846496543760765
p-value:  0.8551954147800473


In [5]:
alpha = 0.05
if pval <= alpha:
    print("Reject H0")
else:
    print("Accept H0")

Accept H0


In [6]:
# Conclusion: The effect of ammonium chloride and urea on grain yield of paddy are equal i.e., μ1 = μ2

In [7]:
#################################################################

In [8]:
# 2. Suppose the IQ in a certain population is normally distributed with a mean of μ = 100 and standard deviation of σ = 15.
#    A researcher wants to know if a new drug affects IQ levels, so he recruits 20 patients to try it 
#    and records their IQ levels. How to determine if the new drug causes a significant difference in IQ levels?

In [9]:
# H0: The new drug does not affect IQ, i.e., μ = 100
# H1: The new drug significantly affects IQ, i.e., μ ≠ 100

In [10]:
iq_df = pd.read_csv("IQ_data.csv")
iq_df.head()

Unnamed: 0,IQ Level
0,88
1,92
2,94
3,94
4,96


In [11]:
from statsmodels.stats.weightstats import ztest
zstat, pval = ztest(iq_df['IQ Level'], value=100)
print("Test statistic: ", zstat)
print("p-value: ", pval)

Test statistic:  1.5976240527147705
p-value:  0.1101266701438426


In [12]:
alpha = 0.05
if pval <= alpha:
    print("Reject H0")
else:
    print("Accept H0")

Accept H0


In [13]:
# Conclusion: The new drug does not significantly affect IQ, i.e., μ = 100

In [14]:
#############################################################

In [15]:
# 3. Suppose the IQ levels among individuals in two different cities are known to be normally distributed 
#    with known standard deviations. A researcher wants to know if the mean IQ level between individuals in city A and city B
#    are different, so she selects a simple random sample of 20 individuals from each city and records their IQ levels.
#    What can you conclude about the same?

In [16]:
# H0: Mean IQ levels in both cities are the same
# H1: Mean IQ levels in both cities are not the same

In [17]:
iq_city = pd.read_csv("IQ_city_data.csv")
iq_city.head()

Unnamed: 0,IQ_CITYA,IQ_CITYB
0,82,90
1,84,91
2,85,91
3,89,91
4,91,95


In [18]:
zstat, pval = ztest(iq_city['IQ_CITYA'], iq_city['IQ_CITYB'], value=0)
print("Test statistic: ", zstat)
print("p-value: ", pval)

Test statistic:  -1.9953236073282115
p-value:  0.046007596761332065


In [19]:
alpha = 0.05
if pval <= alpha:
    print("Reject H0")
else:
    print("Accept H0")

Reject H0


In [20]:
# Conclusion: Mean IQ levels in both cities are significantly different

In [21]:
# 4. An experiment was carried out to evaluate how genotype and number of years affect yield of a certain crop and data is
#    collected on the same. What do you infer about the effect of genotype and years on the yield independently as well as
#    collectively?

In [22]:
gen_df = pd.read_csv("Genotype_yield_data.csv")
gen_df.head()

Unnamed: 0,Genotype,1_year,2_year,3_year
0,A,1.53,4.08,6.69
1,A,1.83,3.84,5.97
2,A,1.38,3.96,6.33
3,B,3.6,5.7,8.55
4,B,2.94,5.07,7.95


In [23]:
# Reshape the dataframe into stacked format suitable for statsmodels package 
gen_df_melt = pd.melt(gen_df, id_vars=['Genotype'], value_vars=['1_year', '2_year', '3_year'])
gen_df_melt.head(10)

Unnamed: 0,Genotype,variable,value
0,A,1_year,1.53
1,A,1_year,1.83
2,A,1_year,1.38
3,B,1_year,3.6
4,B,1_year,2.94
5,B,1_year,4.02
6,C,1_year,3.99
7,C,1_year,3.3
8,C,1_year,4.41
9,D,1_year,3.75


In [24]:
# Renaming the cols
gen_df_melt.columns = ['Genotype', 'Years', 'Value']

In [25]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
model = ols('Value ~ C(Genotype) + C(Years) + C(Genotype):C(Years)', data=gen_df_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Genotype),58.551733,5.0,32.748581,1.931655e-12
C(Years),278.925633,2.0,390.014868,4.006243e-25
C(Genotype):C(Years),17.122967,10.0,4.788525,0.0002230094
Residual,12.873,36.0,,


In [26]:
# Conclusion: 
# The p value obtained from ANOVA analysis for genotype, years, and interaction are statistically significant (p<0.05)
# We conclude that type of genotype significantly affects the yield outcome, time (years)
# significantly affects the yield outcome, and interaction of both genotype and time (years) 
# significantly affects the yield outcome.

In [27]:
#######################################################################

In [28]:
# 5. A survey was conducted on a small sample of population to analyse the relationship between gender and preference for a 
#    particular phone brand. Are gender and brand preference associated with each other?

In [29]:
# H0: Gender and brand preference are dependent
# H1: Gender and brand preference are independent

In [30]:
brand_df = pd.read_csv("Phone_brand_data.csv")
brand_df

Unnamed: 0,Gender,iPhone,Samsung,Redmi
0,Male,102,115,123
1,Female,54,89,47


In [31]:
brand_df.set_index(keys='Gender', inplace=True)

In [32]:
brand_df

Unnamed: 0_level_0,iPhone,Samsung,Redmi
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,102,115,123
Female,54,89,47


In [33]:
from scipy import stats
chi_stat,pval,dof,expected_values = stats.chi2_contingency(brand_df)

print('Chi-square statistic:',chi_stat)
print('p-value:',pval)
print('Degree of Freedom: ',dof)
print('Expected values: \n', expected_values)

Chi-square statistic: 10.443084371162136
p-value: 0.005398996448349567
Degree of Freedom:  2
Expected values: 
 [[100.0754717  130.86792453 109.05660377]
 [ 55.9245283   73.13207547  60.94339623]]


In [34]:
# Critical value
from scipy.stats import chi2
alpha = 0.05
critical_value = chi2.ppf(q=1-alpha, df=dof)
print('critical_value:', critical_value)

critical_value: 5.991464547107979


In [35]:
# Inference

if chi_stat>=critical_value:
    print("Reject H0, There is a relationship between gender and brand preference")
else:
    print("Accept H0, There is no relationship between gender and brand preference")
    
if pval<=alpha:
    print("Reject H0, There is a relationship between gender and brand preference")
else:
    print("Accept H0, There is no relationship between gender and brand preference")

Reject H0, There is a relationship between gender and brand preference
Reject H0, There is a relationship between gender and brand preference


In [36]:
## end of notebook