In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('NoOutliersDataset.csv', index_col= [0])

numerical = ["CO2 Emissions", "AFF", "Government Effectiveness", "Individuals using the Internet", "Life Expectancy", "Renewable Energy Consumption"]
categorical = ["Country", "Income Group", "Region"]
df

Unnamed: 0,Country,Income Group,Region,CO2 Emissions,AFF,Government Effectiveness,Individuals using the Internet,Life Expectancy,Renewable Energy Consumption
0,Afghanistan,Low income,South Asia,0.200151,22.042897,-1.453096,60.828447,64.486000,21.422701
1,Albania,Upper middle income,Europe & Central Asia,1.939732,18.440931,0.057240,65.400000,78.458000,38.266399
2,Algeria,Lower middle income,Middle East & North Africa,3.591657,11.840233,-0.527378,49.038468,76.693000,0.193300
3,Andorra,High income,Europe & Central Asia,5.973405,10.393144,1.943918,60.828447,72.210078,18.506001
4,Angola,Lower middle income,Sub-Saharan Africa,0.887380,8.607742,-1.045002,37.169226,60.782000,56.785500
...,...,...,...,...,...,...,...,...,...
188,"Venezuela, RB",Low income,Latin America & Caribbean,4.782755,10.393144,-1.584651,60.828447,72.128000,14.564700
189,Vietnam,Lower middle income,East Asia & Pacific,2.698806,14.681979,-0.001380,69.847929,75.317000,23.491800
190,"Yemen, Rep.",Low income,Middle East & North Africa,0.326682,5.000963,-2.230443,60.828447,66.096000,4.269000
191,Zambia,Lower middle income,Sub-Saharan Africa,0.446065,3.341147,-0.557072,37.169226,63.510000,85.104599


In [3]:
import statsmodels.stats.api as sms

def MeanEstimationConfidenceInterval(dataframe, numeric_variables, interval):
    for attribute in numeric_variables:
        estimation = sms.DescrStatsW(dataframe[attribute]).tconfint_mean(alpha = interval)
        print("At a " + str(int((1-interval)*100)) + " % confidence level, we estimate that the population mean of  "  + str(attribute)  + " is in the range of " + str(estimation) )
#sms.DescrStatsW(df['CO2 Emissions']).tconfint_mean(alpha = 0.1)    #estimate the confidence interval 

MeanEstimationConfidenceInterval(df, numerical, 0.1)

At a 90 % confidence level, we estimate that the population mean of  CO2 Emissions is in the range of (3.4360674931982604, 4.341827687988032)
At a 90 % confidence level, we estimate that the population mean of  AFF is in the range of (9.063652861956367, 11.216381673002811)
At a 90 % confidence level, we estimate that the population mean of  Government Effectiveness is in the range of (-0.19542804714813405, 0.038899796886353935)
At a 90 % confidence level, we estimate that the population mean of  Individuals using the Internet is in the range of (61.33485428388349, 65.37219112378824)
At a 90 % confidence level, we estimate that the population mean of  Life Expectancy is in the range of (71.33414141059933, 73.08601420976123)
At a 90 % confidence level, we estimate that the population mean of  Renewable Energy Consumption is in the range of (28.311388779624366, 34.866564148997824)


In [4]:
from scipy import stats
import random

#An one sample t-test is used to determine whether or not the mean of a population is equal to some value.

for i in numerical:
    result = stats.ttest_1samp(df[i], random.randint(int(df[i].min()),int(df[i].max())))
    print(result[1])


1.759345223954178e-73
1.5446748004854826e-64
3.885779582253237e-28
1.98098576501279e-37
9.65835197410692e-47
2.386641236717998e-05


In [5]:
stats.ttest_1samp(df['CO2 Emissions'], 4)

Ttest_1sampResult(statistic=-0.405296232337732, pvalue=0.685711016339649)

In [6]:
#testing 2 means with the student test

from scipy import stats 

high_income = df.loc[df['Income Group'] == "High income"]
upp_mid_income = df.loc[df['Income Group'] == "Low income"]
print(stats.ttest_ind(high_income['Government Effectiveness'],upp_mid_income['Government Effectiveness']))

Ttest_indResult(statistic=15.73772805360967, pvalue=8.911724959874227e-27)


In [2]:
#testing all categories means at the same time with ANOVA
#Null hypothesis: Groups means are equal (no variation in means of groups)


import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('Q("Government Effectiveness") ~ Q("Region")', data = df).fit()   #the Q stands for quote and it's the way the formula works for variables with spaces in it

sm.stats.anova_lm(model, typ=2)


Unnamed: 0,sum_sq,df,F,PR(>F)
"Q(""Region"")",64.117971,6.0,16.280694,4.925227e-15
Residual,122.086758,186.0,,


In [29]:
#From ANOVA analysis, we know that treatment differences are statistically significant, but ANOVA does not tell which treatments are 
# significantly different from each other. To know the pairs of significant different treatments, we will perform multiple pairwise comparison
#  (post hoc comparison) analysis for all unplanned comparison using Tukey’s honestly significantly differenced (HSD) test.
#source: https://www.reneshbedre.com/blog/anova.html
from bioinfokit.analys import stat

res = stat()
res.tukey_hsd(df, res_var='Government Effectiveness', xfac_var='Region', anova_model='AFF ~ Region')
res.tukey_summary


  mult_group[ele] = df[df[xfac_var] == ele].mean().loc[res_var]


Unnamed: 0,group1,group2,Diff,Lower,Upper,q-value,p-value
0,South Asia,Europe & Central Asia,1.125035,-7.217517,9.467586,0.568487,0.9
1,South Asia,Middle East & North Africa,0.192971,-8.996467,9.382409,0.088523,0.9
2,South Asia,Sub-Saharan Africa,0.356397,-8.032373,8.745168,0.179098,0.9
3,South Asia,Latin America & Caribbean,0.287441,-8.369409,8.944291,0.139973,0.9
4,South Asia,East Asia & Pacific,0.570272,-8.170627,9.311171,0.27503,0.9
5,South Asia,North America,2.117799,-15.248606,19.484204,0.514078,0.9
6,Europe & Central Asia,Middle East & North Africa,0.932063,-4.847826,6.711952,0.679798,0.9
7,Europe & Central Asia,Sub-Saharan Africa,1.481432,-2.915479,5.878342,1.420325,0.9
8,Europe & Central Asia,Latin America & Caribbean,0.837593,-4.051415,5.726601,0.722215,0.9
9,Europe & Central Asia,East Asia & Pacific,0.554762,-4.481572,5.591097,0.464351,0.9
