In [60]:
# Importing necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as scs
import seaborn as sns

from scipy.stats import t, f, ttest_ind

In [2]:
# Importing iris dataset as a Pandas dataframe
df = pd.read_csv('../Dataset/iris.csv')

In [3]:
# Initial visualizatin of the dataframe
df.head(3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [4]:
# renaming colmns to remove space in he beginning of series name
df=df.rename(columns={' sepal_width':'sepal_width', ' petal_length':'petal_length', ' petal_width':'petal_width', ' class':'class'})

In [5]:
# Initial check of data shows that there are 5 series, 150 entries, no NaN values.
# Series expected to contain measurements are of type float
# Series expected to contain text is of type object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
# There are 3 species of iris in series 'class' with 50 entries each.
df['class'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: class, dtype: int64

In [None]:
# Descriptive analysis of the dataframe shows for mean, standard deviation (std) and median (50%) for each data series.
df.describe()

In [None]:
# Descriptive analysis of each species separately shows mean, standard deviation (std) and median (50%) for each data series.

species_list = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'] # list of species in dataframe

for species in species_list:
    print(f'Descriptive analysis for {species}:') # print a heading line
    print(f"{df[df['class'] == species].describe()}\n") # print result from describe() funktion on dataframe filtered for each species


In [None]:
# Visulazation of all series relationships with class as hue
# ref: https://www.geeksforgeeks.org/exploratory-data-analysis-on-iris-dataset/
sns.pairplot(data=df, hue='class', height=1.5)

In [39]:
# Q1 - preparation. Construct a 95% confidence interval for means for sepal width for setosa


setosa = df[df['class'] == 'Iris-setosa'] # creating separate dataframes for setosa
alpha = 0.05
n= 50

sem = scs.sem(setosa['sepal_width'])    # Standard error
mean = setosa['sepal_width'].mean()     # Average
lower, upper = t.interval(confidence=1-alpha, df=n-1, loc=mean, scale=sem) # Calculating lower and higher limits of the confidence interval

print(f'Confidence interval for setosa is : {lower:.2f}-{upper:.2f}') # printing confidence intervall

Confidence interval for setosa is : 3.31-3.53


In [42]:
# Q1. Construct a 95% confidence interval for means for sepal width for each species
# filtering used in the for loop for example for setosa: setosa = df[df['class'] == 'Iris-setosa']

species_list = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'] # list of species in dataframe
alpha = 0.05  # confidence interval
n= 50 # number of samples for each species

for species in species_list:                                    # calculations for each species in the list
    sem = scs.sem(df[df['class'] == species]['sepal_width'])    # Standard error for the specific species
    mean = df[df['class'] == species]['sepal_width'].mean()     # Average for the specific species
    lower, upper = t.interval(confidence=1-alpha, df=n-1, loc=mean, scale=sem) # Calculating lower and higher limits of the confidence interval
    print(f'Confidence interval for {species} is : {lower:.2f}-{upper:.2f}') # Printing confidence intervall

Confidence interval for Iris-setosa is : 3.31-3.53
Confidence interval for Iris-versicolor is : 2.68-2.86
Confidence interval for Iris-virginica is : 2.88-3.07


In [80]:
# Q2. Is sepial length for Iris-virginica different from sepial length for Iris-versicolor?

# Test is done in 2 steps:
# 1. test if variances for the two populations are equal
        # H_a: sigma_virginica != sigma_versicolor
        # H_0: sigma_virginica = sigma_versicolor
# 2. Hypothesis testing where:
        # H_a: µ_virginica != µ_versicolor
        # H_0: µ_virginica = µ_versicolor


filt_virginica = (df['class'] == 'Iris-virginica')      # Filter to filter out Iris-virginica from full dataframe df
filt_versicolor = (df['class'] == 'Iris-versicolor')    # Filter to filter out Iris-versicolor from full dataframe df
n = 50                                                  # Both series contain 50 samples
alpha = 0.05

# Test of variances:
f_statistic = df[filt_virginica]["sepal_length"].var()/df[filt_versicolor]["sepal_length"].var() # Statistica for samples: var_virginica/var_versicolor
f_crit_49_49 = f.ppf(q=0.05, dfn=n-1, dfd=n-1) # Critical F value for 49,49 degrees of freedom
print(f'Test of variances:\n{f_statistic = :.3f}, {f_crit_49_49 = :.3f}') # printing results with 3 decimals
print(f'H0 can be discarded since f_statistica is larger than f_crit')
print(f'Conclusion: Variances for the two populations can not be assumed equal.\n')

# Test of means:
# a and b are series sepal_lengt filtered out for each class of iris; False based on test of variances above; tow-sided since question was if different
ttest_means = ttest_ind(a=df[filt_virginica]["sepal_length"], b=df[filt_versicolor]["sepal_length"], equal_var=False, alternative="two-sided")
print(f'Test of means:\n{ttest_means = }')
print(f'H0 hypothsis can be discarded, since the p-value from t-test (1.87e-07) is smaller than alpha (0.05)')
print(f'Colnclusion: The sepial lengths for virginica and versicolor are different.')

Test of variances:
f_statistic = 1.518, f_crit_49_49 = 0.622
H0 can be discarded since f_statistica is larger than f_crit
Conclusion: Variances for the two populations can not be assumed equal.

Test of means:
ttest_means = Ttest_indResult(statistic=5.629165259719801, pvalue=1.8661443873771216e-07)
H0 hypothsis can be discarded, since the p-value from t-test (1.87e-07) is smaller than alpha (0.05)
Colnclusion: The sepial lengths for virginica and versicolor are different.
