# Statistical Analysis: Iris Dataset
We will calculate the following statistics for each attribute
- [] Central Tendancy
    - Mean
    - Median
    - Mode
- [] Dispersion
    - Range
    - Quartiles
    - InterQuartile Range
    - Variance
    - Santard Deviation
- [] Covariance Matrix
- [] Correlation Matrix


## Importing necessary Libraries

In [1]:
import pandas as pd

## Load datasets

In [3]:
iris_df = pd.read_csv('../iris/iris.data', header=None)
iris_df.columns = [
    "sepal_length", "sepal_width", "petal_length", "petal_width", "class" 
]
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Handle 'class' attribute which is a categorial data

In [6]:
# Handling categorical data

# Iris-setosa -> 0
iris_df.loc[iris_df['class'] == 'Iris-setosa', 'class'] = 0
# Iris-versicolor -> 1
iris_df.loc[iris_df['class'] == 'Iris-versicolor', 'class'] = 1
# Iris-virginica -> 2
iris_df.loc[iris_df['class'] == 'Iris-virginica', 'class'] = 2

iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Compute central tendency (mean, median, mode)

In [7]:
mean_values = iris_df.mean()
median_values = iris_df.median()
mode_values = iris_df.mode().iloc[0]  # mode() returns a dataframe, so we take the first row

# Combine them into a single DataFrame
central_tendency_table = pd.DataFrame({
    'Mean': mean_values,
    'Median': median_values,
    'Mode': mode_values
})

# Display the table
central_tendency_table

Unnamed: 0,Mean,Median,Mode
sepal_length,5.843333,5.8,5.0
sepal_width,3.054,3.0,3.0
petal_length,3.758667,4.35,1.5
petal_width,1.198667,1.3,0.2
class,1.0,1.0,0.0


## Measures of Dispersion (range, quartiles, InterQuartile Ranges, Variance, Standard Deviations)

In [8]:
iris_dispersion = iris_df.describe().T
iris_dispersion['IQR'] = iris_dispersion['75%'] - iris_dispersion['25%']
iris_dispersion['Variance'] = iris_dispersion['std'] ** 2

iris_dispersion

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR,Variance
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9,1.3,0.685694
sepal_width,150.0,3.054,0.433594,2.0,2.8,3.0,3.3,4.4,0.5,0.188004
petal_length,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9,3.5,3.113179
petal_width,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5,1.5,0.582414


# Compute covariance and correlation matrices

In [9]:
iris_covariance = iris_df.cov()
iris_correlation = iris_df.corr()

### IRIS Covariance Matrix

In [10]:
iris_covariance

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
sepal_length,0.685694,-0.039268,1.273682,0.516904,0.530872
sepal_width,-0.039268,0.188004,-0.321713,-0.117981,-0.148993
petal_length,1.273682,-0.321713,3.113179,1.296387,1.371812
petal_width,0.516904,-0.117981,1.296387,0.582414,0.597987
class,0.530872,-0.148993,1.371812,0.597987,0.671141


### IRIS Correlation Matrix

In [11]:
iris_correlation

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
sepal_length,1.0,-0.109369,0.871754,0.817954,0.782561
sepal_width,-0.109369,1.0,-0.420516,-0.356544,-0.419446
petal_length,0.871754,-0.420516,1.0,0.962757,0.949043
petal_width,0.817954,-0.356544,0.962757,1.0,0.956464
class,0.782561,-0.419446,0.949043,0.956464,1.0
