In [1]:
import os
import glob
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
# Setting the ticks on the x and y axis
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from IPython.display import display

In [2]:
# Text reports produced by this notebook. Both live under summary/ and are
# opened in append mode by write_to_file.
corr_filename = "summary/correlation.txt"
summary_filename = "summary/summary_statistics.txt"

In [12]:
# ************************ Function definitions ***************************
# Function to calculate the basic statistics of a dataset
def get_summary_stats(data, item=''):
    """Build a table of basic descriptive statistics for each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are summarised.
    item : str, optional
        Accepted for call compatibility; this function does not use it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` and the columns 'Min (cm)',
        'Max (cm)', 'Mean (cm)', 'Median (cm)' and 'StDev (cm)', in that order.
    """
    # Output column label -> reduction producing one value per input column.
    # Dict order fixes the column order of the result.
    reducers = {
        'Min (cm)': data.min,
        'Max (cm)': data.max,
        'Mean (cm)': data.mean,
        'Median (cm)': data.median,
        'StDev (cm)': data.std,
    }

    stats_table = pd.DataFrame()
    for label, reduce_columns in reducers.items():
        stats_table[label] = reduce_columns()
    return stats_table

# Function to write a summary of data to a file 
def write_to_file(summary_filename, df_data, heading, dec_format="%.2f"):
    """Append a headed, text-formatted copy of ``df_data`` to a report file.

    Parameters
    ----------
    summary_filename : str
        Path of the text file to append to. The file, and its parent
        directory, are created when they do not exist yet.
    df_data : pandas.DataFrame
        Table to write; rendered with ``DataFrame.to_string``.
    heading : str
        Title placed between two runs of asterisks above the table.
    dec_format : str, optional
        Format passed as ``float_format`` to ``to_string``
        (default ``"%.2f"``).

    Returns
    -------
    None
    """
    # open() does not create missing directories, so make sure the target
    # directory exists; a bare file name has no directory part to create.
    parent_dir = os.path.dirname(summary_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Format before opening the file so a formatting error cannot leave an
    # empty or half-written report behind.
    # to_string for nice formatting for the text file
    df_summary_asstr = df_data.to_string(float_format=dec_format,
                                         justify='center')

    with open(summary_filename, 'at') as f:
        # write header and data, followed by a blank separator line
        f.write(f'***************** {heading} *****************\n')
        f.write(f'{df_summary_asstr}\n')
        f.write('\n')

# Function to plot histograms of data 
def plot_histograms(data, var):
    """Plot a histogram of one variable, coloured by class, and save it as a PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``var`` and a "Class" column used as hue.
    var : str
        Name of the column to plot; also used in the output file name
        ``plots/histogram_{var}.png``.

    Returns
    -------
    None
    """
    out_path = f'plots/histogram_{var}.png'
    # savefig does not create missing directories, so create plots/ on demand.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Start a fresh figure so successive calls do not draw over each other.
    plt.figure()
    sns.histplot(data, x=var, hue="Class", binwidth=0.2, kde=True)
    # NOTE(review): the figure is left open, as in the original; callers that
    # plot many variables in a loop may want to plt.close() afterwards.
    plt.savefig(out_path)

# Run pandas correlation method
def get_corr(data):
    """Return the pairwise correlation matrix of the numeric columns of ``data``.

    Non-numeric columns (for example a string "Class" label) are left out
    before the correlation is computed. pandas >= 2.0 raises on such columns
    where earlier versions dropped them silently, so selecting the numeric
    columns explicitly keeps the result the same across versions.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; may contain non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        Square matrix indexed and labelled by the numeric column names, as
        produced by ``DataFrame.corr()`` with its default settings.
    """
    numeric_data = data.select_dtypes(include=["number", "bool"])
    return numeric_data.corr()
# Styler for notebook table display
def make_pretty(styler, caption=None):
    """Set a caption on a pandas Styler and return it (usable with ``Styler.pipe``).

    Parameters
    ----------
    styler : pandas.io.formats.style.Styler
        Styler to caption; modified through ``set_caption`` and returned.
    caption : str, optional
        Caption text. When omitted, the module-level variable ``item`` is
        used if it exists (the behaviour of the earlier version of this
        function); if it does not exist the styler is returned uncaptioned
        instead of raising NameError.

    Returns
    -------
    The same ``styler`` object that was passed in.
    """
    if caption is None:
        # Legacy fallback: this function used to read the global ``item``,
        # which is only defined once a loop elsewhere in the notebook has run.
        # Prefer passing ``caption`` explicitly.
        caption = globals().get("item")
    if caption is not None:
        styler.set_caption(caption)
    return styler

In [34]:
# ***************************** Reading in data ******************************
# Read in the data from the source file - no header
data = pd.read_csv('data/iris.data', header=None)
# Column names, in the order the columns appear in the file
variables = ["Sepal Length", "Sepal Width", "Petal Length",
             "Petal Width", "Class"]
# List of variables without class
variables_wo_class = variables[:-1]
# Assign the header to the data
data.columns = variables
display(data)
# get the different classifications
class_names = data["Class"].unique()
# dataframe without class column (numeric measurements only)
data_wo_class = data.drop(columns="Class").copy()

# ************************* Resetting report files ***************************
# write_to_file appends, so reports left over from a previous run are removed
# first. The directory is created when missing, and only regular files are
# deleted (os.remove fails on directories).
os.makedirs('summary', exist_ok=True)
files = glob.glob('summary/*')
for f in files:
    if os.path.isfile(f):
        os.remove(f)

# ************************** Whole-dataset statistics ************************
# Get the statistics for the whole dataset and write to file
df_summary_all = get_summary_stats(data_wo_class)
write_to_file(summary_filename, df_summary_all, "All data")
# Correlate the numeric measurements only: a frame that still contains the
# string "Class" column makes DataFrame.corr() raise in pandas >= 2.0.
df_corr_all = get_corr(data_wo_class)
write_to_file(corr_filename, df_corr_all, "All data", "%.3f")
display(df_corr_all)

# *************************** Per-class statistics ***************************
for item in class_names:
    # Extract the rows of one class of iris and strip the class column
    iris_data = data[data["Class"] == item].drop(columns="Class")
    # Get the stats for this class
    df_summary = get_summary_stats(iris_data)
    df_corr = get_corr(iris_data)
    # NOTE(review): per-class file output is currently disabled, so df_summary
    # is computed but not persisted. Re-enable the two calls below if the
    # per-class text reports are wanted:
    #   write_to_file(summary_filename, df_summary, item)
    #   write_to_file(corr_filename, df_corr, item, "%.2f")
    df_styler = df_corr.style.format(precision=2).set_caption(item+" Correlation")
    # Highlight coefficients in [0.7, 0.99]; the upper bound below 1
    # presumably keeps the 1.0 diagonal unhighlighted.
    df_styler = df_styler.highlight_between(color='yellow', axis=0, left=0.7, right=0.99, inclusive='both', props=None)
    display(df_styler)

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,-0.109369,0.871754,0.817954
Sepal Width,-0.109369,1.0,-0.420516,-0.356544
Petal Length,0.871754,-0.420516,1.0,0.962757
Petal Width,0.817954,-0.356544,0.962757,1.0


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.75,0.26,0.28
Sepal Width,0.75,1.0,0.18,0.28
Petal Length,0.26,0.18,1.0,0.31
Petal Width,0.28,0.28,0.31,1.0


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.53,0.75,0.55
Sepal Width,0.53,1.0,0.56,0.66
Petal Length,0.75,0.56,1.0,0.79
Petal Width,0.55,0.66,0.79,1.0


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.46,0.86,0.28
Sepal Width,0.46,1.0,0.4,0.54
Petal Length,0.86,0.4,1.0,0.32
Petal Width,0.28,0.54,0.32,1.0
