In [1]:
import os
import glob
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
# Setting the ticks on the x and y axis
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from IPython.display import display

In [40]:
# Output text files for the summary-statistics and correlation tables.
# Both are relative paths inside the "summary" directory and are passed to
# write_to_file(), which opens them in append mode.
summary_filename = "summary/summary_statistics.txt"
corr_filename = "summary/correlation.txt"

In [75]:
# ************************ Function definitions ***************************
# Build a table of basic descriptive statistics, one row per input column
def get_summary_stats(data, item=''):
    """Return min, max, mean, median and standard deviation of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to summarise; each statistic is computed with the corresponding
        pandas reduction using its default arguments.
    item : str, optional
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        Columns 'Min (cm)', 'Max (cm)', 'Mean (cm)', 'Median (cm)' and
        'StDev (cm)', filled in that order.
    """
    # Output column label -> name of the pandas reduction that fills it
    reductions = (
        ('Min (cm)', 'min'),
        ('Max (cm)', 'max'),
        ('Mean (cm)', 'mean'),
        ('Median (cm)', 'median'),
        ('StDev (cm)', 'std'),
    )

    stats_table = pd.DataFrame()
    for label, method_name in reductions:
        stats_table[label] = getattr(data, method_name)()

    return stats_table

# Function to write a summary of data to a file
def write_to_file(summary_filename, df_data, heading, dec_format="%.2f"):
    """Append a headed, plain-text rendering of a DataFrame to a file.

    Parameters
    ----------
    summary_filename : str
        Path of the text file; it is opened in append mode, so repeated
        calls add further sections to the same file.
    df_data : pandas.DataFrame
        Table to write.
    heading : str
        Title placed on the starred header line above the table.
    dec_format : str, optional
        Float format handed to ``DataFrame.to_string`` (default "%.2f").

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened.
    """
    # Create the target directory on demand so a fresh checkout (or a run
    # right after the output folder was removed) does not fail in open().
    out_dir = os.path.dirname(summary_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # to_string for nice formatting for the text file; done before opening
    # the file so a formatting error does not leave a stray empty file.
    df_summary_asstr = df_data.to_string(float_format=dec_format,
                                         justify='center')

    with open(summary_filename, 'at') as f:
        # write header and data, followed by a blank separator line
        f.write(f'***************** {heading} *****************\n')
        f.write(f'{df_summary_asstr}\n')
        f.write('\n')

# Function to plot histograms of data
def plot_histograms(data, var, binwidth=0.2, out_dir='plots'):
    """Plot a histogram of one variable, coloured by class, and save it as PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by ``var`` and a "Class" column, which
        is used as the hue.
    var : str
        Column plotted on the x axis; also used in the output file name.
    binwidth : float, optional
        Bin width passed to ``sns.histplot`` (default 0.2, the value that
        was previously hard-coded).
    out_dir : str, optional
        Directory the image is written to (default 'plots'). It is created
        if it does not exist.
    """
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    plt.figure()
    sns.histplot(data, x=var, hue="Class", binwidth=binwidth, kde=True)
    # NOTE(review): the figure is not closed here; if this is ever called
    # for many variables outside a notebook, consider closing it afterwards.
    plt.savefig(f'{out_dir}/histogram_{var}.png')

# Run pandas correlation method
def get_corr(data):
    """Return the pairwise correlation matrix of the numeric columns of ``data``.

    Non-numeric columns (for example a string label column) are left out
    explicitly. Relying on ``DataFrame.corr()`` to drop them only works on
    older pandas releases; pandas 2.x raises on string columns instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; may contain non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        Square correlation matrix indexed and labelled by the numeric
        (and boolean) column names.
    """
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    return numeric_data.corr()
# Captioned, fixed-precision Styler for showing a table in the notebook
def my_styler(df, precision, caption):
    """Return ``df.style`` formatted to ``precision`` and titled ``caption``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to be displayed.
    precision : int
        Passed to ``Styler.format`` as its ``precision`` argument.
    caption : str
        Passed to ``Styler.set_caption``.

    Returns
    -------
    The object returned by ``set_caption`` (a pandas Styler for a DataFrame).
    """
    formatted = df.style.format(precision=precision)
    return formatted.set_caption(caption)
def styler_highlight_between(df_styler, left, right):
    """Highlight in yellow every cell whose value lies between two bounds.

    Parameters
    ----------
    df_styler : pandas Styler
        Styler to add the highlighting to.
    left, right : scalar or other value accepted by ``highlight_between``
        Range bounds (inclusive on both sides). Two scalar bounds may be
        given in either order.

    Returns
    -------
    The Styler returned by ``highlight_between``.
    """
    # highlight_between keeps cells with left <= value <= right, so a
    # reversed pair such as (-0.7, -0.99) would silently match nothing.
    # Put scalar bounds in ascending order; None / array-like bounds are
    # passed through unchanged.
    if (left is not None and right is not None
            and np.isscalar(left) and np.isscalar(right)
            and left > right):
        left, right = right, left

    df_styler = df_styler.highlight_between(color='yellow', axis=0, left=left,
                                            right=right, inclusive='both',
                                            props=None)
    return df_styler

In [76]:
# ***************************** Reading in data ******************************
# Read in the data from the source file - no header
data = pd.read_csv('data/iris.data', header=None)
# Make a list of the columns
variables = ["Sepal Length", "Sepal Width", "Petal Length",
             "Petal Width", "Class"]
# List of variables without class
variables_wo_class = variables[:-1]
# Assign the header to the data
data.columns = variables

# get the different classifications
class_names = data["Class"].unique()
# dataframe without class column
data_wo_class = data.drop(columns="Class").copy()

# Start from an empty summary directory: create it if it is missing and
# delete only regular files (os.remove cannot delete sub-directories).
os.makedirs('summary', exist_ok=True)
files = glob.glob('summary/*')
for f in files:
    if os.path.isfile(f):
        os.remove(f)

# Get the statistics for the whole dataset and write to file
df_summary_all = get_summary_stats(data_wo_class)
write_to_file(summary_filename, df_summary_all, "All data")
# Correlate the measurement columns only; the string "Class" column cannot
# be correlated and makes DataFrame.corr() raise on pandas 2.x.
df_corr_all = get_corr(data_wo_class)
write_to_file(corr_filename, df_corr_all, "All data", "%.3f")

# Highlight strong correlations (0.7 <= |r| <= 0.99). The upper limit keeps
# the diagonal of 1.0 unhighlighted; bounds are given in ascending order
# because highlight_between selects left <= value <= right.
df_styler = my_styler(df_corr_all, 2, "Correlation All")
df_styler = styler_highlight_between(df_styler, 0.7, 0.99)
df_styler = styler_highlight_between(df_styler, -0.99, -0.7)
display(df_styler)

# Per-class tables are displayed in the notebook only, not written to file.
for item in class_names:
    # Extract the data related to one class of iris, without the class column
    iris_data = data.loc[data["Class"] == item].drop(columns="Class")

    # Summary statistics for this class
    df_summary = get_summary_stats(iris_data)
    df_styler_stats = my_styler(df_summary, 2, item+" Summary")
    display(df_styler_stats)

    # Correlation matrix for this class, with strong correlations highlighted
    df_corr = get_corr(iris_data)
    df_styler = my_styler(df_corr, 2, "Correlation "+item)
    df_styler = styler_highlight_between(df_styler, 0.7, 0.99)
    df_styler = styler_highlight_between(df_styler, -0.99, -0.7)
    display(df_styler)


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,-0.11,0.87,0.82
Sepal Width,-0.11,1.0,-0.42,-0.36
Petal Length,0.87,-0.42,1.0,0.96
Petal Width,0.82,-0.36,0.96,1.0


Unnamed: 0,Min (cm),Max (cm),Mean (cm),Median (cm),StDev (cm)
Sepal Length,4.3,5.8,5.01,5.0,0.35
Sepal Width,2.3,4.4,3.42,3.4,0.38
Petal Length,1.0,1.9,1.46,1.5,0.17
Petal Width,0.1,0.6,0.24,0.2,0.11


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.75,0.26,0.28
Sepal Width,0.75,1.0,0.18,0.28
Petal Length,0.26,0.18,1.0,0.31
Petal Width,0.28,0.28,0.31,1.0


Unnamed: 0,Min (cm),Max (cm),Mean (cm),Median (cm),StDev (cm)
Sepal Length,4.9,7.0,5.94,5.9,0.52
Sepal Width,2.0,3.4,2.77,2.8,0.31
Petal Length,3.0,5.1,4.26,4.35,0.47
Petal Width,1.0,1.8,1.33,1.3,0.2


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.53,0.75,0.55
Sepal Width,0.53,1.0,0.56,0.66
Petal Length,0.75,0.56,1.0,0.79
Petal Width,0.55,0.66,0.79,1.0


Unnamed: 0,Min (cm),Max (cm),Mean (cm),Median (cm),StDev (cm)
Sepal Length,4.9,7.9,6.59,6.5,0.64
Sepal Width,2.2,3.8,2.97,3.0,0.32
Petal Length,4.5,6.9,5.55,5.55,0.55
Petal Width,1.4,2.5,2.03,2.0,0.27


Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
Sepal Length,1.0,0.46,0.86,0.28
Sepal Width,0.46,1.0,0.4,0.54
Petal Length,0.86,0.4,1.0,0.32
Petal Width,0.28,0.54,0.32,1.0
