# Programming Assignment 1
#### Drew Rosales

## Data Analysis

In [60]:
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew

First, let's look the dataset into a pandas dataframe and view the contents

In [61]:
iris_df = pd.read_csv("iris.csv")
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


### Perform test statistics for each feature and class: minimum, maximum, mean, trimmed mean, alpha trimmed mean, standard deviation, skewness, and kurtosis

In [62]:
def create_test_statistics(x, p, alpha):
    n = len(x)
    x = list(sorted(x))
    np.min(x)
    np.max(x)
    mu = np.mean(x, axis=0) # column-wise mean
    mu_trim = np.mean(x[p:-p], axis=0)
    mu_alpha = np.mean(x[int(np.floor(n*alpha)) :-int(np.floor(n*alpha))], axis=0) 
    sigma = np.std(x, axis=0)
    skewness = skew(x, axis=0)
    kurt = kurtosis(x, axis=0)
    return (mu, mu_trim, mu_alpha, sigma, skewness, kurt)
    

In [63]:
def separate_features(df):
    sepal_length = df["sepal_length"]
    sepal_width = df["sepal_width"]
    petal_length = df["petal_length"]
    petal_width = df["petal_width"]
    return [sepal_length, sepal_width, petal_length, petal_width]

Separate per class into `setosa`, `virginica` and `versicolor`

In [64]:
setosa_df = iris_df[iris_df["species"] == "setosa"]
versicolor_df = iris_df[iris_df["species"] == "versicolor"]
virginica_df = iris_df[iris_df["species"] == "virginica"]

Each test statistic is represented as a tuple in the following format: `(minimum, maximum, mean, trimmed mean, alpha trimmed mean, standard deviation, skewness, kurtosis)`. A tuple is placed in a list in the following order for the features of the three species: `[sepal_length, sepal_width, petal_length, petal_width]`.

In [65]:
# create the test statistics 3-values trimmed and 5% trimmed
setosa_stats = [create_test_statistics(feature.values, 3, 0.05) for feature in separate_features(setosa_df)]
versicolor_stats = [create_test_statistics(feature.values, 3, 0.05) for feature in separate_features(versicolor_df)]
virginica_stats = [create_test_statistics(feature.values, 3, 0.05) for feature in separate_features(virginica_df)]

In [66]:
setosa_stats

[(np.float64(5.006),
  np.float64(5.0),
  np.float64(5.002173913043478),
  np.float64(0.3489469873777391),
  np.float64(0.11645392749203061),
  np.float64(-0.34576467558246327)),
 (np.float64(3.418),
  np.float64(3.4090909090909096),
  np.float64(3.415217391304348),
  np.float64(0.3771949098277971),
  np.float64(0.10381408207478497),
  np.float64(0.6851340609499266)),
 (np.float64(1.464),
  np.float64(1.4636363636363638),
  np.float64(1.46304347826087),
  np.float64(0.17176728442867112),
  np.float64(0.06967253443431645),
  np.float64(0.8136648848819621)),
 (np.float64(0.244),
  np.float64(0.23636363636363641),
  np.float64(0.23695652173913048),
  np.float64(0.10613199329137281),
  np.float64(1.161022111125348),
  np.float64(1.29647549715909))]

In [67]:
versicolor_stats

[(np.float64(5.935999999999999),
  np.float64(5.9363636363636365),
  np.float64(5.934782608695652),
  np.float64(0.5109833656783751),
  np.float64(0.10218956951841443),
  np.float64(-0.5988273407672811)),
 (np.float64(2.7700000000000005),
  np.float64(2.777272727272727),
  np.float64(2.7739130434782604),
  np.float64(0.31064449134018135),
  np.float64(-0.3518674965401781),
  np.float64(-0.4482718999167745)),
 (np.float64(4.26),
  np.float64(4.281818181818182),
  np.float64(4.273913043478261),
  np.float64(0.4651881339845203),
  np.float64(-0.5881586743962585),
  np.float64(-0.07440182314533761)),
 (np.float64(1.3259999999999998),
  np.float64(1.3227272727272728),
  np.float64(1.3217391304347827),
  np.float64(0.19576516544063705),
  np.float64(-0.030236304298168794),
  np.float64(-0.48783258702455035))]

In [68]:
virginica_stats

[(np.float64(6.587999999999999),
  np.float64(6.588636363636363),
  np.float64(6.593478260869564),
  np.float64(0.6294886813914926),
  np.float64(0.11444474247296789),
  np.float64(-0.08794223159166314)),
 (np.float64(2.974),
  np.float64(2.961363636363636),
  np.float64(2.965217391304347),
  np.float64(0.31925538366643086),
  np.float64(0.35487781319939143),
  np.float64(0.519765935495565)),
 (np.float64(5.552),
  np.float64(5.527272727272727),
  np.float64(5.53695652173913),
  np.float64(0.546347874526844),
  np.float64(0.5328219357082351),
  np.float64(-0.25647236234660253)),
 (np.float64(2.026),
  np.float64(2.031818181818182),
  np.float64(2.0304347826086957),
  np.float64(0.2718896835115301),
  np.float64(-0.12555979315824958),
  np.float64(-0.6613480055552245))]