# Data Analysis

Display information for all numerical feature:
- Count - number of values in the dataset
- Mean - average value
- Std deviation - measure of the dispersion of the values
- Min - smallest value
- 25% (Q1)
- 50% (Q2)
- 75% (Q3)
- Max - largest value
- Mode - most frequently occuring value

**Additional infos:**
- Interquartile Range (IQR)
- Skewness - measure of assymetry of the distribution of values
- Kurtosis - A measure of 'tailedness' of the distribution

## Describe Program

In [102]:
import math
import sys
import pandas as pd
import numpy as np

sys.argv = ['describe.py', './datasets/dataset_train.csv']

class StopExecution(Exception):
    def _render_traceback_(self):
        print(self)
        pass

def min(X):
    min = X[0]
    for x in X:
        if x < min:
            min = x
    return min

def max(X):
    max = X[0]
    for x in X:
        if x > max:
            max = x
    return max

def calculate_statistics(data, name):
    #filter NAN
    data = data[~np.isnan(data)]
    
    count = len(data)

    mean = 0 if count == 0 else sum(data) / count

    variance = 0 if count == 0 else sum((x - mean) ** 2 for x in data) / count
    std = math.sqrt(variance)
    
    minimum = min(data)
    maximum = max(data)

    sorted_data = sorted(data)
    q1 = sorted_data[int(0.25 * count)]
    median = sorted_data[int(0.5 * count)]
    q3 = sorted_data[int(0.75 * count)]

    print(f"{name[:15]:<18}{count:<12.3f}{mean:<12.3f}{std:<12.3f}{minimum:<12.3f}{q1:<12.3f}{median:<12.3f}{q3:<12.3f}{maximum:<12.3f}")


if __name__ == '__main__':
    if len(sys.argv) != 2:
        raise StopExecution('Argument is missing')

    try:
        df = pd.read_csv(sys.argv[1])
    except FileNotFoundError:
        raise StopExecution('Error: file not found')
    except pd.errors.EmptyDataError:
        raise StopExecution("Error: The CSV file is empty.")
    except pd.errors.ParserError as pe:
        raise StopExecution(f"Error parsing the CSV file: {pe}")
    except Exception as e:
        raise StopExecution(f"An unexpected error occurred: {e}")

    print("Parsed csv successfully!")
    num_rows, num_cols = df.shape
    print(f". Number of Rows: {num_rows}")
    print(f". Number of Columns: {num_cols}")

    # Remove 'Index' column
    if 'Index' in df.columns:
        df = df.drop('Index', axis=1)

    # filter numeric features
    numeric_features = df.select_dtypes(include=['number']).columns

    # Display information for numeric features
    print(f"\n\033[1m{'Feature':<18}{'Count':<12}{'Mean':<12}{'Std':<12}{'Min':<12}{'25%':<12}{'50%':<12}{'75%':<12}{'Max':<12}\033[0m")
    for column in numeric_features:
        calculate_statistics(df[column], column)
    

Parsed csv successfully!
. Number of Rows: 1600
. Number of Columns: 19

[1mFeature           Count       Mean        Std         Min         25%         50%         75%         Max         [0m
Arithmancy        1566.000    49634.570   16679.806   -24370.000  38511.500   49013.500   60811.250   104956.000  
Arithmancy        1566.000    49634.570   16674.480   -24370.000  38510.000   49018.000   60828.000   104956.000  
Astronomy         1568.000    39.797      520.298     -966.741    -489.551    260.289     524.772     1016.212    
Astronomy         1568.000    39.797      520.132     -966.741    -489.494    261.645     525.910     1016.212    
Herbology         1567.000    1.141       5.220       -10.296     -4.308      3.469       5.419       11.613      
Herbology         1567.000    1.141       5.218       -10.296     -4.312      3.469       5.421       11.613      
Defense Against   1569.000    -0.388      5.213       -10.162     -5.259      -2.589      4.905       9.667       