# Data Analysis

Display information for all numerical feature:
- Count - number of values in the dataset
- Mean - average value
- Std deviation - measure of the dispersion of the values
- Min - smallest value
- 25% (Q1) - median of the lower half of the dataset
- 50% (Q2) - median of the dataset
- 75% (Q3) - median of the upper half of the dataset
- Max - largest value
- Mode - most frequently occurring value

**Additional information:**
- Interquartile Range (IQR) - Range between Q1 and Q3, measure of dispersion of the middle 50% of the dataset
- Skewness - measure of asymmetry of the distribution of values
    - a value of 0 means that the distribution is approximately symmetric
- Kurtosis - A measure of 'tailedness' of the distribution


![skewness-kurtosis\_1JPG-.jpg (733×536)](https://excelrcom.b-cdn.net/assets/admin/ckfinder/userfiles/images/tableau1/tableau2/tableau3/tableau4/tableau5/tableau6/skewness-kurtosis_1JPG-.jpg)


## Describe Program

In [15]:
import math
import sys
import pandas as pd
import numpy as np

sys.argv = ['describe.py', './datasets/dataset_train.csv']

class StopExecution(Exception):
    def _render_traceback_(self):
        print(self)
        pass

def get_min(data):
    min = data[0]
    for x in data:
        if x < min:
            min = x
    return min

def get_max(data):
    max = data[0]
    for x in data:
        if x > max:
            max = x
    return max

def get_mode(data):
    counts = {}
    for value in data:
        if value in counts:
            counts[value] += 1
        else:
            counts[value] = 1

    # Collect all the values with the maximum count
    mode_values = [key for key, value in counts.items() if value == get_max(list(counts.values()))]
    if not mode_values:
        return np.nan

    return mode_values[0]

def get_skewness(data, mean, std, count):
    if std == 0:
        return 0
        
    # Standardize every value
    # The use of cubic is a way to emphasize the impact of extreme values on skewness
    scaled_data = [(x - mean)**3 / std**3 for x in data]
    
    # compute the mean of the new array
    return 0 if count == 0 else sum(scaled_data) / count
    
def calculate_statistics(data, name):
    #filter NAN
    data = data[~np.isnan(data)]
    
    count = len(data)

    mean = 0 if count == 0 else sum(data) / count

    variance = 0 if count == 0 else sum((x - mean) ** 2 for x in data) / count
    std = math.sqrt(variance)
    
    minimum = get_min(data)
    maximum = get_max(data)

    sorted_data = sorted(data)
    q1 = sorted_data[int(0.25 * count)]
    median = sorted_data[int(0.5 * count)]
    q3 = sorted_data[int(0.75 * count)]

    iqr = q3 - q1

    mode = get_mode(data)

    skewness = get_skewness(data, mean, std, count)

    print(f"{name[:15]:<18}{count:<12}{mean:<12.3f}{std:<12.3f}{minimum:<12.3f}{q1:<12.3f}{median:<12.3f}{q3:<12.3f}{maximum:<12.3f}{iqr:<12.3f}{mode:<12.3f}{skewness:<12.3f}")


if __name__ == '__main__':
    if len(sys.argv) != 2:
        raise StopExecution('Argument is missing')

    try:
        df = pd.read_csv(sys.argv[1])
    except FileNotFoundError:
        raise StopExecution('Error: file not found')
    except pd.errors.EmptyDataError:
        raise StopExecution("Error: The CSV file is empty.")
    except pd.errors.ParserError as pe:
        raise StopExecution(f"Error parsing the CSV file: {pe}")
    except Exception as e:
        raise StopExecution(f"An unexpected error occurred: {e}")

    print("Parsed csv successfully!")
    num_rows, num_cols = df.shape
    print(f". Number of Rows: {num_rows}")
    print(f". Number of Columns: {num_cols}")

    # Remove 'Index' column
    if 'Index' in df.columns:
        df = df.drop('Index', axis=1)

    # filter numeric features
    numeric_features = df.select_dtypes(include=['number']).columns

    # Display information for numeric features
    print(f"\n\033[1m{'Feature':<18}{'Count':<12}{'Mean':<12}{'Std':<12}{'Min':<12}{'25%':<12}{'50%':<12}{'75%':<12}{'Max':<12}{'IQR':<12}{'Mode':<12}{'Skewness':<12}\033[0m")
    for column in numeric_features:
        calculate_statistics(df[column], column)
    

Parsed csv successfully!
. Number of Rows: 1600
. Number of Columns: 19

[1mFeature           Count       Mean        Std         Min         25%         50%         75%         Max         IQR         Mode        Skewness    [0m
Arithmancy        1566        49634.570   16674.480   -24370.000  38510.000   49018.000   60828.000   104956.000  22318.000   67239.000   -0.042      
Astronomy         1568        39.797      520.132     -966.741    -489.494    261.645     525.910     1016.212    1015.403    -487.886    -0.095      
Herbology         1567        1.141       5.218       -10.296     -4.312      3.469       5.421       11.613      9.733       5.727       -0.398      
Defense Against   1569        -0.388      5.211       -10.162     -5.259      -2.589      4.905       9.667       10.164      4.879       0.093       
Divination        1561        3.154       4.154       -8.727      3.099       4.624       5.667       10.032      2.568       5.437       -1.379      
Muggle Studie