In [7]:
from matplotlib import pyplot as plt
from datetime import datetime
from math import sqrt

In [8]:
def to_date(n: int) -> str:
    return datetime.fromtimestamp(n).strftime("%d-%m-%Y")

def to_unix(date: str, format: str="%Y-%m-%d") -> int:
    return datetime.strptime(date, format).timestamp()

In [16]:
FILE = r'datasets/dataset_train.csv'

with open(FILE) as file:
    lines = file.readlines()

data = {}
headers = []

for header in lines[0].split(','):
    header = header.lower()
    if header not in ['index', 'hogwarts house', 'first name', 'last name', 'best hand']:
        data[header] = []
    headers.append(header)

for line in lines[1:]:
    line = line.replace(',,,', ',0,0,').replace(',,', ',0,').split(',')
    for i, value in enumerate(line):
        if headers[i] in ['index', 'hogwarts house', 'first name', 'last name', 'best hand']:
            continue
        if headers[i] == 'birthday':
            value = to_unix(value)
        else:
            try:
                value = float(value)
            except ValueError:
                print(f"|{line}|{value}| is not a number")
        data[headers[i]].append(value)

In [17]:

def ft_mean(values: list[int|float]) -> float:
    return sum(values) / len(values)

def ft_std(values: list[int|float]) -> float:
    mean = ft_mean(values)
    summation = 0
    for n in values:
        summation += (n - mean) ** 2
    return sqrt(summation / (len(values) - 1))

def ft_min(values: list[int|float]) -> float:
    minimum = values[0]
    for value in values:
        if value < minimum:
            minimum = value
    return minimum

def ft_q(values: list[int|float], perc: float) -> float:
    values = sorted(values)
    index = perc * (len(values) + 1)
    if index.is_integer():
        return values[int(index)]

    i_low = int(index)
    i_high = int(index) + 1
    return values[i_low] + (index - i_low) * (values[i_high] - values[i_low])

def ft_q1(values: list[int|float]) -> float:
    return ft_q(values, 0.25)

def ft_q2(values: list[int|float]) -> float:
    return ft_q(values, 0.5)

def ft_q3(values: list[int|float]) -> float:
    return ft_q(values, 0.75)

def ft_max(values: list[int|float]) -> float:
    maximum = values[0]
    for value in values:
        if value > maximum:
            maximum = value
    return maximum


In [18]:
methods = {
    'Count': len,
    'Mean': ft_mean,
    'Std': ft_std,
    'Min': ft_min,
    '25%': ft_q1,
    '50%': ft_q2,
    '75%': ft_q3,
    'Max': ft_max,
}

In [None]:
PAD = 20

s = " " * PAD
for header in data.keys():
    s += f"{header:>{PAD}}"
print(s)

for method, fun in methods.items():
    s = f"{method:<{PAD}}"
    for header in data.keys():
        s += f"{round(fun(data[header]), 5):>{PAD}}"
    print(s)


                                birthday          arithmancy           astronomy           herbologydefense against the dark arts          divination      muggle studies       ancient runes    history of magic     transfiguration             potionscare of magical creatures              charms             flying

Count                               1600                1600                1600                1600                1600                1600                1600                1600                1600                1600                1600                1600                1600                1600
Mean                        924892323.75         48579.83562            39.00119             1.11749            -0.38035             3.07703          -219.67701           484.90348             2.88346          1008.20739              5.8388            -0.05209          -243.37441            21.95801
Std                        45130918.2829            17988.08           515.09587   