In [1]:
import math

import numpy as np
import pandas as pd

# Uncomment the following lines to create a new CSV file with random data
# data = pd.DataFrame(np.random.default_rng().choice(range(1, 101), size=(200, 2)), columns=['x', 'y'])
# data.to_csv("data.csv", index=False)


data = pd.read_csv("data.csv")

x: pd.Series = data.copy()["x"]
y: pd.Series = data.copy()["y"]
x_sorted = x.sort_values()
y_sorted = y.sort_values()
data.head(200)

Unnamed: 0,x,y
0,83,4
1,90,37
2,100,69
3,68,69
4,76,9
...,...,...
195,37,61
196,50,55
197,35,72
198,59,28


# Average:
_________
### Manually

In [2]:
def mean(series: pd.Series) -> float:
    series_sum = series.sum()
    series_count = series.size
    series_mean = series_sum / series_count
    return series_mean


# x and y average manually
print(f"x average: {mean(x):.2f}\ny average: {mean(y):.2f}")


x average: 49.43
y average: 50.15


_________________
### Automatically

In [3]:
x_mean = x.mean()
y_mean = y.mean()

# x and y average automatically
print(f"x average: {x_mean:.2f}\ny average: {y_mean:.2f}")

x average: 49.43
y average: 50.15


# Mode:
_______
### Manually

In [4]:
def mode(series: pd.Series) -> list:
    value_counts = series.value_counts(ascending=False)
    modes = value_counts[value_counts == value_counts.max()].index.sort_values()
    return list(modes)


# x and y mode manually
print(f"x modes: {mode(x)}")
print(f"y modes: {mode(y)}")

x modes: [30]
y modes: [71]


_________________
### Automatically

In [5]:
x_modes = x.mode().values
y_modes = y.mode().values

# x and y mode automatically
print(f"x modes: {x_modes}\ny modes: {y_modes}")

x modes: [30]
y modes: [71]


# Median:
________
### Manually

In [6]:
def median(series: pd.Series) -> float:
    sorted_series = series.sort_values().reset_index(drop=True)
    series_len = sorted_series.size

    if series_len % 2 == 0:
        mid1 = series_len // 2 - 1
        mid2 = series_len // 2
        return (sorted_series.iloc[mid1] + sorted_series.iloc[mid2]) / 2
    else:
        mid = series_len // 2
        return sorted_series.iloc[mid]


x_median = median(x_sorted)
y_median = median(y_sorted)
print(f"x median: {x_median}\ny median: {y_median}")

x median: 49.5
y median: 48.5


_________________
### Automatically

In [7]:
x_median = x.median()
y_median = y.median()
print(f"x median: {x_median}\ny median: {y_median}")

x median: 49.5
y median: 48.5


# Variance and Standard Deviation:
________
### Manually

In [8]:
def data_minus_average(series: pd.Series) -> pd.Series:
    avg = series.mean()
    x_minus_avg = series - avg
    return x_minus_avg


def variance(series: pd.Series) -> float:
    series_power_2 = data_minus_average(series).pow(2)
    return series_power_2.sum() / series.size


def standard_deviation(series: pd.Series) -> float:
    std_dev = variance(series)
    return math.sqrt(std_dev)


print(f"x variance: {variance(x):.2f}\ny variance: {variance(y):.2f}\n")
print(f"x standard deviation: {standard_deviation(x):.2f}\ny standard deviation: {standard_deviation(y):.2f}\n")

x variance: 798.93
y variance: 877.35

x standard deviation: 28.27
y standard deviation: 29.62



_________________
### Automatically

In [9]:
print(f"x variance: {x.var(ddof=0):.2f}\ny variance: {y.var(ddof=0):.2f}\n")
print(f"x standard deviation: {x.std(ddof=0):.2f}\ny standard deviation: {y.std(ddof=0):.2f}\n")

x variance: 798.93
y variance: 877.35

x standard deviation: 28.27
y standard deviation: 29.62



# Coefficient of variation
________
### Manually

In [10]:
def coefficient_of_variation(series: pd.Series) -> float:
    std_dev = standard_deviation(series)
    cv = std_dev / series.mean()
    return cv


print(
    f"x coefficient of variation: {coefficient_of_variation(x):.2f}\ny coefficient of variation: {coefficient_of_variation(y):.2f}\n")

x coefficient of variation: 0.57
y coefficient of variation: 0.59



_________________
### Automatically

In [11]:
print(
    f"x coefficient of variation: {(x.std(ddof=0) / x.mean()):.2f}\ny coefficient of variation: {(y.std(ddof=0) / y.mean()):.2f}\n")

x coefficient of variation: 0.57
y coefficient of variation: 0.59



# Standard error of the mean
________
### Manually

In [12]:
def sem(series: pd.Series) -> float:
    s = series.std(ddof=0) / math.sqrt(series.size)
    return s


print(f"x SEM: {sem(x):.2f}\ny SEM: {sem(y):.2f}\n")

x SEM: 2.00
y SEM: 2.09



_________________
### Automatically

In [13]:
print(f"x SEM: {x.sem(ddof=0):.2f}\ny SEM: {y.sem(ddof=0):.2f}\n")

x SEM: 2.00
y SEM: 2.09



# Confidence interval

In [14]:
def ci_level(series: pd.Series, level=95) -> float:
    s = series.sem(ddof=0)
    if level == 99:
        return s * 2.58
    elif level == 95:
        return s * 1.96
    else:
        # 90%
        return s * 1.64


def ci_level_range(series: pd.Series,level=95) -> list:
    ci_lev = ci_level(series, level)
    return [float(x.mean() - ci_lev), float(x.mean() + ci_lev)]

low_x,high_x=ci_level_range(x,95)
low_y,high_y=ci_level_range(y,95)
print(f"x confidence interval 95%: {ci_level(x,95):.2f}\t\t\trange: [ {low_x:.2f} , {high_x:.2f} ]\ny confidence interval 95%: {ci_level(y,95):.2f}\t\t\trange: [ {low_y:.2f} , {high_y:.2f} ]\n")

low_x,high_x=ci_level_range(x,99)
low_y,high_y=ci_level_range(y,99)
print(f"x confidence interval 99%: {ci_level(x,99):.2f}\t\t\trange: [ {low_x:.2f} , {high_x:.2f} ]\ny confidence interval 99%: {ci_level(y,99):.2f}\t\t\trange: [ {low_y:.2f} , {high_y:.2f} ]\n")

x confidence interval 95%: 3.92			range: [ 45.51 , 53.35 ]
y confidence interval 95%: 4.11			range: [ 45.32 , 53.54 ]

x confidence interval 99%: 5.16			range: [ 44.27 , 54.59 ]
y confidence interval 99%: 5.40			range: [ 44.03 , 54.83 ]



# Pearson correlation
________
### Manually

In [15]:
def pearson_corr(x_series:pd.Series,y_series:pd.Series)->float:

    x_minus_avg=data_minus_average(x)
    y_minus_avg=data_minus_average(y)

    x_minus_avg_pow_2=x_minus_avg.pow(2)
    y_minus_avg_pow_2=y_minus_avg.pow(2)

    x_minus_avg_pow_2_sum=x_minus_avg_pow_2.sum()
    y_minus_avg_pow_2_sum=y_minus_avg_pow_2.sum()

    minus_avg_multi=x_minus_avg*y_minus_avg
    minus_avg_multi_sum=minus_avg_multi.sum()
    return minus_avg_multi_sum/math.sqrt(x_minus_avg_pow_2_sum*y_minus_avg_pow_2_sum)


print(f"Pearson correlation: {pearson_corr(x,y):.2f}")

Pearson correlation: 0.01


________
### Automatically

In [16]:
print(f"Pearson correlation: {data.corr(method="pearson").iloc[0,1]:.2f}")

Pearson correlation: 0.01


# Spearman correlation
________
### Manually

In [17]:
def spearman_corr(x_series:pd.Series,y_series:pd.Series)->float:
    #=1-(6*W2)/(COUNT(S2:S201)*(POWER(COUNT(S2:S201),2) -1))
    x_rank_avg=x_series.rank(method="average")
    y_rank_avg=y_series.rank(method="average")
    rank_avg_difference=x_rank_avg-y_rank_avg
    rank_avg_difference_pow_2=rank_avg_difference.pow(2)
    numerator=(6*rank_avg_difference_pow_2.sum())
    denominator=(x_series.size*(pow(x_series.size,2)-1))
    return 1-numerator/denominator

print(f"Spearman correlation: {spearman_corr(x,y):.2f}")

Spearman correlation: 0.01


________
### Automatically

In [18]:
print(f"Pearson correlation: {data.corr(method="spearman").iloc[0,1]:.2f}")

Pearson correlation: 0.01


# Regression

In [19]:
def regression(x_series:pd.Series,y_series:pd.Series)->tuple[float,float]:
    count=x_series.size
    xy_sum=(x_series*y_series).sum()
    x_sum=x_series.sum()
    y_sum=y_series.sum()
    x2_sum=(x_series.pow(2)).sum()
    b_numerator=count*xy_sum-x_sum*y_sum
    b_denominator=count*x2_sum-math.pow(x_sum,2)
    b=b_numerator/b_denominator
    a_numerator=y_sum-b*x_sum
    a_denominator=count
    a=a_numerator/a_denominator
    return a,b

print(f"Regression: y={regression(x,y)[0]:.2f}+{regression(x,y)[1]:.2f}x")
regression_series=regression(x,y)[0]+x*regression(x,y)[1]
regression_series.__round__(2).head(200)

Regression: y=49.58+0.01x


0      50.53
1      50.61
2      50.72
3      50.36
4      50.45
       ...  
195    50.00
196    50.15
197    49.98
198    50.25
199    50.16
Name: x, Length: 200, dtype: float64