In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

## Collecting data

In [3]:
import yfinance as yf

## Sample and variation series

A sample of size $n$ from a distribution $F$ is a set of $n$ independent and identically distributed (iid) random variables $X_{1},X_{2},...,X_{n}$, each having the probability distribution $F$.

This collection represents a "microcosm" of the population, allowing for the estimation of population parameters (like mean $\mu$ and variance $\sigma ^{2}$) using **statistics** such as the sample mean $\bar{X}$:

\begin{equation*}\bar{X} = \frac{1}{n}\sum_{k=1}^{n} X_{k}\end{equation*} 

and sample variance $s^{2}$:

\begin{equation*}
s^2 = \frac{1}{n}\sum_{k=1}^n (X_k - \bar{X})^2
\end{equation*}

A **statistic** is formally defined as any measurable function $S=s(X_{1},\dots ,X_{n})$ of sample observations (random variables) that does not depend on unknown parameters. 
As a random variable itself, a statistic allows for estimation of population parameters or hypothesis testing,, ensuring that different samples can produce different values. 

## Control work 1

In [4]:
apple = yf.Ticker("AAPL").history(start="2023-01-01", end="2025-09-24", interval="3mo")
microsoft = yf.Ticker("MSFT").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant04 = pd.DataFrame({
    "X":apple.Close.pct_change().round(decimals=2),
    "Y":microsoft.Close.pct_change().round(decimals=2)
})

In [5]:
google = yf.Ticker("GOOG").history(start="2023-01-01", end="2025-09-24", interval="3mo")
amazon = yf.Ticker("AMZN").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant01 = pd.DataFrame({
    "X":google.Close.pct_change().round(decimals=2),
    "Y":amazon.Close.pct_change().round(decimals=2)
})

In [44]:
byd = yf.Ticker("1211.HK").history(start="2023-01-01", end="2025-09-24", interval="3mo")
tesla = yf.Ticker("TSLA").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant02 = pd.DataFrame({
    "X":byd.Close.pct_change().round(decimals=2).values[:11],
    "Y":tesla.Close.pct_change().round(decimals=2).values
})

In [7]:
coke = yf.Ticker("COKE").history(start="2023-01-01", end="2025-09-24", interval="3mo")
pepsi = yf.Ticker("PEP").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant03 = pd.DataFrame({
    "X":coke.Close.pct_change().round(decimals=2),
    "Y":pepsi.Close.pct_change().round(decimals=2)
})

In [8]:
johnsonjohnson = yf.Ticker("JNJ").history(start="2023-01-01", end="2025-09-24", interval="3mo")
proctelGamble = yf.Ticker("PG").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant05 = pd.DataFrame({
    "X":johnsonjohnson.Close.pct_change().round(decimals=2),
    "Y":proctelGamble.Close.pct_change().round(decimals=2)
})

In [9]:
walmart = yf.Ticker("WMT").history(start="2023-01-01", end="2025-09-24", interval="3mo")
costco = yf.Ticker("COST").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant06 = pd.DataFrame({
    "X":walmart.Close.pct_change().round(decimals=2),
    "Y":costco.Close.pct_change().round(decimals=2)
})

In [82]:
len(kfc.Close.pct_change().round(decimals=2).values)

12

In [88]:
mcdonald = yf.Ticker("MCD").history(start="2023-01-01", end="2025-09-24", interval="3mo")
kfc = yf.Ticker("3420.T").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant07 = pd.DataFrame({
    "X":mcdonald.Close.pct_change().round(decimals=2).values,
    "Y":kfc.Close.pct_change().round(decimals=2).values[:11]
})

In [11]:
pfizer = yf.Ticker("PFE").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant08 = pd.DataFrame({
    "X":pfizer.Close.pct_change().round(decimals=2),
    "Y":johnsonjohnson.Close.pct_change().round(decimals=2)
})

In [12]:
monster = yf.Ticker("MNST").history(start="2023-01-01", end="2025-09-24", interval="3mo")
pepper = yf.Ticker("KDP").history(start="2023-01-01", end="2025-09-24", interval="3mo")
variant09 = pd.DataFrame({
    "X":monster.Close.pct_change().round(decimals=2),
    "Y":pepper.Close.pct_change().round(decimals=2)
})

In [99]:
variant10 = pd.DataFrame({
    "X":[0.26, -0.02, 0.2, 0.19, 0.07, -0.04, 0.18, -0.13, 0.15, 0],
    # google.Close.pct_change().round(decimals=2),
    "Y":[0.18, -0.07, 0.19, 0.12, 0.06, -0.04, -0.02, -0.11, 0.33, 0.04]
    # microsoft.Close.pct_change().round(decimals=2)
})

variant11 = pd.DataFrame({
    "X":microsoft.Close.pct_change().round(decimals=2),
    "Y":amazon.Close.pct_change().round(decimals=2)
})

## Statistics

In [1]:
x = microsoft.Open.round(decimals=3) 
y = google.Open.round(decimals=3) 

NameError: name 'microsoft' is not defined

In [18]:
x.var(ddof=0) - sum((x - x.mean())**2)/(len(x))

np.float64(0.0)

In [52]:
x.var(ddof=1) - sum((x - x.mean())**2)/(len(x)-1) # исправленнное выборочное отклонение

np.float64(0.0)

In [None]:
x.std(ddof=0) - np.sqrt(sum((x - x.mean())**2)/(len(x)))

In [None]:
x.std(ddof=1) - np.sqrt(sum((x - x.mean())**2)/(len(x)-1)) # исправленнное выборочное отклонение

np.float64(0.0)

In [17]:
x.std() - x.std(ddof=1)

np.float64(0.0)

In [48]:
x.cov(y, ddof=0)

np.float64(-6.277777777777777)

In [55]:
x.corr(y, method='pearson') - x.cov(y, ddof=0)/(x.std(ddof=0)*y.std(ddof=0))

np.float64(5.551115123125783e-17)

In [56]:
x.corr(y, method='pearson')

np.float64(-0.4188864267427589)

## Control Work 1

In [55]:
import math
from scipy.stats import pearsonr

In [102]:
# df = variant10.iloc[1:, :]
df = variant10
n = df.shape[0]

In [103]:
# df = pd.DataFrame({'X':df.X.values, 'Y': df.Y.values})
df

Unnamed: 0,X,Y
0,0.26,0.18
1,-0.02,-0.07
2,0.2,0.19
3,0.19,0.12
4,0.07,0.06
5,-0.04,-0.04
6,0.18,-0.02
7,-0.13,-0.11
8,0.15,0.33
9,0.0,0.04


In [104]:
x_avg = df.X.mean()
y_avg = df.Y.mean()
print(f"x_avg = {x_avg}; y_avg = {y_avg}")

x_avg = 0.086; y_avg = 0.06799999999999999


In [105]:
sum([num for num in (df.X-df.X.mean().round(decimals=2))*(df.Y - df.Y.mean().round(decimals=2))])*(1/(n))

0.01166

In [49]:
sum([num**2 for num in df.X-df.X.mean()])

0.27565

In [50]:
sum([num**2 for num in df.Y-df.Y.mean()])

0.7675600000000001

In [106]:
# исправленную выборочную дисперсию
sigmaX = df.X.std(ddof=1)
sigmaY = df.Y.std(ddof=1)

print(f"sigmaX = {sigmaX**2}; sigmaY = {sigmaY**2}")

sigmaX = 0.016493333333333332; sigmaY = 0.018862222222222225


In [107]:
sum([num for num in (df.X-df.X.mean().round(decimals=2))*(df.Y - df.Y.mean().round(decimals=2))])*(1/(n-1))

0.012955555555555553

In [72]:
0.0020111111111111102/0.02

0.10055555555555551

In [108]:
# выборочную ковариацию
cov = round((1/(n-1))*sum([num for num in (df.X-df.X.mean())*(df.Y - df.Y.mean())]), 7)
print(f"cov = {cov}")

cov = 0.0129467


In [18]:
# исправленную выборочную дисперсию
sigmaX = df.X.std(ddof=1)
sigmaY = df.Y.std(ddof=1)

print(f"sigmaX = {sigmaX**2}; sigmaY = {sigmaY**2}")

sigmaX = 0.022840000000000003; sigmaY = 0.016493333333333332


In [109]:
math.sqrt((1/(n-1))*sum([num**2 for num in df.X-df.X.mean()])) - sigmaX

np.float64(2.7755575615628914e-17)

In [110]:
# исправленную выборочную дисперсию
sigmaXX = df.X.std(ddof=0)
sigmaYY = df.Y.std(ddof=0)

print(f"sigmaX = {sigmaXX}; sigmaY = {sigmaYY}")

sigmaX = 0.12183595528414426; sigmaY = 0.13029197979921867


In [111]:
-0.01/(sigmaXX*sigmaYY)

np.float64(-0.629951107632329)

In [112]:
print("Correlation Matrix:")
print(df.corr())

Correlation Matrix:
          X         Y
X  1.000000  0.734019
Y  0.734019  1.000000


In [59]:
# Calculate Pearson correlation and p-value
correlation_coefficient, p_value = pearsonr(df.X, df.Y)
print(f"Pearson correlation coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")

Pearson correlation coefficient: -0.30849436102139904
P-value: 0.38580735027727525


## Symbolic

### Packages

In [7]:
from sympy import symbols, expand, integrate

### Examples

In [None]:
x = symbols

In [11]:
# Define symbolic variables
x, y, n = symbols('x y n')
# Create a symbolic expression
expr = 1/y * (1-x/y) ** (n-1) - x/y**2 *(n-1)* (1-x/y) ** (n-2)

In [12]:
integrate(expr, x)

-Piecewise(((-x/y + 1)**n/n, Ne(n, 0)), (log(-x/y + 1), True)) - (n - 1)*Piecewise((x*y**2*log(x - y)/(x - y) - y**3*log(x - y)/(x - y) - y**3/(x - y), Eq(n, 0)), (-x*y - y**2*log(x - y), Eq(n, 1)), (n*x**2*(-x/y + 1)**(n - 2)/(n**2 - n) - n*x*y*(-x/y + 1)**(n - 2)/(n**2 - n) - x**2*(-x/y + 1)**(n - 2)/(n**2 - n) + 2*x*y*(-x/y + 1)**(n - 2)/(n**2 - n) - y**2*(-x/y + 1)**(n - 2)/(n**2 - n), True))/y**2

In [None]:
# Expand the expression
expanded_expr = expand(expr)
print(f"Expanded expression: {expanded_expr}")

In [None]:
# Integrate the expression with respect to x
integrated_expr = integrate(expanded_expr, x)
print(f"Integrated expression: {integrated_expr}")

## References