In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt

# Apply matplotlib's 'ggplot' style sheet to every figure created after this point.
plt.style.use('ggplot')

In [2]:
import math

def format_value_with_uncertainty(value, variance, sample_size):
    """Format ``value ± uncertainty`` with the uncertainty rounded to one significant digit.

    The uncertainty is ``sqrt(variance) / sqrt(sample_size)``. It is rounded to
    a single significant digit and ``value`` is rounded to the same decimal
    position, so both numbers are always shown with matching precision.

    Parameters
    ----------
    value : float
        Central value to display.
    variance : float
        Variance from which the uncertainty is derived.
    sample_size : int or float
        Divisor (under the square root) applied to the standard deviation.

    Returns
    -------
    str
        ``"<value> ± <uncertainty>"``. When the uncertainty is exactly zero the
        value is rounded to an integer and ``"± 0"`` is appended. When the
        uncertainty is not finite (e.g. a negative variance) the unrounded
        value and the raw uncertainty are returned.
    """
    standard_error = np.sqrt(variance) / np.sqrt(sample_size)

    if standard_error == 0:
        return f"{round(value):.0f} ± 0"

    if not np.isfinite(standard_error):
        # No meaningful precision can be derived from NaN/inf.
        return f"{value} ± {standard_error}"

    # Decimal exponent of the leading digit of the uncertainty.
    exponent = int(np.floor(np.log10(abs(standard_error))))
    mantissa = np.round(standard_error / 10.0 ** exponent)
    if mantissa == 10:
        # 9.5..9.99 rounds up into the next decade (e.g. 0.0096 -> 0.01).
        mantissa = 1
        exponent += 1
    rounded_uncertainty = mantissa * 10.0 ** exponent

    # Deriving the precision from the exponent (instead of from a fixed-width
    # string) keeps uncertainties below 1e-10 from collapsing to "0", and the
    # negative ``ndigits`` rounds the value to tens/hundreds when the
    # uncertainty is >= 10.
    decimal_places = max(-exponent, 0)
    formatted_value = round(float(value), -exponent)

    return f"{formatted_value:.{decimal_places}f} ± {rounded_uncertainty:.{decimal_places}f}"

In [3]:
def print_hist_dencity(dist, plot_range, save_path=None):
    """Plot histograms of samples of size 10, 50 and 1000 against the true density.

    Parameters
    ----------
    dist : distribution object
        Must provide ``rvs(size)`` and either ``pdf`` (continuous) or ``pmf``
        (discrete). Both frozen scipy distributions and bare ``rv_continuous``
        instances are accepted.
    plot_range : tuple
        ``(lower, upper)`` range used for the histogram and the density curve.
    save_path : str or None, optional
        If truthy, the figure is saved to this path before being shown.
    """
    sample_sizes = (10, 50, 1000)
    samples = {size: dist.rvs(size) for size in sample_sizes}

    # Frozen scipy distributions expose the underlying generator as ``.dist``;
    # objects without that attribute are treated as the generator themselves.
    base_dist = getattr(dist, 'dist', dist)
    is_continuous = isinstance(base_dist, stats.rv_continuous)
    lower, upper = plot_range[0], plot_range[1]

    fig, axes = plt.subplots(1, 3, figsize=(12, 5))
    for (size, sample), ax in zip(samples.items(), axes):
        ax.set_title(f'Number of samples {size}')
        n_bins = int(np.sqrt(size))
        if is_continuous:
            ax.hist(sample, bins=n_bins, range=plot_range, density=True, color='blue', alpha=0.5)
            x = np.linspace(lower, upper, num=int(1e5))
            ax.plot(x, dist.pdf(x), color='red')
        else:
            x = np.arange(lower, upper)
            # The bin count must be a positive integer even for float ranges.
            discrete_bins = max(1, min(n_bins, int(upper - lower)))
            ax.hist(sample, bins=discrete_bins, range=plot_range, density=True, color='blue', alpha=0.5)
            ax.plot(x, dist.pmf(x))
    if save_path:
        plt.savefig(save_path)
    plt.show()

def q_mid(x):
    """Return the midpoint of the upper and lower quartiles of ``x``."""
    quartiles = np.percentile(x, [75, 25])
    return quartiles.mean()

def statistics(dist, n_trials=1000):
    """Monte-Carlo summary of the mean, median and quartile midpoint estimators.

    For each sample size (10, 100, 1000) the three location estimators are
    evaluated on ``n_trials`` independent samples drawn with ``dist.rvs``.
    Their average and variance over the trials are then rendered with
    ``format_value_with_uncertainty``.

    Parameters
    ----------
    dist : distribution object
        Must provide ``rvs(size)`` returning a NumPy array.
    n_trials : int, optional
        Number of Monte-Carlo repetitions per sample size (default 1000).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Sample size'``, ``'Mean'``, ``'Median'`` and ``'$z_Q$'``;
        the estimator columns hold formatted strings.
    """
    sample_sizes = (10, 100, 1000)
    estimator_names = ('Mean', 'Median', '$z_Q$')

    # estimates[size index, estimator index, trial]
    estimates = np.empty((len(sample_sizes), len(estimator_names), n_trials))
    for trial in range(n_trials):
        for row, size in enumerate(sample_sizes):
            sample = dist.rvs(size)
            estimates[row, :, trial] = (sample.mean(), np.median(sample), q_mid(sample))

    means = estimates.mean(axis=2)
    # Population variance over the trials; computed directly rather than as
    # E[x^2] - E[x]^2 so cancellation cannot produce a negative value.
    variances = estimates.var(axis=2)

    # NOTE(review): the variance of each estimator is divided by the sample
    # size inside format_value_with_uncertainty — confirm this is the intended
    # uncertainty (as opposed to sqrt(variance) or sqrt(variance / n_trials)).
    # The table is built in one go so formatted strings never get written into
    # float columns through chained indexing.
    table = {'Sample size': list(sample_sizes)}
    for col, name in enumerate(estimator_names):
        table[name] = [
            format_value_with_uncertainty(means[row, col], variances[row, col], size)
            for row, size in enumerate(sample_sizes)
        ]
    return pd.DataFrame(table, columns=['Sample size', *estimator_names])

In [4]:
def print_boxplot(dist, save_path=None):
    """Draw box plots for three samples and count the outliers in each.

    Samples of size 20, 100 and 1000 are drawn with ``dist.rvs`` and each panel
    title shows the number of points actually drawn.

    Parameters
    ----------
    dist : distribution object
        Must provide ``rvs(size)``.
    save_path : str or None, optional
        If truthy, the figure is saved to this path before being shown.

    Returns
    -------
    dict
        Number of fliers per sample. The keys are the historical labels
        ``10``, ``50`` and ``1000``, which correspond to the samples of size
        20, 100 and 1000 respectively; they are kept unchanged so existing
        callers indexing the result keep working.
    """
    # result key -> number of points drawn for that panel
    drawn_sizes = {10: 20, 50: 100, 1000: 1000}
    samples = {key: dist.rvs(size) for key, size in drawn_sizes.items()}

    fig, axes = plt.subplots(1, 3, figsize=(12, 5))
    outliers = {}
    for (key, sample), ax in zip(samples.items(), axes):
        # Label the panel with the real sample size rather than the dict key.
        ax.set_title(f'Number of samples {drawn_sizes[key]}')
        bplot = ax.boxplot(sample)
        outliers[key] = len(bplot["fliers"][0].get_ydata())
    if save_path:
        plt.savefig(save_path)
    plt.show()
    return outliers

In [5]:
class MixtureModel(stats.rv_continuous):
    """Finite mixture of distributions with fixed mixing weights.

    Parameters
    ----------
    submodels : sequence
        Component distributions; each must provide ``pdf``, ``cdf``, ``sf``
        and ``rvs(size=...)``.
    *args, **kwargs
        Forwarded to ``scipy.stats.rv_continuous.__init__``.
    weights : sequence of float, optional
        Non-negative mixing weights, one per submodel. They are normalised to
        sum to one. Defaults to equal weights.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of submodels, if no
        submodel is given, or if the weights are negative or sum to zero.
    """

    def __init__(self, submodels, *args, weights = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.submodels = submodels
        if weights is None:
            weights = [1 for _ in submodels]
        if len(weights) != len(submodels):
            raise ValueError(f'There are {len(submodels)} submodels and {len(weights)} weights, but they must be equal.')
        if len(submodels) == 0:
            raise ValueError('At least one submodel is required.')
        total = sum(weights)
        if any(w < 0 for w in weights) or not total > 0:
            raise ValueError('Weights must be non-negative and have a positive sum.')
        self.weights = [w / total for w in weights]

    def _weighted_sum(self, method_name, x):
        """Evaluate ``method_name`` on every submodel at ``x`` and mix the results."""
        terms = (
            getattr(submodel, method_name)(x) * weight
            for submodel, weight in zip(self.submodels, self.weights)
        )
        total = next(terms)
        for term in terms:
            total = total + term
        return total

    def pdf(self, x):
        """Mixture density: weighted sum of the submodel densities at ``x``.

        This overrides the public ``pdf`` of ``rv_continuous`` directly, so the
        base-class argument handling is not applied.
        """
        return self._weighted_sum('pdf', x)

    def _sf(self, x):
        """Mixture survival function: weighted sum of the submodel ``sf`` values."""
        return self._weighted_sum('sf', x)

    def _cdf(self, x):
        """Mixture CDF: weighted sum of the submodel ``cdf`` values."""
        return self._weighted_sum('cdf', x)

    def rvs(self, size):
        """Draw random variates from the mixture.

        A component index is drawn for every output element with
        ``np.random.choice``, every submodel generates a full block of
        ``size`` variates, and the element of the chosen component is kept.

        Parameters
        ----------
        size : int or tuple of int
            Output shape.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``size``.
        """
        submodel_choices = np.asarray(
            np.random.choice(np.arange(len(self.submodels)), size=size, p=self.weights)
        )
        submodel_samples = np.stack(
            [np.asarray(submodel.rvs(size=size), dtype=float) for submodel in self.submodels]
        )
        # Vectorised pick along the component axis; works for any output shape.
        picked = np.take_along_axis(submodel_samples, submodel_choices[np.newaxis, ...], axis=0)
        return picked[0]


In [6]:
def corr_statistics(dist, n_trials=1000):
    """Monte-Carlo summary of Pearson and Spearman correlation coefficients.

    For each sample size (20, 60, 100) both coefficients are computed between
    the first two columns of ``n_trials`` independent samples drawn with
    ``dist.rvs``; their mean and variance over the trials are rendered with
    ``format_value_with_uncertainty``.

    Parameters
    ----------
    dist : distribution object
        ``dist.rvs(size)`` must return a 2-D array with at least two columns.
    n_trials : int, optional
        Number of Monte-Carlo repetitions per sample size (default 1000).

    Returns
    -------
    dict
        ``{sample_size: {'Pearson Corr': str, 'Spearman Corr': str}}``.
    """
    sample_sizes = (20, 60, 100)
    coefficient_names = ('Pearson Corr', 'Spearman Corr')

    # A comprehension gives every sample size its own arrays. Passing a
    # mutable default to dict.fromkeys would make all keys share one object.
    coefficients = {
        size: {name: np.empty(n_trials) for name in coefficient_names}
        for size in sample_sizes
    }
    for trial in range(n_trials):
        for size in sample_sizes:
            sample = dist.rvs(size)
            first, second = sample[:, 0], sample[:, 1]
            coefficients[size]['Pearson Corr'][trial] = stats.pearsonr(first, second).statistic
            coefficients[size]['Spearman Corr'][trial] = stats.spearmanr(first, second).statistic

    return {
        size: {
            name: format_value_with_uncertainty(np.mean(values), np.var(values), size)
            for name, values in per_size.items()
        }
        for size, per_size in coefficients.items()
    }

In [7]:
def plot_level_lines(dist, plot_range_x, plot_range_y, save_path=None):
    """Draw 25 contour levels of a bivariate density on a rectangular grid.

    Parameters
    ----------
    dist : distribution object
        ``dist.pdf`` is called with an array whose last axis holds the
        ``(x, y)`` coordinates of each grid node.
    plot_range_x, plot_range_y : tuple
        ``(lower, upper)`` bounds of the grid along each axis.
    save_path : str or None, optional
        If truthy, the figure is saved to this path before being shown.
    """
    x_min, x_max = plot_range_x[0], plot_range_x[1]
    y_min, y_max = plot_range_y[0], plot_range_y[1]

    # A complex "step" makes mgrid behave like linspace: exactly 101 nodes per
    # axis, spaced 1% of the range apart, with both bounds included. A float
    # step drops the upper bound and its node count depends on rounding.
    nodes = 101j
    x, y = np.mgrid[x_min:x_max:nodes, y_min:y_max:nodes]
    pos = np.dstack((x, y))
    z = dist.pdf(pos)
    plt.contour(x, y, z, levels=25)
    if save_path:
        plt.savefig(save_path)
    plt.show()

In [8]:
def LsS(x, y, true_a, true_b):
    """Fit ``y = a*x + b`` by ordinary least squares and compare with the true values.

    The estimates and their ratios to ``true_a`` / ``true_b`` are printed and
    returned as ``(a, a/true_a, b, b/true_b)``.
    """
    mean_x = x.mean()
    mean_y = y.mean()
    covariance_xy = (x*y).mean()-(mean_x*mean_y)
    variance_x = (x**2).mean()-(mean_x)**2

    slope = covariance_xy/variance_x
    intercept = mean_y-mean_x*slope

    report = (slope, slope/true_a, intercept, intercept/true_b)
    print(*report)
    return report

In [9]:
from scipy.optimize import minimize

def LAV(x, y, true_a, true_b):
    """Fit ``y = a*x + b`` by minimising the mean absolute residual.

    The minimisation uses Nelder-Mead starting from ``(1, 1)``. The estimates
    and their ratios to ``true_a`` / ``true_b`` are printed and returned as
    ``(a, a/true_a, b, b/true_b)``.
    """
    def mean_abs_residual(params):
        slope, intercept = params[0], params[1]
        return np.abs(slope*x+intercept-y).mean()

    solution = minimize(mean_abs_residual, (1, 1), method='Nelder-Mead').x
    slope, intercept = solution[0], solution[1]
    print(slope, slope/true_a, intercept, intercept/true_b)
    return slope, slope/true_a, intercept, intercept/true_b

In [10]:
def chi_square_norm_test(samples, k, alpha):
    """Pearson chi-square goodness-of-fit test against a fitted normal law.

    A normal distribution is fitted to ``samples`` with ``stats.norm.fit``.
    The real line is split into ``k`` cells: ``k - 2`` equal cells spanning
    mean ± 3 std plus two unbounded tails. The chi-square statistic is
    compared with the ``1 - alpha`` quantile of ``chi2(k - 1)``, which is also
    printed.

    NOTE(review): both parameters are estimated from the sample, so a
    threshold with ``k - 3`` degrees of freedom may be intended — confirm
    against the methodology being followed.

    Returns
    -------
    tuple
        ``(table, criterion, result)`` where ``table`` is a DataFrame with the
        cell edges (raw and standardised), cell probabilities and per-cell
        contributions, ``criterion`` is the statistic, and ``result`` is
        ``'Reject'`` or ``'Fail to reject'``.
    """
    treshhold = stats.chi2.ppf(1-alpha, k-1)
    print(treshhold)

    H_0_dist = stats.norm(*stats.norm.fit(samples))
    center = H_0_dist.mean()
    spread = H_0_dist.std()

    inner_edges = np.linspace(center-3*spread, center+3*spread, k-1, endpoint=True)
    edges = np.concatenate(([-np.inf], inner_edges, [np.inf]))
    left_edges = edges[:-1]
    right_edges = edges[1:]

    P = H_0_dist.cdf(right_edges)-H_0_dist.cdf(left_edges)
    observed = np.histogram(samples, edges)[0]
    expected = len(samples)*P
    contributions = (observed-expected)**2/expected
    criterion = contributions.sum()

    result = 'Reject' if criterion >= treshhold else 'Fail to reject'

    table = pd.DataFrame({
        'Left Edge': left_edges,
        'Right Edge': right_edges,
        'NLeft Edge': (left_edges-center)/spread,
        'NRight Edge': (right_edges-center)/spread,
        'Probability': P,
        'Stat': contributions,
    })
    return table, criterion, result

In [11]:
def lab6(save_path=None):
    """Find the shift ``a`` of X1 that maximises interval overlap with X2.

    Two normal samples of size 1000 are drawn. For 1000 shifts ``a`` in
    [-2, 4] the Jaccard index between the intervals of ``X1 + a`` and ``X2``
    is computed for the inner (interquartile) and outer (min–max) intervals.
    Both curves are plotted, and the range of shifts whose index is within a
    relative tolerance of 1e-3 of the maximum is printed for each curve.

    Parameters
    ----------
    save_path : str or None, optional
        If truthy, the figure is saved to this path before being shown.
    """
    X1 = np.random.normal(loc=0, scale=0.95, size=1000)
    X2 = np.random.normal(loc=1, scale=1.05, size=1000)

    def inner_interval(data):
        """Interquartile interval (Q1, Q3)."""
        q1, q3 = np.percentile(data, [25, 75])
        return (q1, q3)

    def outer_interval(data):
        """Full-range interval (min, max)."""
        return (np.min(data), np.max(data))

    def jaccard_index(interval1, interval2):
        """Overlap length divided by the length of the intervals' hull."""
        a1, b1 = interval1
        a2, b2 = interval2
        intersection_start = max(a1, a2)
        intersection_end = min(b1, b2)
        if intersection_start >= intersection_end:
            return 0.0
        union_length = max(b1, b2) - min(a1, a2)
        return (intersection_end - intersection_start) / union_length

    # The intervals of X2 do not depend on the shift; compute them once.
    inner_x2 = inner_interval(X2)
    outer_x2 = outer_interval(X2)

    def calculate_J(a, X1):
        """Jaccard indices (inner, outer) between X1 shifted by ``a`` and X2."""
        X1_shifted = X1 + a
        return (
            jaccard_index(inner_interval(X1_shifted), inner_x2),
            jaccard_index(outer_interval(X1_shifted), outer_x2),
        )

    a_values = np.linspace(-2, 4, 1000)
    # Collect all scores first and convert once; np.append in a loop copies
    # the whole array on every iteration.
    scores = np.array([calculate_J(a, X1) for a in a_values], dtype=float)
    J_Im = scores[:, 0]
    J_Out = scores[:, 1]

    # Plot both Jaccard curves.
    plt.figure(figsize=(10, 6))
    plt.plot(a_values, J_Im, label='$J_{Im}$')
    plt.plot(a_values, J_Out, label='$J_{Out}$')
    plt.xlabel('$a$')
    plt.ylabel('Индекс Жаккара')
    plt.legend()
    if save_path:
        plt.savefig(save_path)
    plt.show()

    # Locate the shifts whose index is (almost) maximal.
    a_Im = a_values[np.isclose(J_Im, np.max(J_Im), rtol=1e-3)]
    a_Out = a_values[np.isclose(J_Out, np.max(J_Out), rtol=1e-3)]

    print(np.min(a_Im), np.max(a_Im))
    print(np.min(a_Out), np.max(a_Out))

In [21]:
import numpy as np
from scipy.stats import t, norm, chi2

def normal_mean_ci(x_bar, s, n, alpha):
    """Student-t confidence interval for the mean at level ``1 - alpha``.

    The half-width is ``t_{1-alpha/2}(n-1) * s / sqrt(n-1)``.

    NOTE(review): the ``sqrt(n-1)`` denominator is the textbook form for ``s``
    computed with ``ddof=0`` — confirm callers pass that estimate.

    Returns the tuple ``(lower, upper)``.
    """
    quantile = t.ppf(1 - alpha/2, n - 1)
    half_width = quantile * s / np.sqrt(n-1)
    lower = x_bar - half_width
    upper = x_bar + half_width
    return (lower, upper)

def normal_std_ci(s, n, alpha):
    """Chi-square confidence interval for the standard deviation at level ``1 - alpha``.

    The bounds are ``s * sqrt(n) / sqrt(q)`` with ``q`` the ``1 - alpha/2``
    (lower bound) and ``alpha/2`` (upper bound) quantiles of ``chi2(n - 1)``.

    NOTE(review): the ``sqrt(n)`` factor is the textbook form for ``s``
    computed with ``ddof=0`` — confirm callers pass that estimate.

    Returns the tuple ``(lower, upper)``.
    """
    degrees = n - 1
    scaled_s = s * np.sqrt(n)
    lower = scaled_s / np.sqrt(chi2.ppf(1 - alpha/2, degrees))
    upper = scaled_s / np.sqrt(chi2.ppf(alpha/2, degrees))
    return (lower, upper)

def asymptotic_mean_ci(x_bar, s, n, alpha):
    """Normal-approximation confidence interval for the mean at level ``1 - alpha``.

    The half-width is ``u_{1-alpha/2} * s / sqrt(n)`` with ``u`` the standard
    normal quantile. Returns the tuple ``(lower, upper)``.
    """
    quantile = norm.ppf(1 - alpha/2)
    half_width = quantile * s / np.sqrt(n)
    lower = x_bar - half_width
    upper = x_bar + half_width
    return (lower, upper)

def asymptotic_std_ci(s, n, alpha, e):
    """Asymptotic confidence interval for the standard deviation at level ``1 - alpha``.

    With ``U = u_{1-alpha/2} * sqrt((e + 2) / n)``, where ``u`` is the standard
    normal quantile and ``e`` is passed in by the caller, the bounds are
    ``s * (1 - U/2)`` and ``s * (1 + U/2)``.

    Returns the tuple ``(lower, upper)``.
    """
    quantile = norm.ppf(1 - alpha/2)
    U = quantile * np.sqrt((e + 2) / n)
    bounds = tuple(s * (1 + sign * 0.5 * U) for sign in (-1, 1))
    return bounds

def lab7():
    """Print 95% confidence intervals for the mean and standard deviation.

    For sample sizes 20 and 100 this draws a standard normal sample and an
    exponential (scale 1) sample. The normal sample gets the exact Student-t /
    chi-square intervals; the exponential sample gets the asymptotic
    intervals, using the sample excess kurtosis.
    """
    alpha = 0.05
    sample_sizes = [20, 100]

    normal_samples = {n: np.random.normal(loc=0, scale=1, size=n) for n in sample_sizes}
    exp_samples = {n: np.random.exponential(scale=1, size=n) for n in sample_sizes}

    for n in sample_sizes:
        sample = normal_samples[n]
        x_bar = np.mean(sample)
        # normal_mean_ci / normal_std_ci use the sqrt(n-1) and sqrt(n) factors
        # that belong to the ddof=0 standard deviation, so that is the
        # estimate passed in here.
        s = np.std(sample, ddof=0)
        m_low, m_high = normal_mean_ci(x_bar, s, n, alpha)
        sigma_low, sigma_high = normal_std_ci(s, n, alpha)
        print(f"Size {n} Normal \n ({m_low:.3f} < {x_bar:.3f} < {m_high:.3f})")
        print(f"({sigma_low:.3f} < {s:.3f} < {sigma_high:.3f})")

        sample = exp_samples[n]
        x_bar = np.mean(sample)
        # stats.moment returns the ddof=0 central moment; pairing it with the
        # ddof=0 std makes this the plain moment ratio m4 / m2**2 - 3.
        s = np.std(sample, ddof=0)
        e = stats.moment(sample, 4)/(s **4)-3
        m_low, m_high = asymptotic_mean_ci(x_bar, s, n, alpha)
        sigma_low, sigma_high = asymptotic_std_ci(s, n, alpha, e)
        print(f"Size {n} Exp \n ({m_low:.3f} < {x_bar:.3f} < {m_high:.3f})")
        print(f"({sigma_low:.3f} < {s:.3f} < {sigma_high:.3f})")

In [22]:
# Run the confidence-interval lab; no seed is set, so the printed numbers change on every run.
lab7()

Size 20 Normal 
 (-0.698 < -0.091 < 0.515)
(0.985 < 1.263 < 1.892)
Size 20 Exp 
 (0.496 < 0.968 < 1.440)
(0.740 < 1.0774007033717594 < 1.415)
Size 100 Normal 
 (-0.277 < -0.075 < 0.127)
(0.893 < 1.011 < 1.181)
Size 100 Exp 
 (0.794 < 0.997 < 1.200)
(0.766 < 1.0365852292048057 < 1.307)
