In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import yaml
import matplotlib.pyplot as plt
from math import trunc

In [2]:
cfg = None
with open('config.yaml', 'r') as config:
    cfg = yaml.safe_load(config)["Lab_2"]

In [3]:
def is_digit(string: str) -> bool:
    if string.isdigit():
       return True
    else:
        try:
            float(string)
            return True
        except ValueError:
            return False

def split_by_spaces(line: str, not_used: list[int], sym_split: str = " ", isDigit: bool = True) -> list[float]:
    line: list[str] = line.split(sym_split)
    line = [s.strip() for s in line.copy() if s != '' and (is_digit(s) if isDigit else True)]
    line = [item for idx, item in enumerate(line) if idx not in not_used]
    return [float(s) if isDigit else s for s in line]

def load_dataset(path: str,
                 columns: list[str],
                 not_used: list[int] = [],
                 sym_split: str = " ",
                 isDigit: bool = True
                 ) -> pd.DataFrame:
    
    with open(path, 'r') as f:
        lines = f.readlines()
    data = [split_by_spaces(line, not_used, sym_split, isDigit) for line in lines]
    df = pd.DataFrame(data, columns=columns)
    
    return df

In [4]:
def analyze_weight(df,
                   alpha = 0.05,
                   critery = stats.kstest,
                   args = [],
                   kwargs = {}
                   ):
    # Точечные оценки среднего и стандартного отклонения
    mean_df = np.mean(df)
    std_df = np.std(df, ddof=1)  # ddof=1 для корректировки стандартного отклонения
    
    results = critery(df, *args, **kwargs)
    stat, p_value = results.statistic, results.pvalue
    n = len(df)
    t_value = stats.t.ppf(1 - alpha / 2, n - 1)
    
    if alpha < p_value:
    # Доверительный интервал для среднего
        mean_ci = (mean_df - t_value * std_df / np.sqrt(n), mean_df + t_value * std_df / np.sqrt(n))
    
    # Доверительный интервал для стандартного отклонения
        std_ci = ((n - 1) * std_df**2 / stats.chi2.ppf(1 - alpha / 2, n - 1),
              (n - 1) * std_df**2 / stats.chi2.ppf(alpha / 2, n - 1))
        
    else:
        mean_ci = None
        std_ci = None
    
    results = {
        "statistic": stat,
        "p_value": p_value,
        "mean_ci": mean_ci,
        "std_ci": std_ci,
    }
    return results

# Dataset "Babyroom"

In [5]:
columns = ["Time of birth recorded on the 24-hour clock",
           "Sex of the child",
           "Birth weight in grams",
           "Number of minutes after midnight of each birth"]

df = load_dataset(cfg['babyroom'], columns, not_used = [])

df["Time of birth recorded on the 24-hour clock"] = df["Time of birth recorded on the 24-hour clock"] / 100

In [6]:
df.head(20)

Unnamed: 0,Time of birth recorded on the 24-hour clock,Sex of the child,Birth weight in grams,Number of minutes after midnight of each birth
0,0.05,1.0,3837.0,5.0
1,1.04,1.0,3334.0,64.0
2,1.18,2.0,3554.0,78.0
3,1.55,2.0,3838.0,115.0
4,2.57,2.0,3625.0,177.0
5,4.05,1.0,2208.0,245.0
6,4.07,1.0,1745.0,247.0
7,4.22,2.0,2846.0,262.0
8,4.31,2.0,3166.0,271.0
9,7.08,2.0,3520.0,428.0


проверка на нормальность весов

In [7]:
property_names = [
                    "statistic",
                    "p_value",
                  ]
keys = ["all", "boy", "girl"]
data = dict.fromkeys(keys, None)
data["all"] = df["Birth weight in grams"]
data["boy"] = df[df["Sex of the child"] == 2]["Birth weight in grams"]
data["girl"] = df[df["Sex of the child"] == 1]["Birth weight in grams"]

In [8]:
all = analyze_weight(data["all"])
boy = analyze_weight(data["boy"])
girl = analyze_weight(data["girl"])

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
result = dict.fromkeys(keys, None)

for key in keys:
    result[key] = [all[key], boy[key], girl[key]]
    
pd.DataFrame(result, index=["all", "boy", "girl"])

KeyError: 'cdf'

**Вывод:** каждая соответствует нормальному распределению т.к p_value > $\alpha = 0.05$

In [None]:
data = df["Number of minutes after midnight of each birth"]
stat, p_value = stats.kstest(data, 'norm', args=(np.mean(data), np.std(data)))
pd.DataFrame({"stat": [stat], "p_value":[p_value]}, index=["Number of minutes after midnight of each birth"])

Unnamed: 0,stat,p_value
Number of minutes after midnight of each birth,0.110417,0.617173


**Вывод:** гипотеза о том, что `Number of minutes after midnight of each birth` соответствует экспоненциальному распределению правдива

In [None]:
num_in_hour = [0] * 24

for time in df["Time of birth recorded on the 24-hour clock"]:
    num_in_hour[trunc(time)] += 1
    
print(num_in_hour)

[1, 3, 1, 0, 4, 0, 0, 2, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 4, 3, 2, 1, 2]


У распределения Пуассона Математическое ожидание и диспрерсия: $\lambda$ и $\lambda^2$

In [None]:
np.mean(num_in_hour), np.std(num_in_hour, ddof=1)

(np.float64(1.8333333333333333), np.float64(1.2394482175782784))

Среднее отклонение и математическое ожидание сильно отличаются => это не распредление Пуассона

#  Датасет "euroweight"

In [None]:
columns = ["weight", "batch"]

df = load_dataset(cfg['euroweight'], columns, not_used = [0], sym_split="\t")

In [None]:
df.head(10)

Unnamed: 0,weight,batch
0,7.512,1.0
1,7.502,1.0
2,7.461,1.0
3,7.562,1.0
4,7.528,1.0
5,7.459,1.0
6,7.518,1.0
7,7.537,1.0
8,7.517,1.0
9,7.605,1.0


**kstest**

In [None]:
bathes = set(df["batch"])
bathes.add("all")
data = dict.fromkeys(bathes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in bathes:
    if key == "all":
        data["all"] = df["weight"]
    else:
        data[key] = df[df["batch"] == key]["weight"]  
    data[key] = analyze_weight(data[key],
                               alpha=0.05, args=["norm"],
                               kwargs={
                                   "args": (np.mean(data[key]), np.std(data[key]))
                                   }
                               )  
        
     
        
for key in keys:
    results[key] = [data[batch][key] for batch in bathes]
    
pd.DataFrame(results, index=list(bathes))



Unnamed: 0,statistic,p_value,mean_ci,std_ci
1.0,0.038766,0.83231,"(7.515375800837306, 7.523936199162694)","(0.000997941600156461, 0.0014189906188519683)"
2.0,0.032487,0.946727,"(7.518747761295764, 7.527588238704238)","(0.0010643111637179115, 0.0015133626623224801)"
3.0,0.077438,0.094581,"(7.504930027624747, 7.514157972375253)","(0.0011596505853189991, 0.0016489274537266117)"
4.0,0.045768,0.654318,"(7.5274397498218875, 7.534768250178113)","(0.0007313872864565761, 0.0010399723772080087)"
5.0,0.035403,0.90149,"(7.527705750509117, 7.535086249490885)","(0.0007418030754424451, 0.0010547827697766443)"
6.0,0.055526,0.409069,"(7.511076559213748, 7.519403440786251)","(0.0009442390480820679, 0.0013426300206065141)"
7.0,0.042779,0.733428,"(7.518905563172776, 7.527126436827222)","(0.0009203502349233712, 0.0013086620992748617)"
8.0,0.06996,0.165003,"(7.512205019334981, 7.521266980665018)","(0.001118308406076351, 0.0015901422858380133)"
all,0.02334,0.222494,"(7.519724882523983, 7.522740117476017)","(0.0011119429197252748, 0.0012587628116088848)"


**shapiro**

In [None]:
bathes = set(df["batch"])
bathes.add("all")
data = dict.fromkeys(bathes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in bathes:
    if key == "all":
        data["all"] = df["weight"]
    else:
        data[key] = df[df["batch"] == key]["weight"]  
    data[key] = analyze_weight(data[key],critery=stats.shapiro)
        
     
        
for key in keys:
    results[key] = [data[batch][key] for batch in bathes]
    
pd.DataFrame(results, index=list(bathes))


Unnamed: 0,statistic,p_value,mean_ci,std_ci
1.0,0.995507,0.6830017,"(7.515375800837306, 7.523936199162694)","(0.000997941600156461, 0.0014189906188519683)"
2.0,0.9909,0.121877,"(7.518747761295764, 7.527588238704238)","(0.0010643111637179115, 0.0015133626623224801)"
3.0,0.863432,4.089444e-14,,
4.0,0.995505,0.6826586,"(7.5274397498218875, 7.534768250178113)","(0.0007313872864565761, 0.0010399723772080087)"
5.0,0.991034,0.1289928,"(7.527705750509117, 7.535086249490885)","(0.0007418030754424451, 0.0010547827697766443)"
6.0,0.984059,0.006756499,,
7.0,0.990701,0.1119834,"(7.518905563172776, 7.527126436827222)","(0.0009203502349233712, 0.0013086620992748617)"
8.0,0.93672,6.827697e-09,,
all,0.975473,5.023277e-18,,


**$\chi^2$**

In [None]:
bathes = set(df["batch"])
bathes.add("all")
data = dict.fromkeys(bathes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in bathes:
    if key == "all":
        data["all"] = df["weight"]
    else:
        data[key] = df[df["batch"] == key]["weight"]  
    data[key] = analyze_weight(data[key],critery=stats.chisquare)
        
     
        
for key in keys:
    results[key] = [data[batch][key] for batch in bathes]
    
pd.DataFrame(results, index=list(bathes))


Unnamed: 0,statistic,p_value,mean_ci,std_ci
1.0,0.039097,1.0,"(7.515375800837306, 7.523936199162694)","(0.000997941600156461, 0.0014189906188519683)"
2.0,0.041678,1.0,"(7.518747761295764, 7.527588238704238)","(0.0010643111637179115, 0.0015133626623224801)"
3.0,0.045493,1.0,"(7.504930027624747, 7.514157972375253)","(0.0011596505853189991, 0.0016489274537266117)"
4.0,0.02861,1.0,"(7.5274397498218875, 7.534768250178113)","(0.0007313872864565761, 0.0010399723772080087)"
5.0,0.029017,1.0,"(7.527705750509117, 7.535086249490885)","(0.0007418030754424451, 0.0010547827697766443)"
6.0,0.037015,1.0,"(7.511076559213748, 7.519403440786251)","(0.0009442390480820679, 0.0013426300206065141)"
7.0,0.036041,1.0,"(7.518905563172776, 7.527126436827222)","(0.0009203502349233712, 0.0013086620992748617)"
8.0,0.043829,1.0,"(7.512205019334981, 7.521266980665018)","(0.001118308406076351, 0.0015901422858380133)"
all,0.314133,1.0,"(7.519724882523983, 7.522740117476017)","(0.0011119429197252748, 0.0012587628116088848)"


**cramer**

In [None]:
bathes = set(df["batch"])
bathes.add("all")
data = dict.fromkeys(bathes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in bathes:
    if key == "all":
        data["all"] = df["weight"]
    else:
        data[key] = df[df["batch"] == key]["weight"]  
        
    data[key] = analyze_weight(data[key],
                               alpha=0.05, args=["norm"],
                               critery=stats.cramervonmises,
                               kwargs={
                                   "args": (np.mean(data[key]), np.std(data[key]))
                                   }
                               ) 
        
     
        
for key in keys:
    results[key] = [data[batch][key] for batch in bathes]
    
pd.DataFrame(results, index=list(bathes))


Unnamed: 0,statistic,p_value,mean_ci,std_ci
1.0,0.063642,0.791521,"(7.515375800837306, 7.523936199162694)","(0.000997941600156461, 0.0014189906188519683)"
2.0,0.044312,0.910156,"(7.518747761295764, 7.527588238704238)","(0.0010643111637179115, 0.0015133626623224801)"
3.0,0.323922,0.115869,"(7.504930027624747, 7.514157972375253)","(0.0011596505853189991, 0.0016489274537266117)"
4.0,0.067563,0.767051,"(7.5274397498218875, 7.534768250178113)","(0.0007313872864565761, 0.0010399723772080087)"
5.0,0.040035,0.93332,"(7.527705750509117, 7.535086249490885)","(0.0007418030754424451, 0.0010547827697766443)"
6.0,0.16971,0.334883,"(7.511076559213748, 7.519403440786251)","(0.0009442390480820679, 0.0013426300206065141)"
7.0,0.055091,0.845284,"(7.518905563172776, 7.527126436827222)","(0.0009203502349233712, 0.0013086620992748617)"
8.0,0.223537,0.226211,"(7.512205019334981, 7.521266980665018)","(0.001118308406076351, 0.0015901422858380133)"
all,0.181918,0.305435,"(7.519724882523983, 7.522740117476017)","(0.0011119429197252748, 0.0012587628116088848)"


**Вывод:** нормальному распределению соответствуют  все

# Датасет Iris

In [None]:
columns = ["sepal length", "sepal width", "petal length", "petal width", "class"]

df = load_dataset(cfg['iris'], columns, not_used = [], sym_split=",", isDigit=False)

In [None]:
classes = set(df["class"])
data = dict.fromkeys(classes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in classes:
    data[key] = df[df["class"] == key]["sepal length"]  
    data[key] = np.float64(data[key])
    data[key] = analyze_weight(data[key],
                               alpha=0.05, args=["norm"],
                               kwargs={
                                   "args": (np.mean(data[key]), np.std(data[key]))
                                   }
                               )  
        
     
        
for key in keys:
    results[key] = [data[c][key] for c in classes]
    
pd.DataFrame(results, index=list(classes))



Unnamed: 0,statistic,p_value,mean_ci,std_ci
Iris-setosa,0.113818,0.500501,"(4.905823539299264, 5.106176460700737)","(0.086698814392647, 0.19293982309232224)"
Iris-virginica,0.115589,0.480893,"(6.407285019117501, 6.768714980882495)","(0.28214353500191136, 0.6278831390170433)"
Iris-versicolor,0.097907,0.687322,"(5.789305783106826, 6.082694216893174)","(0.18591215164726593, 0.4137295060009336)"


In [None]:
classes = set(df["class"])
data = dict.fromkeys(classes, None)

keys = ["statistic", "p_value", "mean_ci", "std_ci"]
results = dict.fromkeys(keys, None)

for key in classes:
    data[key] = df[df["class"] == key]["sepal length"]  
    data[key] = np.float64(data[key])
    data[key] = analyze_weight(data[key],
                               critery=stats.shapiro
                               )  
        
     
        
for key in keys:
    results[key] = [data[c][key] for c in classes]
    
pd.DataFrame(results, index=list(classes))



Unnamed: 0,statistic,p_value,mean_ci,std_ci
Iris-setosa,0.977699,0.459513,"(4.905823539299264, 5.106176460700737)","(0.086698814392647, 0.19293982309232224)"
Iris-virginica,0.971179,0.258315,"(6.407285019117501, 6.768714980882495)","(0.28214353500191136, 0.6278831390170433)"
Iris-versicolor,0.977836,0.464737,"(5.789305783106826, 6.082694216893174)","(0.18591215164726593, 0.4137295060009336)"


**Вывод:** Все нормальное распределение