# Библиотеки

In [2]:
from zlib import crc32

import numpy as np
import pandas as pd
import scipy.stats as st
import re
import statsmodels.api as sm
from sklearn.metrics import accuracy_score

# Код генерации выборки

In [3]:
# Accepts only addresses of the form <local-part>@phystech.edu.
# The dot is escaped (an unescaped "." matches any character) and the pattern
# is anchored at the end with \Z, so look-alikes such as "user@phystechXedu"
# or "user@phystech.edu.example.com" are rejected. The start is anchored by
# calling .match() on this pattern.
EMAIL_REGEX = re.compile(r"[^@]+@phystech\.edu\Z")

def generate_dataset(code):
    """Build a reproducible synthetic binary-classification sample.

    The seed ``code`` fully determines the result. One of three weight
    vectors is drawn at random: all zeros, ones on the first five features,
    or ones on the last five features. A latent score is the weighted sum of
    ten standard-normal features plus standard-normal noise; rows from index
    100 onward receive an extra noise term (normal draw times uniform draw).
    The label is 1.0 where the score is non-negative and 0.0 otherwise.

    Parameters
    ----------
    code : seed passed to ``np.random.RandomState``.

    Returns
    -------
    numpy.ndarray of shape (1000, 11): label in column 0, features in
    columns 1..10.
    """
    rng = np.random.RandomState(code)

    first_half_active = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    weight_variants = {
        0: [0] * 10,
        1: first_half_active,
        2: first_half_active[::-1],
    }
    w = weight_variants[rng.randint(low=0, high=3)]

    # The order of the random draws below must not change: it defines the data.
    features = rng.randn(1000, 10)
    score = np.dot(features, w) + rng.randn(1000) * 1.0
    extra_normal = rng.randn(900)
    extra_uniform = rng.uniform(size=900)
    score[100:] += 1 * extra_normal * extra_uniform

    labels = 1.0 * (score >= 0)
    return np.hstack([labels.reshape(1000, 1), features])

# Генерация выборки для вашей почты

<span style="color:red">
    ВАЖНО!
    Почта, которую Вы укажете ниже, и почта, с которой Вы отправляете домашнее задание, должны совпадать!
    В момент проверки задания алгоритм генерирует выборку на основе почты из анкеты!
</span>

Внимательно проверьте почту, для которой выполняется задание!

In [4]:
# Ask for the student's e-mail, validate its domain, and derive the personal
# dataset from it: the CRC32 of the address is used as the generator seed.
task = {'mail': input(prompt='Enter your mail: ')}
assert EMAIL_REGEX.match(task['mail']), 'Not a mail in the phystech.edu domain'
task.update(id=crc32(task['mail'].encode('utf-8')))
task.update(data=generate_dataset(task['id']))

task

Enter your mail: bogdanov.ai@phystech.edu


{'mail': 'bogdanov.ai@phystech.edu',
 'id': 1162440694,
 'data': array([[ 1.        ,  1.08945958,  0.32178445, ...,  2.66738059,
         -0.45650835, -0.35949748],
        [ 1.        , -1.39968299, -1.59509462, ..., -0.03786994,
          1.55498625, -0.66885577],
        [ 1.        ,  0.3648031 , -0.64772248, ...,  1.80220501,
          0.96381658, -0.06935707],
        ...,
        [ 0.        , -0.73637306, -0.89386267, ...,  0.60072037,
          1.20160435,  0.23308067],
        [ 0.        , -0.34561843, -0.66330805, ...,  0.62327406,
         -1.20261697,  0.37706688],
        [ 1.        , -0.86797526,  1.52408046, ..., -0.62932746,
          1.00534656,  0.23907955]])}

# Работа с выборкой

In [5]:
# Raw array returned by generate_dataset: response in column 0, features in the remaining columns.
data = task['data']

## Постройте модель логистической регрессии Y от X и свободного коэффициента. Каково отношение шансов Y=1 к событию f(X) = 1, где f(X) - отклик?

In [6]:
# Wrap the sample in a labelled frame: the response 'Y' first, then features 'X0'..'X9'.
column_names = ['Y', *(f'X{i}' for i in range(10))]
data = pd.DataFrame(data).set_axis(column_names, axis=1)

In [7]:
# Separate the binary response from the feature columns.
y = data['Y']
X = data.drop(columns='Y')

In [8]:
# Logistic regression of Y on all features plus an intercept column ('const').
logit_spec = sm.Logit(endog=y, exog=sm.add_constant(X))
model = logit_spec.fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.328067
         Iterations 8


0,1,2,3
Dep. Variable:,Y,No. Observations:,1000.0
Model:,Logit,Df Residuals:,989.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 19 Apr 2024",Pseudo R-squ.:,0.5266
Time:,18:41:30,Log-Likelihood:,-328.07
converged:,True,LL-Null:,-693.05
Covariance Type:,nonrobust,LLR p-value:,2.313e-150

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0603,0.099,0.609,0.542,-0.134,0.254
X0,1.4791,0.133,11.153,0.000,1.219,1.739
X1,1.6276,0.134,12.126,0.000,1.365,1.891
X2,1.5593,0.130,11.983,0.000,1.304,1.814
X3,1.4852,0.133,11.129,0.000,1.224,1.747
X4,1.5596,0.138,11.324,0.000,1.290,1.829
X5,-0.0476,0.099,-0.482,0.630,-0.241,0.146
X6,0.1210,0.098,1.233,0.218,-0.071,0.313
X7,-0.0121,0.093,-0.130,0.897,-0.194,0.170


In [9]:
# Share of observed positives in the sample.
p_endog = y.sum() / len(y)

# Predicted probabilities, rounded to the nearest integer to obtain 0/1 responses.
f = model.predict(sm.add_constant(X))
response = [round(prob) for prob in f]
p_resp = response.count(1) / len(response)

# Odds of Y = 1 divided by the odds of a predicted response of 1.
odds_endog = p_endog / (1 - p_endog)
OR = odds_endog * (1 - p_resp) / p_resp

print(f"{OR:.2f}")

1.02


## Какова точность (Accuracy) классификации модели при пороге вероятности для классификации p0 = 0.5 ?

In [11]:
# Share of observations whose rounded prediction equals the true label.
accuracy = accuracy_score(y, response)
print(f"Точность: {accuracy.round(2)}")

Точность: 0.84


## Постройте модель логистической регрессии Y от X и свободного коэффициента для первых 100 элементов выборки. Имеет ли смысл данная модель согласно критерию Вальда? (p-value=0.05)

In [12]:
# Model fitted on the first 100 observations only, with an intercept column.
# NOTE(review): the section heading asks for a logistic regression, but this
# fits ordinary least squares (sm.OLS). Confirm which model is intended before
# relying on the tests that use model100.
y_head = y[:100]
X_head = sm.add_constant(X[:100])
model100 = sm.OLS(endog=y_head, exog=X_head).fit()
model100.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.631
Model:,OLS,Adj. R-squared:,0.59
Method:,Least Squares,F-statistic:,15.23
Date:,"Fri, 19 Apr 2024",Prob (F-statistic):,1.8e-15
Time:,18:41:48,Log-Likelihood:,-21.983
No. Observations:,100,AIC:,65.97
Df Residuals:,89,BIC:,94.62
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5571,0.033,16.946,0.000,0.492,0.622
X0,0.2084,0.030,6.842,0.000,0.148,0.269
X1,0.1768,0.030,5.870,0.000,0.117,0.237
X2,0.2098,0.032,6.469,0.000,0.145,0.274
X3,0.1650,0.041,3.988,0.000,0.083,0.247
X4,0.2516,0.042,5.926,0.000,0.167,0.336
X5,-0.0159,0.032,-0.505,0.615,-0.079,0.047
X6,0.0226,0.035,0.654,0.515,-0.046,0.091
X7,0.0295,0.032,0.921,0.360,-0.034,0.093

0,1,2,3
Omnibus:,3.414,Durbin-Watson:,1.951
Prob(Omnibus):,0.181,Jarque-Bera (JB):,1.968
Skew:,-0.001,Prob(JB):,0.374
Kurtosis:,2.313,Cond. No.,1.89


In [13]:
# Joint Wald test of H0: every feature coefficient is zero.
# The decision is read from the Wald test result itself rather than from a
# separate model attribute, so it always refers to the hypothesis stated here.
wald_all = model100.wald_test('X0 = X1 = X2 = X3 = X4 = X5 = X6 = X7 = X8 = X9 = 0', scalar=False)
# With scalar=False the p-value may come back as an array; collapse it to a float.
wald_all_pvalue = float(np.squeeze(wald_all.pvalue))
# The model is meaningful if H0 is rejected at the 0.05 level.
print(wald_all_pvalue < 0.05)

True


## Можно ли обнулить первые четыре переменные согласно критерию Вальда? (p-value=0.05)

In [14]:
# Joint Wald test of H0: the first four feature coefficients are all zero.
# The p-value must come from this test's own result: the model-level F-test
# p-value refers to a different hypothesis (all slopes equal to zero).
wald_first_four = model100.wald_test('X0 = X1 = X2 = X3 = 0', scalar=False)
# With scalar=False the p-value may come back as an array; collapse it to a float.
wald_first_four_pvalue = float(np.squeeze(wald_first_four.pvalue))
# The variables may be dropped only if H0 is NOT rejected at the 0.05 level.
print(wald_first_four_pvalue >= 0.05)

False
