In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import  LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

In [2]:
# Seed NumPy's global generator so every draw in this cell is repeatable.
np.random.seed(42)

n = 1000

# All four random components are N(1, 1). The draw order (W, the X noise,
# Z, error) is kept fixed so the seeded results stay the same.
W = np.random.normal(loc=1, scale=1, size=n)
X = W + np.random.normal(loc=1, scale=1, size=n)
Z = np.random.normal(loc=1, scale=1, size=n)
error = np.random.normal(loc=1, scale=1, size=n)

# Outcome: every regressor enters with a unit coefficient.
Y = X + Z + W + error

# `error` is drawn separately from everything that builds X, so this
# sample correlation is the baseline case.
correlation_X_error, _ = pearsonr(X, error)
print(correlation_X_error)

-0.04937193125719182


In [3]:
# Reuses `n` and `W` from the previous cell and rebinds `X`.
# Both X and the error term are built as W plus their own N(0, 1) draw,
# so W is a shared component of the two series.
noise = np.random.normal(loc=0, scale=1, size=n)
epsilon = np.random.normal(loc=0, scale=1, size=n)

X = W + noise
error_term = W + epsilon

corr, _ = pearsonr(X, error_term)
print(corr)

0.45800211823124604


In [5]:
# Load the homework data set; the path is relative to the notebook's
# working directory.
df = pd.read_csv("homework_7.1.csv")

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,X,W,Z,Y
0,0,1.137055,1.221768,0.327829,1.944532
1,1,-0.112905,0.465835,0.59965,0.655514
2,2,2.077755,1.795414,-0.063393,5.934411
3,3,0.456373,-0.512159,1.177413,-0.188064
4,4,-1.012402,0.080002,-0.275697,-0.533775


In [6]:
# Fit Y ~ X + Z separately inside narrow bands of W and record the X
# coefficient for each band.
bin_centers = [-1, 0, 1]
bin_width = 0.2  # used as a half-width: band is [center - 0.2, center + 0.2]
coefficients = {}

for center in bin_centers:
    lower, upper = center - bin_width, center + bin_width

    # Series.between is inclusive on both ends by default, matching
    # (W >= lower) & (W <= upper).
    subset = df[df["W"].between(lower, upper)]

    X_subset, y_subset = subset[["X", "Z"]], subset["Y"]
    model = LinearRegression().fit(X_subset, y_subset)

    # coef_ follows the feature order [X, Z]; index 0 is the X coefficient.
    coefficients[center] = model.coef_[0]

print(coefficients)

{-1: np.float64(0.8699370837921339), 0: np.float64(1.4177428595930386), 1: np.float64(1.9343326364400248)}


In [7]:
def make_error(corr_const, num):
    """Generate `num` serially dependent error terms.

    A starting value is drawn from N(0, 1) and is not itself returned.
    Each returned term is then built from its predecessor as

        e_t = corr_const * e_{t-1} + (1 - corr_const) * N(0, 1)

    All draws come from NumPy's global random state, one scalar at a time.

    Parameters
    ----------
    corr_const : float
        Weight on the previous term; the fresh draw gets 1 - corr_const.
    num : int
        Number of terms to generate.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the `num` generated terms.
    """
    current = np.random.normal(0, 1)  # starting value, excluded from output
    history = []
    for _ in range(num):
        shock = np.random.normal(0, 1)
        current = corr_const * current + (1 - corr_const) * shock
        history.append(current)
    return np.asarray(history)

In [11]:
import statsmodels.api as sm

# Monte Carlo settings: sample size per trial, number of trials, and the
# values of corr_const passed to make_error.
n = 100
trials = 500
corr_values = [0.2, 0.5, 0.8]

results = {}

for corr in corr_values:
    coef_estimates, std_errors = [], []

    for _ in range(trials):
        # Draw order (X errors, Y errors, then Z) is kept fixed so the
        # random stream is consumed the same way on every run.
        error_X = make_error(corr, n)
        error_Y = make_error(corr, n)
        Z = np.random.normal(loc=0, scale=1, size=n)

        X = Z + error_X
        Y = 2 * X + Z + error_Y

        # add_constant prepends the intercept, so columns are [const, X, Z].
        X_design = sm.add_constant(np.column_stack((X, Z)))
        model = sm.OLS(Y, X_design).fit()

        # Index 1 is the X column: its estimate and its reported std. error.
        coef_estimates.append(model.params[1])
        std_errors.append(model.bse[1])

    # Spread of the estimates across trials vs. the average reported
    # standard error, and the ratio of the two.
    empirical_sd = np.std(coef_estimates)
    average_se = np.mean(std_errors)
    results[corr] = {
        "std_of_estimates": empirical_sd,
        "mean_std_error": average_se,
        "ratio": empirical_sd / average_se,
    }

results

{0.2: {'std_of_estimates': np.float64(0.10800787274892333),
  'mean_std_error': np.float64(0.10222266481784854),
  'ratio': np.float64(1.0565941803746117)},
 0.5: {'std_of_estimates': np.float64(0.13154489442823616),
  'mean_std_error': np.float64(0.1007595874045566),
  'ratio': np.float64(1.30553228547944)},
 0.8: {'std_of_estimates': np.float64(0.2538145705681364),
  'mean_std_error': np.float64(0.1013609518676083),
  'ratio': np.float64(2.504066564998857)}}