In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import  LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

In [2]:
np.random.seed(42)

n = 1000
W = np.random.normal(1, 1, n)
X = W + np.random.normal(1, 1, n)
Z = np.random.normal(1, 1, n)
error = np.random.normal(1, 1, n)
Y = X + Z + W + error

correlation_X_error, _ = pearsonr(X, error)
print(correlation_X_error)

-0.04937193125719182


In [3]:
import numpy as np

np.random.seed(0)

N = 1000
W = np.random.normal(0, 1, N)
X = W + np.random.normal(0, 1, N)
Z = np.full(N, 1)
eps = np.random.normal(0, 1, N)

Y = X + Z + W + eps

# Regress Y on X
X_design = np.column_stack((np.ones(N), X))
beta_hat = np.linalg.inv(X_design.T @ X_design) @ (X_design.T @ Y)
Y_hat = X_design @ beta_hat
residuals = Y - Y_hat

# Compute correlation of X and residuals
corr = np.corrcoef(X, residuals)[0,1]
print(f"Correlation between X and residuals: {corr:.4f}")



Correlation between X and residuals: 0.0000


In [17]:
import numpy as np
from sklearn.linear_model import LinearRegression

np.random.seed(0)
N = 1000

W = np.random.normal(0, 1, N)            
X = W + np.random.normal(0, 1, N)           
eps = np.random.normal(0, 1, N)
Y = X + W + eps                             

# Model 1: omit W
model1 = LinearRegression().fit(X.reshape(-1, 1), Y)
print("Coefficient for X (W omitted):", model1.coef_[0])

# Model 2: include W
XW = np.column_stack((X, W))
model2 = LinearRegression().fit(XW, Y)
print("Coefficient for X (W included):", model2.coef_[0])
print("Coefficient for W (W included):", model2.coef_[1])


Coefficient for X (W omitted): 1.4904738930492833
Coefficient for X (W included): 0.9981919455639509
Coefficient for W (W included): 0.965323275097229


In [5]:
noise = np.random.normal(0, 1, n)
epsilon = np.random.normal(0, 1, n)
X = W + noise
error_term = W + epsilon

corr, _ = pearsonr(X, error_term)
print(corr)

0.4871334705471999


In [6]:
df = pd.read_csv("homework_7.1.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,X,W,Z,Y
0,0,1.137055,1.221768,0.327829,1.944532
1,1,-0.112905,0.465835,0.59965,0.655514
2,2,2.077755,1.795414,-0.063393,5.934411
3,3,0.456373,-0.512159,1.177413,-0.188064
4,4,-1.012402,0.080002,-0.275697,-0.533775


In [7]:
coefficients = {}
bin_centers = [-1, 0, 1]
bin_width = 0.2

for center in bin_centers:
    lower = center - bin_width
    upper = center + bin_width
    subset = df[(df["W"] >= lower) & (df["W"] <= upper)]

    X_subset = subset[["X", "Z"]]
    y_subset = subset["Y"]

    model = LinearRegression().fit(X_subset, y_subset)
    coefficients[center] = model.coef_[0]

print(coefficients)

{-1: np.float64(0.8699370837921339), 0: np.float64(1.4177428595930386), 1: np.float64(1.9343326364400248)}


In [8]:
import pandas as pd
import statsmodels.api as sm

# Load the data
df = pd.read_csv('homework_7.1.csv')

# Define the centers and window for W slices
w_centers = [-1, 0, 1]
window = 0.2

print("Coefficient of X for slices of W:\n")

for w0 in w_centers:
    # Select rows where W is close to the center
    df_slice = df[(df['W'] >= w0 - window) & (df['W'] <= w0 + window)]
    if len(df_slice) < 10:
        print(f"Warning: Few samples for W ≈ {w0}. Result may be unstable.")
    # Set up regression: Y ~ X + Z (with intercept)
    X_reg = sm.add_constant(df_slice[['X', 'Z']])
    model = sm.OLS(df_slice['Y'], X_reg).fit()
    coef_x = model.params['X']
    print(f"W ≈ {w0}:   coef(X) = {coef_x:.4f}")

# This will print the coefficient of X for each slice.


Coefficient of X for slices of W:

W ≈ -1:   coef(X) = 0.8699
W ≈ 0:   coef(X) = 1.4177
W ≈ 1:   coef(X) = 1.9343


In [9]:
def make_error(corr_const, num): 

    err = list() 

    prev = np.random.normal(0, 1) 

    for n in range(num): 

        prev = corr_const * prev + (1 - corr_const) * np.random.normal(0, 1) 

        err.append(prev) 

    return np.array(err) 

In [10]:
import statsmodels.api as sm
n = 100
trials = 500
corr_values = [0.2, 0.5, 0.8]

results = {}


for corr in corr_values:
    coef_estimates = []
    std_errors = []

    for _ in range(trials):
        error_X = make_error(corr, n)
        error_Y = make_error(corr, n)

        Z = np.random.normal(0, 1, n)  
        X = Z + error_X  
        Y = 2 * X + Z + error_Y  

        X_design = sm.add_constant(np.column_stack((X, Z)))
        model = sm.OLS(Y, X_design).fit()

        coef_estimates.append(model.params[1])       
        std_errors.append(model.bse[1])              

    results[corr] = {
        "std_of_estimates": np.std(coef_estimates),
        "mean_std_error": np.mean(std_errors),
        "ratio": np.std(coef_estimates) / np.mean(std_errors)
    }

results

{0.2: {'std_of_estimates': np.float64(0.10603887627032757),
  'mean_std_error': np.float64(0.10071318052504205),
  'ratio': np.float64(1.0528798288120917)},
 0.5: {'std_of_estimates': np.float64(0.13012825686008395),
  'mean_std_error': np.float64(0.1020644093905921),
  'ratio': np.float64(1.2749621306492238)},
 0.8: {'std_of_estimates': np.float64(0.23325105039876434),
  'mean_std_error': np.float64(0.10095703268946613),
  'ratio': np.float64(2.310399228117387)}}

In [11]:
# reflection q1
Z = np.random.normal(0, 1, n)
X = 0.5 * Z + np.random.normal(0, 1, n)
# Y depends on X and Z, plus random noise
beta_X = 1
beta_Z = 2
epsilon = np.random.normal(0, 1, n)
Y = beta_X * X + beta_Z * Z + epsilon
# True correlation between X and Y
true_corr = np.corrcoef(X, Y)[0, 1]
true_corr

np.float64(0.7596914265127124)

In [12]:
# Omitting Z model
model_omit = LinearRegression().fit(X.reshape(-1, 1), Y)
coef_omit = model_omit.coef_[0]
coef_omit

np.float64(2.030203565474906)

In [13]:
# Control Z model
XZ = np.column_stack((X, Z))
model_full = LinearRegression().fit(XZ, Y)
coef_full = model_full.coef_[0]
coef_full

np.float64(1.0390797508558818)

In [14]:
import statsmodels.api as sm

In [15]:
# reflection q2
n = 100
runs =  1000
p_values = []

for _ in range(runs):
    W = np.random.normal(0, 1, n)
    X = np.random.normal(0, 1, n)
    Y = 2 * X + np.random.normal(0, 1, n)
    
    predictors = np.column_stack((X, W))
    predictors = sm.add_constant(predictors)
    
    model = sm.OLS(Y, predictors).fit()
    
    p = model.pvalues[2]
    p_values.append(p)


print("Smallest p-value among 1000 simulations:", min(p_values))

Smallest p-value among 1000 simulations: 0.0002955710656396053
