In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

In [4]:
# Read the first homework dataset from the working directory into a DataFrame.
df_1_1 = pd.read_csv(filepath_or_buffer='homework_1.1.csv')

In [5]:
# Regress Y on X1, X2 and X3 jointly. X, y and model stay as notebook-level
# names because the following cells read them.
X = df_1_1.loc[:, ['X1', 'X2', 'X3']]
y = df_1_1.loc[:, 'Y']
model = LinearRegression()
model.fit(X=X, y=y)



In [6]:
# Compare each predictor's slope when it is the only regressor against its
# slope in the joint fit held in `model`.
feature_names = ['X1', 'X2', 'X3']

# Joint-model slopes keyed by predictor name.
# NOTE(review): assumes `model` was fit on the columns in this same order.
multi_coefs = {name: coef for name, coef in zip(feature_names, model.coef_)}

# One single-predictor regression per column; only its slope is kept.
simple_coefs = {
    name: LinearRegression().fit(df_1_1[[name]], y).coef_[0]
    for name in feature_names
}

# Absolute gap between the two slopes for every predictor.
coef_diffs = {
    name: abs(simple_coefs[name] - multi_coefs[name])
    for name in feature_names
}

# Predictor whose slope moves the most between the two specifications.
greatest_xi = max(coef_diffs, key=lambda name: coef_diffs[name])

print('Simple regression coefficients:', simple_coefs)
print('Multiple regression coefficients:', multi_coefs)
print('Absolute differences:', coef_diffs)
print(f"Xi with greatest difference: {greatest_xi}")

Simple regression coefficients: {'X1': np.float64(1.8417610991461835), 'X2': np.float64(4.083612579373998), 'X3': np.float64(3.097041202049842)}
Multiple regression coefficients: {'X1': np.float64(1.0071376550530298), 'X2': np.float64(1.964568594874394), 'X3': np.float64(2.9754885351634344)}
Absolute differences: {'X1': np.float64(0.8346234440931537), 'X2': np.float64(2.119043984499604), 'X3': np.float64(0.12155266688640776)}
Xi with greatest difference: X2


In [7]:
# Refit the three-predictor model with statsmodels so that a t-statistic is
# available for every coefficient.
X_with_const = sm.add_constant(X)
ols_spec = sm.OLS(y, X_with_const)
model_sm = ols_spec.fit()

# Keep only the predictors' t-statistics; the intercept entry ('const') is dropped.
t_stats = model_sm.tvalues.drop(labels='const')
# Rank by magnitude so the sign of the effect does not matter.
most_significant_xi = abs(t_stats).idxmax()

print('t-statistics for each Xi:', t_stats, sep='\n')
print(f"Most significant Xi (by absolute t-statistic): {most_significant_xi}")

t-statistics for each Xi:
X1     60.984011
X2     53.283212
X3    196.645240
dtype: float64
Most significant Xi (by absolute t-statistic): X3


In [8]:
# Read the second homework dataset from the working directory into a DataFrame.
df_1_2 = pd.read_csv(filepath_or_buffer='homework_1.2.csv')

In [9]:

# Partition the rows by the value of X. Each group gets a fresh 0..n-1 index so
# the positional results from the neighbor search can be used as row labels.
X0 = df_1_2.loc[df_1_2['X'] == 0].reset_index(drop=True)
X1 = df_1_2.loc[df_1_2['X'] == 1].reset_index(drop=True)

# Index the X=1 rows by their Z value for single-nearest-neighbor lookup.
nn = NearestNeighbors(n_neighbors=1).fit(X1[['Z']])

# For every X=0 row: distance to, and position of, its closest X=1 row in Z.
match_distances, match_indices = nn.kneighbors(X0[['Z']])

# One record per X=0 row describing the pair it was matched into.
matches = [
    {
        'X0_index': X0.index[row],
        'X0_Z': X0.loc[row, 'Z'],
        'Matched_X1_index': X1.index[neighbor],
        'Matched_X1_Z': X1.loc[neighbor, 'Z'],
        'Distance': gap,
    }
    for row, (neighbor, gap) in enumerate(
        zip(match_indices.ravel(), match_distances.ravel())
    )
]

matches_df = pd.DataFrame(matches)
print(matches_df.head())


   X0_index      X0_Z  Matched_X1_index  Matched_X1_Z  Distance
0         0  0.548814                17      0.568434  0.019620
1         1  0.602763                38      0.604846  0.002082
2         2  0.544883                17      0.568434  0.023551
3         3  0.423655                41      0.414263  0.009392
4         4  0.528895                17      0.568434  0.039539


In [10]:
# Reorder the matches from the farthest pair to the closest, renumbering rows 0..n-1.
matches_df = matches_df.sort_values('Distance', ascending=False, ignore_index=True)
print(matches_df.head())


   X0_index      X0_Z  Matched_X1_index  Matched_X1_Z  Distance
0        51  0.004695                39      0.120197  0.115501
1        16  0.018790                39      0.120197  0.101407
2        50  0.020108                39      0.120197  0.100089
3         8  0.020218                39      0.120197  0.099978
4        36  0.039188                39      0.120197  0.081009


In [11]:
# Largest nearest-neighbor distance over all matched pairs.
farthest_distance = matches_df['Distance'].max()
print("Farthest match distance:", farthest_distance)

Farthest match distance: 0.11550108502062191
