In [20]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

## Questions 1 - 3

Perform a linear regression to predict Y from X1, X2, and X3. Use the file homework_1.1.csv.

In [21]:
# Read the dataset for Questions 1 - 3 (path is relative to the working directory)
df_1_1 = pd.read_csv(filepath_or_buffer='homework_1.1.csv')

In [22]:
# Multiple regression of Y on X1, X2 and X3 using the rows of df_1_1.
# X and y are reused by later cells (the single-predictor fits and the statsmodels fit).
X = df_1_1.loc[:, ['X1', 'X2', 'X3']]
y = df_1_1.loc[:, 'Y']
model = LinearRegression()
model.fit(X, y)



In [23]:
# Compare each predictor's slope when it is fitted alone against its slope
# in the joint (multiple) regression fitted in the previous cell.
predictors = ['X1', 'X2', 'X3']

# Slopes from the joint model, keyed by predictor name
multi_coefs = dict(zip(predictors, model.coef_))

# Slope from a one-predictor regression of Y on each Xi in turn
simple_coefs = {
    name: LinearRegression().fit(df_1_1[[name]], y).coef_[0]
    for name in predictors
}

# Absolute gap between the two slopes for each predictor
coef_diffs = {
    name: abs(simple_coefs[name] - multi_coefs[name])
    for name in predictors
}

# Predictor whose slope changes the most between the two fits
greatest_xi = max(coef_diffs, key=coef_diffs.get)

print('Simple regression coefficients:', simple_coefs)
print('Multiple regression coefficients:', multi_coefs)
print('Absolute differences:', coef_diffs)
print(f"Xi with greatest difference: {greatest_xi}")

Simple regression coefficients: {'X1': np.float64(1.8417610991461835), 'X2': np.float64(4.083612579373998), 'X3': np.float64(3.097041202049842)}
Multiple regression coefficients: {'X1': np.float64(1.0071376550530298), 'X2': np.float64(1.964568594874394), 'X3': np.float64(2.9754885351634344)}
Absolute differences: {'X1': np.float64(0.8346234440931537), 'X2': np.float64(2.119043984499604), 'X3': np.float64(0.12155266688640776)}
Xi with greatest difference: X2


In [24]:
# Refit the multiple regression with statsmodels, which reports a t-statistic per coefficient.
# add_constant() supplies the intercept column that OLS does not add on its own.
X_with_const = sm.add_constant(X)
model_sm = sm.OLS(endog=y, exog=X_with_const).fit()

# Keep only the predictors' t-statistics (the intercept row is labelled 'const')
t_stats = model_sm.tvalues.drop(labels='const')

# Largest |t| identifies the predictor with the strongest evidence of a non-zero slope
most_significant_xi = t_stats.abs().idxmax()

print('t-statistics for each Xi:')
print(t_stats)
print(f"Most significant Xi (by absolute t-statistic): {most_significant_xi}")

t-statistics for each Xi:
X1     60.984011
X2     53.283212
X3    196.645240
dtype: float64
Most significant Xi (by absolute t-statistic): X3


## Questions 4 & 5

- Use NearestNeighbors to match data based on variables Z, given the file homework_1.2.csv.
- Pick the best match in X = 0 corresponding to each X = 1. Use the Z values to perform the match: a good match for a given X = 1 sample is the X = 0 item whose Z value is closest to that sample's Z value. I suggest using sklearn's NearestNeighbors to do this, but there are many ways to do it.  

In [25]:
# Read the dataset for the matching questions (path is relative to the working directory)
df_1_2 = pd.read_csv(filepath_or_buffer='homework_1.2.csv')

In [26]:

# Partition the data into the X = 0 and X = 1 groups, renumbering rows from 0 in each
X0 = df_1_2.loc[df_1_2['X'].eq(0)].reset_index(drop=True)
X1 = df_1_2.loc[df_1_2['X'].eq(1)].reset_index(drop=True)

# Index the Z values of the X = 0 group, then look up the single closest
# X = 0 row for every X = 1 row
nn = NearestNeighbors(n_neighbors=1).fit(X0[['Z']])
match_distances, match_indices = nn.kneighbors(X1[['Z']])

# One record per X = 1 row: its Z, the matched X = 0 row and that row's Z, and the gap in Z
matches = [
    {
        'X1_index': X1.index[i],
        'X1_Z': X1.loc[i, 'Z'],
        'Matched_X0_index': X0.index[j],
        'Matched_X0_Z': X0.loc[j, 'Z'],
        'Distance': gap,
    }
    for i, (j, gap) in enumerate(zip(match_indices.ravel(), match_distances.ravel()))
]

matches_df = pd.DataFrame(matches)
print(matches_df.head())


   X1_index      X1_Z  Matched_X0_index  Matched_X0_Z  Distance
0         0  0.715189                48      0.716327  0.001138
1         1  0.645894                25      0.653108  0.007214
2         2  0.437587                18      0.437032  0.000555
3         3  0.891773                 9      0.778157  0.113616
4         4  0.963663                 9      0.778157  0.185506


In [27]:
# Unmatched comparison: mean Y of the X = 0 group minus mean Y of the X = 1 group.
# NOTE(review): this is X0 - X1, the opposite sign convention to the matched
# estimate (X1 - matched X0) computed further down; confirm which one is intended.
X0['Y'].mean() - X1['Y'].mean()

np.float64(-0.7938524968121783)

In [28]:
# Rank the matches from the farthest pair to the closest pair and renumber the rows from 0
matches_df = matches_df.sort_values(by='Distance', ascending=False, ignore_index=True)
print(matches_df.head())


   X1_index      X1_Z  Matched_X0_index  Matched_X0_Z  Distance
0        29  0.988374                 9      0.778157  0.210217
1        10  0.978618                 9      0.778157  0.200462
2        37  0.976761                 9      0.778157  0.198604
3        36  0.976459                 9      0.778157  0.198303
4         4  0.963663                 9      0.778157  0.185506


In [29]:
# Largest gap in Z between an X = 1 row and its matched X = 0 row
print("Farthest match distance:", matches_df.loc[:, 'Distance'].max())

Farthest match distance: 0.2102170871093757


In [30]:
# Pull the matched X = 0 row for every X = 1 row (an X = 0 row repeats if it was matched more than once).
# Positional lookup works because X0 was renumbered from 0, so the stored index labels equal row positions.
matched_positions = matches_df['Matched_X0_index'].to_numpy()
df_X0_matched = X0.iloc[matched_positions]

In [31]:
# Z-score of the farthest match distance relative to all match distances.
# Series.std() uses the sample standard deviation (ddof=1).
distances = matches_df['Distance']
farthest_distance = distances.max()
mean_distance = distances.mean()
std_distance = distances.std()
z_score_farthest = (farthest_distance - mean_distance) / std_distance
print(f"Z-score of the farthest match distance: {z_score_farthest:.2f}")

Z-score of the farthest match distance: 2.35


In [32]:
# Matched estimate: mean Y of the X = 1 rows minus mean Y of their matched X = 0 rows
X1['Y'].mean() - df_X0_matched['Y'].mean()

np.float64(0.5433600651913856)

### Questions 6 & 7

In [33]:
# Radius-based neighbour search: for each X = 0 row, find the X = 1 rows whose Z lies within 0.2.
# radius_neighbors() returns a (distances, indices) pair, stored together in `prediction`.
# NOTE(review): n_neighbors=1 only affects kneighbors(); radius_neighbors() does not use it.
nn_2 = NearestNeighbors(n_neighbors=1, radius=0.2).fit(X1[['Z']])
prediction = nn_2.radius_neighbors(X0[['Z']])


In [34]:
# Inspect the first row of the X = 0 group
X0.iloc[0, :]

Unnamed: 0    0.000000
X             0.000000
Y             0.548814
Z             0.548814
Name: 0, dtype: float64

In [35]:
# Inspect a hand-picked set of X = 1 rows by position.
# NOTE(review): these positions look like the radius-0.2 neighbours of the first
# X = 0 row copied from `prediction` -- confirm, and consider deriving them in code.
X1.iloc[
    [0, 1, 2, 5, 12, 14, 17, 18, 19, 21, 22, 23, 26, 27, 28, 33, 38, 41, 42, 44, 46]
]

Unnamed: 0.1,Unnamed: 0,X,Y,Z
0,1,1,1.215189,0.715189
1,5,1,1.145894,0.645894
2,6,1,0.937587,0.437587
5,9,1,0.883442,0.383442
12,22,1,0.961479,0.461479
14,25,1,1.139921,0.639921
17,33,1,1.068434,0.568434
18,35,1,1.117635,0.617635
19,36,1,1.112096,0.612096
21,39,1,1.18182,0.68182


In [36]:
# Radius matching on Z in both directions, with a radius of 0.2.
# NOTE(review): `mean_Y_X0` ends up holding Y values taken from the X = 1 group and
# `mean_Y_X1` holds Y values taken from the X = 0 group, so `effect` below is
# (means of X = 0 neighbours) - (means of X = 1 neighbours). Confirm that this sign
# and this two-direction estimator are what the question intends.
radius = 0.2

# Direction 1: for every X = 0 row, average Y over the X = 1 rows within `radius` of its Z
nn_X0 = NearestNeighbors(radius=radius).fit(X1[['Z']])
indices_X0 = nn_X0.radius_neighbors(X0[['Z']], return_distance=False)
mean_Y_X0 = pd.Series(
    [X1['Y'].iloc[idxs].mean() if len(idxs) > 0 else np.nan for idxs in indices_X0]
).dropna()  # rows with no neighbour inside the radius are dropped

# Direction 2: for every X = 1 row, average Y over the X = 0 rows within `radius` of its Z
nn_X1 = NearestNeighbors(radius=radius).fit(X0[['Z']])
indices_X1 = nn_X1.radius_neighbors(X1[['Z']], return_distance=False)
mean_Y_X1 = pd.Series(
    [X0['Y'].iloc[idxs].mean() if len(idxs) > 0 else np.nan for idxs in indices_X1]
).dropna()  # rows with no neighbour inside the radius are dropped

# Difference between the averages of the two sets of neighbour means
effect = mean_Y_X1.mean() - mean_Y_X0.mean()
print(f"Estimated effect (neighbor mean difference): {effect}")


Estimated effect (neighbor mean difference): -0.33800751142229835


In [37]:
# Count how often X = 0 rows are re-used as radius matches.
# For every X = 1 row, collect all X = 0 rows whose Z lies within `radius`.
# The matcher gets its own name so that the 1-nearest-neighbour model `nn`
# fitted earlier is not silently replaced when cells are re-run out of order.
radius = 0.2
nn_radius = NearestNeighbors(radius=radius)
nn_radius.fit(X0[['Z']])
indices = nn_radius.radius_neighbors(X1[['Z']], return_distance=False)

# Flatten the per-row neighbour lists and tally how many times each X = 0 row appears
# (Counter is imported in the notebook's top import cell)
all_neighbor_indices = [idx for group in indices for idx in group]
counts = Counter(all_neighbor_indices)

# Every appearance of an X = 0 row after its first counts as one duplicate
duplicate_count = sum(count - 1 for count in counts.values() if count > 1)
print(f"Number of duplicates (all but first in each group): {duplicate_count}")


Number of duplicates (all but first in each group): 685


## Homework Reflection 

1. In Coding Quiz 1, you are asked to find the distance of the farthest match in a set.  Is this farthest match distance too far to be a meaningful match?  How can you decide this?

To determine if a distance is too far, you first have to define what too far is. If you have some domain knowledge, you may already have a definition of what may be considered too far. Without domain knowledge, you can rely on statistical analysis to calculate if the distances are outliers. For example, if the furthest match has a z-score higher than 2 or 3, it may be considered an outlier and thus can be deemed too far.  
Using statistical calculations, the z-score for the farthest match is 2.35, which may be considered too far. 

2. In Coding Quiz 1, there are two approaches to matching: 
(A) Picking the best match X = 0 corresponding to each X = 1 using Z values.
(B) Using radius_neighbors to pick all matches X = 0 within a distance of 0.2 of each X = 1.
Invent your own type of matching similar to A and B (or look one up on the internet), which has a different way to pick the matches in X = 0.  Clearly explain the approach you invented or found.

Building on best match and radius neighbors, you could combine the two approaches and calculate points that are close and in a dense region of similar points. This would make the matching process more robust to outliers and sparsely populated regions.
