In [5]:

import pandas as pd
import statsmodels.formula.api as smf

# Load data
df = pd.read_csv('simulated_slowness_data.csv')

# One-hot encode categorical features
df_encoded = pd.get_dummies(df, columns=['User_region', 'CPU_type'], drop_first=True)
df_encoded.columns = df_encoded.columns.str.replace(" ", "_")

# Fit OLS regression model
model = smf.ols(
    "slowness ~ treatment + User_region_US_East + User_region_US_West + CPU_type_Intel_i7 + CPU_type_Ryzen_5",
    data=df_encoded.rename(columns={
        'User_region_US-East': 'User_region_US_East',
        'User_region_US-West': 'User_region_US_West'
    })
).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:               slowness   R-squared:                       0.609
Model:                            OLS   Adj. R-squared:                  0.607
Method:                 Least Squares   F-statistic:                     309.5
Date:                Thu, 07 Aug 2025   Prob (F-statistic):          9.07e-200
Time:                        15:59:24   Log-Likelihood:                 191.35
No. Observations:                1000   AIC:                            -370.7
Df Residuals:                     994   BIC:                            -341.2
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [9]:
df_encoded.head()

Unnamed: 0,Computer_type,region_num,cpu_num,treatment,slowness,User_region_US-East,User_region_US-West,CPU_type_Intel_i7,CPU_type_Ryzen_5,propensity_score
0,Desktop,2,2,0,0.837573,False,False,False,True,0.36679
1,VM,0,2,1,0.983933,True,False,False,True,0.368884
2,VM,2,2,1,1.024601,False,False,False,True,0.36679
3,Desktop,2,2,0,0.815156,False,False,False,True,0.36679
4,Desktop,0,0,0,1.015274,True,False,False,False,0.38974


In [6]:
from sklearn.linear_model import LogisticRegression

X_psm = df_encoded[["User_region_US-East", "User_region_US-West", "CPU_type_Intel_i7", "CPU_type_Ryzen_5"]]
y_psm = df_encoded["treatment"]

ps_model = LogisticRegression(solver="liblinear")
ps_model.fit(X_psm, y_psm)

# Save the predicted probabilities
df_encoded["propensity_score"] = ps_model.predict_proba(X_psm)[:, 1]


In [7]:
from sklearn.neighbors import NearestNeighbors

treated = df_encoded[df_encoded["treatment"] == 1]
control = df_encoded[df_encoded["treatment"] == 0]

nn = NearestNeighbors(n_neighbors=1)
nn.fit(control[["propensity_score"]])
_, indices = nn.kneighbors(treated[["propensity_score"]])

matched_control = control.iloc[indices.flatten()]
matched_control.index = treated.index  # align index for subtraction


In [21]:
df_encoded.shape,treated.shape, control.shape

((1000, 10), (386, 10), (614, 10))

In [18]:
len(indices), len(treated[["propensity_score"]])

(386, 386)

In [17]:
control

Unnamed: 0,Computer_type,region_num,cpu_num,treatment,slowness,User_region_US-East,User_region_US-West,CPU_type_Intel_i7,CPU_type_Ryzen_5,propensity_score
0,Desktop,2,2,0,0.837573,False,False,False,True,0.366790
3,Desktop,2,2,0,0.815156,False,False,False,True,0.366790
4,Desktop,0,0,0,1.015274,True,False,False,False,0.389740
5,Desktop,0,2,0,0.618470,True,False,False,True,0.368884
6,Desktop,2,1,0,0.715950,False,False,True,False,0.379375
...,...,...,...,...,...,...,...,...,...,...
990,Desktop,1,1,0,0.534185,False,True,True,False,0.403266
992,Desktop,1,2,0,0.903363,False,True,False,True,0.390388
993,Desktop,2,0,0,1.071568,False,False,False,False,0.387600
997,Desktop,2,1,0,1.241284,False,False,True,False,0.379375


In [14]:
matched_control

Unnamed: 0,Computer_type,region_num,cpu_num,treatment,slowness,User_region_US-East,User_region_US-West,CPU_type_Intel_i7,CPU_type_Ryzen_5,propensity_score
1,Desktop,0,2,0,0.344416,True,False,False,True,0.368884
2,Desktop,2,2,0,0.837573,False,False,False,True,0.366790
7,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
16,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
18,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
...,...,...,...,...,...,...,...,...,...,...
991,Desktop,2,1,0,1.230187,False,False,True,False,0.379375
994,Desktop,1,0,0,0.919535,False,True,False,False,0.411666
995,Desktop,1,0,0,0.919535,False,True,False,False,0.411666
996,Desktop,1,2,0,1.109367,False,True,False,True,0.390388


In [8]:
matched_control

Unnamed: 0,Computer_type,region_num,cpu_num,treatment,slowness,User_region_US-East,User_region_US-West,CPU_type_Intel_i7,CPU_type_Ryzen_5,propensity_score
1,Desktop,0,2,0,0.344416,True,False,False,True,0.368884
2,Desktop,2,2,0,0.837573,False,False,False,True,0.366790
7,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
16,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
18,Desktop,1,1,0,1.106416,False,True,True,False,0.403266
...,...,...,...,...,...,...,...,...,...,...
991,Desktop,2,1,0,1.230187,False,False,True,False,0.379375
994,Desktop,1,0,0,0.919535,False,True,False,False,0.411666
995,Desktop,1,0,0,0.919535,False,True,False,False,0.411666
996,Desktop,1,2,0,1.109367,False,True,False,True,0.390388


In [20]:
att_psm = (treated["slowness"] - matched_control["slowness"]).mean()
att_psm

0.22461344448521733