In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
df_1 = pd.read_csv("homework_3.1.csv")
df_1.head(10)

Unnamed: 0.1,Unnamed: 0,time,value1,value2,value3
0,0,0,1.764052,1.883151,-0.369182
1,1,1,0.420157,-1.327759,-0.219379
2,2,2,1.018738,-1.230485,1.13966
3,3,3,2.300893,1.029397,0.715264
4,4,4,1.947558,-1.093123,0.720132
5,5,5,-0.877278,2.043621,-1.516956
6,6,6,1.070088,-0.293619,0.095674
7,7,7,-0.011357,-0.607455,-0.598031
8,8,8,0.056781,2.082942,0.439925
9,9,9,0.590599,1.660515,0.08185


In [3]:
df_1["event_50"] = (df_1["time"] == 50).astype(int)

def run_point_jump_model(y_column):
    X = sm.add_constant(df_1[["time", "event_50"]])
    y = df_1[y_column]
    model = sm.OLS(y, X).fit()
    return model.rsquared, model.params

results = {}
for col in ["value1", "value2","value3"]:
    r2, params = run_point_jump_model(col)
    results[col] = {"R_squared": r2, "params": params}

results_df_1 = pd.DataFrame(results).T.sort_values(by="R_squared", ascending = False)
print(results_df_1)

       R_squared                                             params
value3  0.734688  const      -0.945642
time        0.062791
even...
value1  0.650851  const      -0.629894
time        0.056630
even...
value2  0.531161  const      -0.115035
time        0.039012
even...


In [4]:
df_2a = pd.read_csv("homework_3.2.a.csv")
df_2b = pd.read_csv("homework_3.2.b.csv")

In [5]:
df_2a.head(5)

Unnamed: 0.1,Unnamed: 0,group1,time1,outcome1
0,0,0,0,0.882026
1,1,0,1,1.600079
2,2,0,0,0.489369
3,3,0,1,2.520447
4,4,0,0,0.933779


In [6]:
df_2b.head(5)

Unnamed: 0.1,Unnamed: 0,group2,time2,outcome2
0,0,0,0,0.667155
1,1,0,1,2.470969
2,2,0,0,-0.506778
3,3,0,1,1.525657
4,4,0,0,0.273664


In [7]:
df_2a["interaction"] = df_2a["group1"] * df_2a["time1"]
df_2b["interaction"] = df_2b["group2"] * df_2b["time2"]

In [8]:
import statsmodels.formula.api as smf
model1 = smf.ols("outcome1 ~ group1 + time1 + interaction", data=df_2a).fit()
model2 = smf.ols("outcome2 ~ group2 + time2 + interaction", data=df_2b).fit()


print(f"Dataset 1 Treatment Effect: {model1.params['interaction']}")


print(f"Dataset 2 Treatment Effect: {model2.params['interaction']}")

Dataset 1 Treatment Effect: 0.6858469689928821
Dataset 2 Treatment Effect: 1.3498589246939923


In [9]:
treatment1 = model1.params["interaction"]
se1 = model1.bse["interaction"]
pval1 = model1.pvalues["interaction"]

treatment2 = model2.params["interaction"]
se2 = model2.bse["interaction"]
pval2 = model2.pvalues["interaction"]
print(f"Model 1 p value : {pval1}, Model 2 p value: {pval2}")
print(f"Model 1 standard error : {se1}, Model 2 standard error : {se2}")
print(f"Model 1 treatment effect : {treatment1}, Model 2 treatment effect : {treatment2}")

Model 1 p value : 1.640050297911803e-26, Model 2 p value: 2.432436181923841e-19
Model 1 standard error : 0.06252245270152706, Model 2 standard error : 0.1470470684853622
Model 1 treatment effect : 0.6858469689928821, Model 2 treatment effect : 1.3498589246939923


In [None]:
df_2a["time1_squared"] = df_2a["time1"] ** 2
df_2a["interaction_squared"] = df_2a["group1"] * df_2a["time1_squared"]


df_2b["time2_squared"] = df_2b["time2"] ** 2
df_2b["interaction_squared"] = df_2b["group2"] * df_2b["time2_squared"]


In [11]:
model1 = smf.ols("outcome1 ~ group1 + time1 + time1_squared + interaction + interaction_squared", data=df_2a).fit()
model2 = smf.ols("outcome2 ~ group2 + time2 + time2_squared + interaction + interaction_squared", data=df_2b).fit()
print(f"Dataset 1 2nd Derivative Change: {model1.params['interaction_squared']}")
print(f"Dataset 2 2nd Derivative Change: {model2.params['interaction_squared']}")


Dataset 1 2nd Derivative Change: 0.34292348449644705
Dataset 2 2nd Derivative Change: 0.6749294623470038


In [12]:
import pandas as pd

data = {
    'farm': ['Farm A', 'Farm A', 'Farm A', 'Farm B', 'Farm B', 'Farm B'],
    'time_period': ['before', 'after', 'after', 'before', 'after', 'after'],
    'group': ['treatment', 'treatment', 'treatment', 'control', 'control', 'control'],
    'milk_per_cow': [22.0, 24.0, 24.1, 22.1, 22.2, 22.0]
}

df = pd.DataFrame(data)


In [13]:
df

Unnamed: 0,farm,time_period,group,milk_per_cow
0,Farm A,before,treatment,22.0
1,Farm A,after,treatment,24.0
2,Farm A,after,treatment,24.1
3,Farm B,before,control,22.1
4,Farm B,after,control,22.2
5,Farm B,after,control,22.0


In [14]:
df['post'] = (df['time_period'] == 'after').astype(int)
df['treated'] = (df['group'] == 'treatment').astype(int)
df['interaction'] = df['post'] * df['treated']


In [15]:
import statsmodels.formula.api as smf

model = smf.ols("milk_per_cow ~ treated + post + interaction", data=df).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           milk_per_cow   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     138.9
Date:                Thu, 12 Jun 2025   Prob (F-statistic):            0.00716
Time:                        15:05:42   Log-Likelihood:                 7.9283
No. Observations:                   6   AIC:                            -7.857
Df Residuals:                       2   BIC:                            -8.690
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      22.1000      0.112    197.668      

  warn("omni_normtest is not valid with less than 8 observations; %i "
