In [6]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
data=pd.read_csv('Datasets/earning.csv')
train = data.sample(frac = 0.7,random_state=617)
test = data.drop(labels = train.index)

In [7]:
reg=ols('earn~age',data=train)
model=reg.fit()

In [8]:
model.summary()

0,1,2,3
Dep. Variable:,earn,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,4.58
Date:,"Thu, 23 Oct 2025",Prob (F-statistic):,0.0325
Time:,09:35:20,Log-Likelihood:,-15398.0
No. Observations:,1250,AIC:,30800.0
Df Residuals:,1248,BIC:,30810.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.092e+04,4131.880,9.904,0.000,3.28e+04,4.9e+04
age,188.1059,87.892,2.140,0.033,15.674,360.538

0,1,2,3
Omnibus:,1429.792,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,224301.557
Skew:,5.471,Prob(JB):,0.0
Kurtosis:,67.706,Cond. No.,127.0


Scikit-Learn

In [10]:
data=pd.read_csv('Datasets/earning.csv')
y=data.earn
X=data.drop('earn',axis=1)

y_binned=pd.cut(x=y, bins=[-1,20000,40000,60000,80000,100000,200000,1000000])
from sklearn.model_selection import train_test_split
X_train_raw,X_test_raw,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=617)

categorical_features=['gender','ethnicity','smokenow']

X_train=pd.get_dummies(X_train_raw,prefix_sep='_',columns=categorical_features,drop_first=True)
X_test=pd.get_dummies(X_test_raw,prefix_sep='_',columns=categorical_features,drop_first=True)

X_test=X_test.reindex(columns=X_train.columns,fill_value=0)
X_train.head()

Unnamed: 0,age,height,weight,education,walk,exercise,tense,gender_male,ethnicity_Asian,ethnicity_Hispanic,ethnicity_White,smokenow_2
574,26,62,124,13,7,1,7,False,False,False,False,True
282,51,69,175,11,8,1,0,True,False,False,True,False
1172,63,66,165,16,7,1,0,False,False,False,True,True
630,33,68,120,16,8,3,0,True,False,False,True,True
867,33,63,103,12,8,1,0,False,False,False,True,False


In [14]:
from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train[['age']],y_train)
y_pred_train=model.predict(X_train[['age']])
y_pred_test=model.predict(X_test[['age']])

In [22]:
from sklearn.metrics import root_mean_squared_error

print(f"The root mean squared error for training dataset is {root_mean_squared_error(y_train,y_pred_train)}\nThe root mean squared error for test dataset is {root_mean_squared_error(y_test,y_pred_test)}")

The root mean squared error for training dataset is 53685.65194044367
The root mean squared error for test dataset is 48115.32375561917


Polynomial Regression

In [36]:
from sklearn.preprocessing import PolynomialFeatures
model_poly=LinearRegression()

poly = PolynomialFeatures(degree=6,include_bias=False)
poly_features= poly.fit_transform(X_train[['age']])
poly_features_test= poly.transform(X_test[['age']])


In [37]:
model_poly.fit(poly_features,y_train)
y_pred_train=model_poly.predict(poly_features)
y_pred_test=model_poly.predict(poly_features_test)

In [38]:
print(f"The root mean squared error for training dataset is {root_mean_squared_error(y_train,y_pred_train)}\nThe root mean squared error for test dataset is {root_mean_squared_error(y_test,y_pred_test)}")

The root mean squared error for training dataset is 52140.61563215015
The root mean squared error for test dataset is 46118.950039275536


In [45]:
from sklearn.preprocessing import PolynomialFeatures
model_poly=LinearRegression()

poly = PolynomialFeatures(degree=2,include_bias=False)
poly_features= poly.fit_transform(X_train[['age','height','weight','education']])
poly_features_test= poly.transform(X_test[['age','height','weight','education']])

model_poly.fit(poly_features,y_train)
y_pred_train=model_poly.predict(poly_features)
y_pred_test=model_poly.predict(poly_features_test)

print(f"The root mean squared error for training dataset is {root_mean_squared_error(y_train,y_pred_train)}\nThe root mean squared error for test dataset is {root_mean_squared_error(y_test,y_pred_test)}")

The root mean squared error for training dataset is 47939.80000978203
The root mean squared error for test dataset is 41918.73512212258


In [46]:
poly_features.shape

(1428, 14)

In [48]:
numeric_train=X_train.select_dtypes('int')

In [49]:
numeric_test=X_test.select_dtypes('int')

In [51]:
model_poly=LinearRegression()

poly = PolynomialFeatures(degree=3,include_bias=False)
poly_features= poly.fit_transform(numeric_train)
poly_features_test= poly.transform(numeric_test)

model_poly.fit(poly_features,y_train)
y_pred_train=model_poly.predict(poly_features)
y_pred_test=model_poly.predict(poly_features_test)

print(f"The root mean squared error for training dataset is {root_mean_squared_error(y_train,y_pred_train)}\nThe root mean squared error for test dataset is {root_mean_squared_error(y_test,y_pred_test)}")

The root mean squared error for training dataset is 44296.08640727942
The root mean squared error for test dataset is 43459.17289788679


Splines

In [57]:
from sklearn.preprocessing import SplineTransformer

model_spline= LinearRegression()
spline=SplineTransformer(n_knots=2,degree=3)

spline_features= spline.fit_transform(numeric_train)
spline_features_test= spline.transform(numeric_test)

model_spline.fit(spline_features,y_train)
y_pred_train=model_spline.predict(spline_features)
y_pred_test=model_spline.predict(spline_features_test)


print(f"The root mean squared error for training dataset is {root_mean_squared_error(y_train,y_pred_train)}\nThe root mean squared error for test dataset is {root_mean_squared_error(y_test,y_pred_test)}")

The root mean squared error for training dataset is 47525.082076007326
The root mean squared error for test dataset is 40933.11716574688


In [58]:
model_spline.coef_

array([ 3.51097560e+17,  3.51097560e+17,  3.51097560e+17,  3.51097560e+17,
        1.86066173e+18,  1.86066173e+18,  1.86066173e+18,  1.86066173e+18,
       -9.09594760e+16, -9.09594760e+16, -9.09594760e+16, -9.09594760e+16,
        1.60938723e+18,  1.60938723e+18,  1.60938723e+18,  1.60938723e+18,
       -5.95706512e+18, -5.95706512e+18, -5.95706512e+18, -5.95706512e+18,
        3.81116982e+18,  3.81116982e+18,  3.81116982e+18,  3.81116982e+18,
       -2.81574070e+18, -2.81574070e+18, -2.81574070e+18, -2.81574070e+18])

# GAMs

library used is pygam.