In [1]:
# Import our libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read our data: load teams.csv (one row per team and year, see the preview
# in the next cell) into a DataFrame.
data = pd.read_csv("teams.csv")

In [3]:
data

Unnamed: 0,team,year,athletes,events,age,height,weight,prev_medals,medals
0,AFG,1964,8,8,22.0,161.0,64.2,0.0,0
1,AFG,1968,5,5,23.2,170.2,70.0,0.0,0
2,AFG,1972,8,8,29.0,168.3,63.8,0.0,0
3,AFG,1980,11,11,23.6,168.4,63.2,0.0,0
4,AFG,2004,5,5,18.6,170.8,64.8,0.0,0
...,...,...,...,...,...,...,...,...,...
2009,ZIM,2000,26,19,25.0,179.0,71.1,0.0,0
2010,ZIM,2004,14,11,25.1,177.8,70.5,0.0,3
2011,ZIM,2008,16,15,26.1,171.9,63.7,3.0,4
2012,ZIM,2012,9,8,27.3,174.4,65.2,4.0,0


In [4]:
# Split our data: hold out 20% of the rows as the test set. random_state=1
# fixes the shuffle so the same split is produced on every run.
train, test = train_test_split(data, test_size=0.2, random_state=1)

In [5]:
# Choose our features and target: `predictors` lists the input columns fed to
# the model, `target` names the column the model is trained to predict.
predictors = ["athletes", "events"]
target = "medals"

In [6]:
# Training feature matrix; .copy() detaches it from `train` so later edits to X
# never touch the original frame.
X = train[predictors].copy()

In [7]:
# Training target vector; .copy() detaches it from `train`.
y = train[target].copy()

In [8]:
X

Unnamed: 0,athletes,events
1322,6,6
1872,119,80
953,4,4
1117,2,2
1993,43,25
...,...,...
1791,40,25
1096,36,23
1932,719,245
235,13,11


In [9]:
# Compute the per-column mean and standard deviation of the training features.
# They are kept as variables because the test data is scaled with these same
# training statistics further down.
x_mean = X.mean()
x_std = X.std()

In [10]:
x_mean

athletes    74.409063
events      35.990068
dtype: float64

In [11]:
# Features scaling: standardise each predictor to zero mean and unit standard
# deviation. The scaled frame is derived from `train[predictors]` rather than
# from `X` itself, so re-running this cell cannot re-scale values that were
# already standardised (or pick up an "intercept" column added later).
X = (train[predictors] - x_mean) / x_std

In [12]:
X.describe()

Unnamed: 0,athletes,events
count,1611.0,1611.0
mean,-1.3386770000000001e-17,1.287852e-18
std,1.0,1.0
min,-0.5768883,-0.714393
25%,-0.5297371,-0.6123079
50%,-0.4197174,-0.4489717
75%,-0.02679027,0.183956
max,6.008571,4.634867


In [13]:
# Add intercept: attach a constant column of ones and move it to the front, so
# the first coefficient of the fit plays the role of the intercept.
X = X.assign(intercept=1)[["intercept"] + predictors]

In [14]:
X

Unnamed: 0,intercept,athletes,events
1322,1,-0.537596,-0.612308
1872,1,0.350420,0.898552
953,1,-0.553313,-0.653142
1117,1,-0.569030,-0.693976
1993,1,-0.246829,-0.224384
...,...,...,...
1791,1,-0.270405,-0.224384
1096,1,-0.301839,-0.265219
1932,1,5.065546,4.267361
235,1,-0.482586,-0.510223


In [15]:
X.T

Unnamed: 0,1322,1872,953,1117,1993,385,1287,1831,0,1159,...,960,847,1669,715,905,1791,1096,1932,235,1061
intercept,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
athletes,-0.537596,0.35042,-0.553313,-0.56903,-0.246829,-0.482586,-0.537596,0.138239,-0.521879,-0.152527,...,-0.199678,-0.160386,-0.529737,-0.529737,-0.341132,-0.270405,-0.301839,5.065546,-0.482586,-0.19182
events,-0.612308,0.898552,-0.653142,-0.693976,-0.224384,-0.571474,-0.612308,0.102288,-0.571474,-0.163133,...,-0.285636,-0.101882,-0.612308,-0.591891,-0.367304,-0.224384,-0.265219,4.267361,-0.510223,0.041037


In [16]:
# Calculate penalty matrix: alpha is the ridge regularisation strength, and I
# is an identity matrix with one row/column per column of X (the intercept
# plus the predictors).
alpha = 2
I = np.identity(X.shape[1])

In [17]:
I

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [18]:
# Zero the diagonal entry that belongs to the intercept column (the first
# column of X) so that the intercept is not penalised.
I[0, 0] = 0

In [19]:
I

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [20]:
# Scale by alpha: the diagonal now holds alpha for each predictor and 0 for
# the intercept.
penalty = alpha * I

In [21]:
penalty

array([[0., 0., 0.],
       [0., 2., 0.],
       [0., 0., 2.]])

In [22]:
# Calculate the ridge regression coefficients with the closed-form solution
#     B = (X^T X + penalty)^-1 X^T y
B = np.linalg.inv(X.T @ X + penalty) @ X.T @ y

In [23]:
# Label each coefficient with the column of X it multiplies. The labels are
# built from `predictors` (same expression used to order the columns of X)
# instead of being typed out, so they stay correct if the feature list changes.
B.index = ["intercept"] + predictors

In [24]:
B

intercept    10.691496
athletes     61.857734
events      -34.632920
dtype: float64

In [25]:
# Features scaling on test data: standardise with the mean/std computed on the
# training set, then add the constant intercept column and put the columns in
# the same order as in X.
test_X = ((test[predictors] - x_mean) / x_std).assign(intercept=1)
test_X = test_X[["intercept"] + predictors]

In [26]:
test_X

Unnamed: 0,intercept,athletes,events
309,1,-0.553313,-0.653142
285,1,0.594035,1.000637
919,1,-0.144668,0.102288
120,1,0.146098,0.531045
585,1,-0.301839,-0.122299
...,...,...,...
541,1,-0.380425,-0.408138
1863,1,-0.191820,0.143122
622,1,-0.058224,0.388126
1070,1,-0.569030,-0.693976


In [27]:
# Predict: multiply the test design matrix by the coefficient vector B, giving
# one predicted value per test row.
predictions = test_X @ B

In [28]:
predictions

309     -0.914959
285     12.782156
919     -1.799893
120      1.337116
585     -3.744014
          ...    
541      1.294285
1863    -6.130765
622     -6.352080
1070    -0.472980
1196    -0.914959
Length: 403, dtype: float64

In [29]:
# Compare our implementation with the reference implementation from sklearn
from sklearn.linear_model import Ridge


In [30]:
# Instantiate the model with the same alpha as our own implementation
ridge = Ridge(alpha=alpha)

In [31]:
# Fit the model on the scaled predictors only: the "intercept" column of X is
# left out here, and the fitted intercept is read from ridge.intercept_ below.
ridge.fit(X[predictors], y)

Ridge(alpha=2)

In [32]:
# Look at the coefficients and compare them with those of our own algorithm
ridge.coef_

array([ 61.85773366, -34.63292036])

In [33]:
# Look at the intercept and compare it with the intercept of our own algorithm
ridge.intercept_

10.691495965238982

In [34]:
# Make predictions on the scaled test predictors (without the "intercept"
# column) and compare with our implementation
sklearn_prediction = ridge.predict(test_X[predictors])

In [35]:
# Element-wise difference between our predictions and sklearn's. Differences
# around 1e-13, as displayed below, are floating-point rounding noise.
predictions - sklearn_prediction

309     1.987299e-14
285    -1.314504e-13
919    -1.034728e-13
120    -1.438849e-13
585    -8.526513e-14
            ...     
541    -3.552714e-15
1863   -1.429967e-13
622    -1.811884e-13
1070    2.492451e-14
1196    1.987299e-14
Length: 403, dtype: float64

In [37]:
# Find the optimum alpha
def ridge_fit(train, predictors, target, alpha):
    """Fit a ridge regression using the closed-form solution.

    The predictors are standardised with their own mean and standard
    deviation, a constant "intercept" column is prepended, and the
    coefficients are obtained from ``(X^T X + penalty)^-1 X^T y``. The
    penalty is ``alpha`` on the diagonal for every predictor and 0 for the
    intercept, so the intercept is not penalised.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain every column in ``predictors`` and the
        ``target`` column.
    predictors : sequence of str
        Names of the feature columns. Any sequence (list, tuple, ...) works.
    target : str
        Name of the column to predict.
    alpha : float
        Regularisation strength applied to the predictor coefficients.

    Returns
    -------
    B : pandas.Series
        Coefficients indexed by ``["intercept"] + predictors``.
    x_mean : pandas.Series
        Per-predictor mean of the training data.
    x_std : pandas.Series
        Per-predictor standard deviation of the training data.
    """
    # Normalise to a list: a tuple would be read by pandas as a single column
    # key, and the list concatenation below needs a list as well.
    predictors = list(predictors)
    columns = ["intercept"] + predictors

    X = train[predictors].copy()
    y = train[target].copy()

    # Compute mean and std on the training data; they are returned so that
    # callers can scale new data with the same statistics.
    x_mean = X.mean()
    x_std = X.std()

    # Features scaling, then prepend the constant intercept column
    X = (X - x_mean) / x_std
    X["intercept"] = 1
    X = X[columns]

    # Calculate the penalty; the intercept entry is zeroed so it is not shrunk
    penalty = alpha * np.identity(X.shape[1])
    penalty[0, 0] = 0

    # Calculate the ridge regression coefficients
    B = np.linalg.inv(X.T @ X + penalty) @ X.T @ y
    # Label the coefficients from the actual design-matrix columns instead of
    # a hard-coded list, so any set of predictors is labelled correctly.
    B.index = columns
    return B, x_mean, x_std

In [38]:
# Predict
def ridge_predict(test, predictors, x_mean, x_std, B):
    """Apply fitted ridge coefficients to new rows.

    The predictor columns of ``test`` are standardised with the supplied
    ``x_mean`` / ``x_std``, a constant "intercept" column is placed first,
    and the resulting design matrix is multiplied by the coefficients ``B``.
    Returns one predicted value per row of ``test``.
    """
    column_order = ["intercept"] + predictors

    # Standardise with the statistics passed in, not ones computed from `test`.
    centred = test[predictors] - x_mean
    scaled = centred / x_std

    # Constant column of ones for the intercept, moved to the front.
    design = scaled.assign(intercept=1)[column_order]

    return design.dot(B)

In [39]:
from sklearn.metrics import mean_absolute_error

# Holds one test-set error per candidate alpha (filled in by the loop below).
errors = []
# Candidate regularisation strengths: powers of ten from 10**-2 up to 10**3.
alphas = [10**i for i in range(-2, 4)]

In [40]:
alphas

[0.01, 0.1, 1, 10, 100, 1000]

In [42]:
# Evaluate every candidate alpha: fit on the training set, predict on the test
# set and record the mean absolute error, in the same order as `alphas`.
# `errors` is reset in this cell so that re-running it does not append a second
# batch of results to a list left over from an earlier run.
# NOTE: the loop rebinds the notebook-level names alpha, B, x_mean, x_std and
# predictions; once it finishes they hold the values for the last alpha.
errors = []
for alpha in alphas:
    B, x_mean, x_std = ridge_fit(train, predictors, target, alpha)
    predictions = ridge_predict(test, predictors, x_mean, x_std, B)
    errors.append(mean_absolute_error(test[target], predictions))

In [43]:
errors

[6.309640830161112,
 6.306044331952903,
 6.272283376431607,
 6.114051204717739,
 7.156811236590453,
 6.978054589575732]

### In conclusion, we can affirm that our algorithm is good enough: its intercept and coefficients are the same as those of the reference implementation from sklearn, and the two sets of predictions differ only by floating-point rounding error.