In [1]:
# Import our libraries
import pandas as pd 
import numpy as np

In [2]:
# Read the dataset from the local file "teams.csv" into a pandas DataFrame
data = pd.read_csv("teams.csv")

In [3]:
data

Unnamed: 0,team,year,athletes,events,age,height,weight,prev_medals,medals
0,AFG,1964,8,8,22.0,161.0,64.2,0.0,0
1,AFG,1968,5,5,23.2,170.2,70.0,0.0,0
2,AFG,1972,8,8,29.0,168.3,63.8,0.0,0
3,AFG,1980,11,11,23.6,168.4,63.2,0.0,0
4,AFG,2004,5,5,18.6,170.8,64.8,0.0,0
...,...,...,...,...,...,...,...,...,...
2009,ZIM,2000,26,19,25.0,179.0,71.1,0.0,0
2010,ZIM,2004,14,11,25.1,177.8,70.5,0.0,3
2011,ZIM,2008,16,15,26.1,171.9,63.7,3.0,4
2012,ZIM,2012,9,8,27.3,174.4,65.2,4.0,0


In [4]:
# Predictors and target, taken as independent copies so that later changes
# to X or y never write back into `data`
X = data.loc[:, ["athletes", "prev_medals"]].copy()
y = data.loc[:, ["medals"]].copy()

In [5]:
# Add a constant column of ones; its fitted coefficient is the intercept term
X = X.assign(intercept=1)

In [6]:
# Put the intercept column first so the coefficient order is
# intercept, athletes, prev_medals
X = X.loc[:, ["intercept", "athletes", "prev_medals"]]

In [7]:
X

Unnamed: 0,intercept,athletes,prev_medals
0,1,8,0.0
1,1,5,0.0
2,1,8,0.0
3,1,11,0.0
4,1,5,0.0
...,...,...,...
2009,1,26,0.0
2010,1,14,0.0
2011,1,16,3.0
2012,1,9,4.0


In [8]:
# Transpose of the design matrix (rows become columns)
X_T = X.transpose()

In [9]:
X_T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
intercept,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
athletes,8.0,5.0,8.0,11.0,5.0,4.0,6.0,3.0,4.0,5.0,...,52.0,20.0,47.0,28.0,21.0,26.0,14.0,16.0,9.0,31.0
prev_medals,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,15.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0


In [10]:
# Calculate the coefficients of the ordinary least squares fit.
# The closed form is B = (X^T X)^-1 X^T y, but instead of forming the inverse
# explicitly we solve the equivalent linear system (X^T X) B = X^T y, which is
# more accurate and cheaper than inverting the matrix.
# The result is wrapped in a DataFrame with one row per column of X (default
# 0..n-1 index) and the same column label as y.
B = pd.DataFrame(
    np.linalg.solve((X_T @ X).to_numpy(), (X_T @ y).to_numpy()),
    columns=y.columns,
)

In [11]:
B

Unnamed: 0,medals
0,-1.961889
1,0.071112
2,0.734137


In [12]:
# Label each coefficient row with the name of the X column it multiplies
B.index = X.columns

In [13]:
B

Unnamed: 0,medals
intercept,-1.961889
athletes,0.071112
prev_medals,0.734137


In [14]:
# Predicted values: each row of X multiplied by the coefficient vector
predictions = X.dot(B)

In [15]:
predictions

Unnamed: 0,medals
0,-1.392992
1,-1.606329
2,-1.392992
3,-1.179656
4,-1.606329
...,...
2009,-0.112974
2010,-0.966319
2011,1.378315
2012,1.614667


In [16]:
# Sum of squared residuals (SSR): squared gap between actual and predicted values
SSR = (y - predictions).pow(2).sum()

In [17]:
# Total sum of squares (SST): squared gap between actual values and their mean
SST = y.sub(y.mean()).pow(2).sum()

In [18]:
SSR

medals    290620.124598
dtype: float64

In [19]:
SST

medals    2.276322e+06
dtype: float64

In [20]:
# Coefficient of determination (R squared): one minus the ratio of the
# residual sum of squares to the total sum of squares
R2 = 1 - SSR.div(SST)

In [21]:
R2

medals    0.872329
dtype: float64

In [22]:
# Compare our implementation with the reference implementation from sklearn
from sklearn.linear_model import LinearRegression

In [23]:
# Instantiate the model
lr = LinearRegression()

In [24]:
# Fit the reference model on the same two predictors and the same target;
# no intercept column is passed here, the fitted intercept is read from
# lr.intercept_ afterwards
lr.fit(X=data.loc[:, ["athletes", "prev_medals"]], y=data.loc[:, ["medals"]])

LinearRegression()

In [25]:
# Look at the intercept and compare it with the intercept of our own implementation
lr.intercept_

array([-1.96188939])

In [26]:
# Look at the coefficients and compare them with the coefficients of our own implementation
lr.coef_

array([[0.07111214, 0.73413679]])

### In conclusion, we can affirm that our algorithm is good enough, because its intercept and coefficients are the same as those of the reference implementation from sklearn.