In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("teams.csv")

In [3]:
df.head()

Unnamed: 0,team,year,athletes,events,age,height,weight,prev_medals,medals
0,AFG,1964,8,8,22.0,161.0,64.2,0.0,0
1,AFG,1968,5,5,23.2,170.2,70.0,0.0,0
2,AFG,1972,8,8,29.0,168.3,63.8,0.0,0
3,AFG,1980,11,11,23.6,168.4,63.2,0.0,0
4,AFG,2004,5,5,18.6,170.8,64.8,0.0,0


In [4]:
df.shape

(2014, 9)

In [5]:
df.isna().sum()

team           0
year           0
athletes       0
events         0
age            0
height         0
weight         0
prev_medals    0
medals         0
dtype: int64

In [6]:
X = df[["athletes", "prev_medals"]]
y = df["medals"]

In [7]:
X.head()

Unnamed: 0,athletes,prev_medals
0,8,0.0
1,5,0.0
2,8,0.0
3,11,0.0
4,5,0.0


In [8]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: medals, dtype: int64

In [9]:
print(X.shape)
print(y.shape)

(2014, 2)
(2014,)


In [10]:
X['intercept']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['intercept']=1


In [11]:
X.head()

Unnamed: 0,athletes,prev_medals,intercept
0,8,0.0,1
1,5,0.0,1
2,8,0.0,1
3,11,0.0,1
4,5,0.0,1


In [12]:
X = X[['intercept', "athletes", "prev_medals"]]

In [13]:
X.head()

Unnamed: 0,intercept,athletes,prev_medals
0,1,8,0.0
1,1,5,0.0
2,1,8,0.0
3,1,11,0.0
4,1,5,0.0


predictions

![](predictions.jpeg)

actual value

![](actual.jpeg)

matrix form

![](matrix_form.jpeg)
1 column allows us to add the y intercept :)

matrix multiplication

![](matrix_multiplication.jpeg)

transpose of matrix times the matrix results in a square matrix

![](square_matrix.jpeg)
use? square matrix can be inverted

![](inverse.jpeg)

a matrix multiplied with its inverse gives identity matrix!!

![](identity.jpeg)

any matrix mulitplied with identity matrix results in the original matrix itself

assume error to be 0

![](math.jpeg)

In [14]:
X_T = X.T

In [15]:
X_T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
intercept,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
athletes,8.0,5.0,8.0,11.0,5.0,4.0,6.0,3.0,4.0,5.0,...,52.0,20.0,47.0,28.0,21.0,26.0,14.0,16.0,9.0,31.0
prev_medals,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,15.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0


In [16]:
B = np.linalg.inv(X_T @ X) @ X_T @ y

In [17]:
B

0   -1.961889
1    0.071112
2    0.734137
dtype: float64

In [18]:
B.shape

(3,)

In [19]:
B.index = X.columns

In [20]:
B

intercept     -1.961889
athletes       0.071112
prev_medals    0.734137
dtype: float64

In [21]:
preds = X @ B

In [22]:
preds

0      -1.392992
1      -1.606329
2      -1.392992
3      -1.179656
4      -1.606329
          ...   
2009   -0.112974
2010   -0.966319
2011    1.378315
2012    1.614667
2013    0.242587
Length: 2014, dtype: float64

In [23]:
sum_of_squared_error = ((y-preds)**2).sum()

In [24]:
sum_of_squared_error

290620.1245981143

In [25]:
sum_of_squared_total = ((y-y.mean())**2).sum()

In [26]:
sum_of_squared_total

2276321.801390268

In [27]:
r_square = 1 - (sum_of_squared_error/sum_of_squared_total)

In [28]:
r_square

0.8723290685786969

In [29]:
from sklearn.linear_model import LinearRegression

In [30]:
lr = LinearRegression()

In [31]:
lr.fit(df[["athletes", "prev_medals"]], df[["medals"]])

LinearRegression()

In [32]:
lr.intercept_

array([-1.96188939])

In [33]:
lr.coef_

array([[0.07111214, 0.73413679]])