In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# LINEAR REGRESSION

In simple terms, linear regression finds the best straight line that fits your data. This line is called the “regression line”. The equation of this line is usually written as *y = mx + c*, where:

- *y* is the dependent variable (in our case, the number of goals scored)
- *x* is the independent variable (in our case, the number of hours practiced)
- *m* is the slope of the line (how much *y* changes for each unit change in *x*)
- *c* is the y-intercept (the value of *y* when *x* is 0)

In [42]:
# Toy training set, one row per player: (player, weekly practice hours, season goals).
df = pd.DataFrame(
    [
        ('A', 5, 9),
        ('B', 10, 14),
        ('C', 15, 22),
        ('D', 20, 40),
        ('E', 25, 50),
    ],
    columns=[
        'Player',
        'Hours Practiced per Week (x)',
        'Goals Scored in a Season (y)',
    ],
)

In [43]:
# Display the training table (only five rows, so showing it in full is fine).
df

Unnamed: 0,Player,Hours Practiced per Week (x),Goals Scored in a Season (y)
0,A,5,9
1,B,10,14
2,C,15,22
3,D,20,40
4,E,25,50


In [44]:
# Features are selected with a list of column names so `x` stays a 2-D DataFrame;
# the target `y` is a plain 1-D Series.
x = df[['Hours Practiced per Week (x)']]
y = df['Goals Scored in a Season (y)']

# Unfitted ordinary-least-squares estimator; training happens in a later cell.
lm = LinearRegression()

In [50]:
# Fit the line y = m*x + c to the training data.
# NOTE: `fit` trains the estimator in place and returns that same estimator,
# so `a` is just another name for `lm`, not an independent copy.
a = lm.fit(x,y)

In [51]:
# Show the fitted estimator's repr.
a

In [52]:
# Fitted values: the model's predictions on the same inputs it was trained on.
Yhat=a.predict(x)

In [54]:
# Inspect the fitted values, one per training row.
Yhat

array([ 5.4, 16.2, 27. , 37.8, 48.6])

In [57]:
# Intercept c of the fitted line: the predicted goals at 0 practice hours.
a.intercept_

-5.400000000000006

In [58]:
# Slope m of the fitted line: change in predicted goals per additional weekly practice hour.
# Returned as an array with one entry per feature column.
a.coef_

array([2.16])

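Predict the number of goals for players who practice 1, 12, 50, and 75 hours per week. Only 12 hours lies inside the range seen in training (5–25 hours); the other values are extrapolations, which is why the model can return implausible results such as a negative goal count for 1 hour.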

In [25]:
# Query frame must use the same column name the model was trained on.
hours = pd.DataFrame(
    [1, 12, 50, 75],
    columns=['Hours Practiced per Week (x)'],
)

# One predicted season-goal value per row of `hours`.
predicted_goals = lm.predict(hours)
print(predicted_goals)

[ -3.24  20.52 102.6  156.6 ]


In [63]:
# Perfectly linear toy data (goals = 2 * hours), so the fitted line should
# reproduce that relationship exactly.
df3 = pd.DataFrame({
    'Player': ['A','B','C','D','E'],
    'Hours Practiced per Week (x)': [5,10,15,20,25],
    'Goals Scored in a Season (y)': [10,20,30,40,50]
})

# Fit a fresh estimator instead of refitting the shared `lm`: `fit` trains in
# place and returns the estimator itself, so reusing `lm` here would silently
# overwrite the model that was fitted on `df`.
pred3 = LinearRegression().fit(
    df3[['Hours Practiced per Week (x)']],
    df3['Goals Scored in a Season (y)'],
)

# Predict goals for new weekly practice hours
hours3 = pd.DataFrame({'Hours Practiced per Week (x)': [1,12,50,75]})
# Use this cell's own query frame (`hours3`), not `hours` left over from an earlier cell.
predicted_goals3 = pred3.predict(hours3)
print(predicted_goals3)


[  2.  24. 100. 150.]


# Multiple Linear Regression (MLR)

Multiple linear regression is an extension of linear regression into multiple dimensions. Instead of one independent variable, you have several. The equation is $y = m_1 x_1 + m_2 x_2 + \dots + m_n x_n + c$. Each $x_i$ represents a different factor, and each $m_i$ represents how much that factor contributes to the outcome.

In [38]:
# Same five players as the simple-regression example, with a second feature:
# (player, weekly practice hours, season goals, years playing).
df2 = pd.DataFrame(
    [
        ('A', 5, 9, 1),
        ('B', 10, 14, 2),
        ('C', 15, 22, 4),
        ('D', 20, 40, 5),
        ('E', 25, 50, 6),
    ],
    columns=[
        'Player',
        'Hours Practiced per Week (x)',
        'Goals Scored in a Season (y)',
        'Years Playing (x2)',
    ],
)

In [39]:
# Two-column feature matrix for the multiple regression: practice hours and years playing.
z = df2.loc[:, ['Hours Practiced per Week (x)', 'Years Playing (x2)']]

In [40]:
# Fit the two-feature model (hours practiced + years playing -> goals).
# NOTE: this refits the shared `lm` estimator in place, so any other name bound
# to the same object (e.g. `a`) now holds this two-feature model as well.
lm.fit(z, df2['Goals Scored in a Season (y)'])

In [41]:
# Score hypothetical players described by two features each:
# (weekly practice hours, years of playing experience).
# Column names and order match the feature frame used to fit the model.
hours_year = pd.DataFrame(
    [(1, 1), (12, 2), (50, 4), (75, 5), (100, 6)],
    columns=['Hours Practiced per Week (x)', 'Years Playing (x2)'],
)

# One predicted season-goal value per hypothetical player.
predicted_goals2 = lm.predict(hours_year)
print(predicted_goals2)


[ -8.09333333  24.34666667 143.2        222.86666667 302.53333333]
