In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
# Raw GitHub URL of the 2019 weather dataset (CSV).
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather2019.csv"

# Read the CSV straight from the URL, then preview the first five rows.
df_weather = pd.read_csv(filepath_or_buffer=link)
df_weather.head(n=5)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WINDTEMP_MAX_C,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,3,116,143,176,0,1,5.1,very bad,1,1
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,3,119,116,116,0,1,8.7,very bad,1,2
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,-4,116,116,116,0,1,8.7,very bad,1,3
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,-4,116,116,122,0,1,5.1,very bad,1,4
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,-2,143,116,116,0,1,8.7,very bad,1,5


Last time, you did a multivariate linear regression. But how can you be sure that this multivariate linear regression is better than a univariate one? You have to measure it!

First regression
Let's begin with a first linear regression: create a new column 'predict_from_sun' with the prediction of the MAX temperature from the SUNHOUR variable.

In [5]:
# Univariate model: predict MAX_TEMPERATURE_C from SUNHOUR alone.
# Double brackets keep the feature as a one-column DataFrame (2-D input).
sun_features = df_weather[['SUNHOUR']]
model_from_sun = LinearRegression().fit(sun_features, df_weather['MAX_TEMPERATURE_C'])

# Store the in-sample predictions next to the observed values.
df_weather['predict_from_sun'] = model_from_sun.predict(sun_features)

R2 score
The best possible R2 score is 1, reached when the prediction matches reality perfectly. Let's see what our R2 score is:

In [7]:
# R2 of the SUNHOUR-only model, computed on the full weather frame.
model_from_sun.score(
    X=df_weather[['SUNHOUR']],
    y=df_weather['MAX_TEMPERATURE_C'],
)

0.47654554059087306

Let's continue with two other regressions
Second regression: create a new column 'predict_from_min' with the prediction of the MAX temperature from the MIN temperature variable
Third regression: create a new column 'predict_from_both' with the prediction of the MAX temperature from both variables (MIN temperature and SUNHOUR)

In [8]:
# Compare three linear models of MAX_TEMPERATURE_C.
# Each model is fitted once and that same fitted estimator is used for both
# the prediction column and the R2 score, so the column and the score are
# guaranteed to come from the same model.

# Target shared by all three regressions.
y = df_weather['MAX_TEMPERATURE_C']

# First regression: SUNHOUR only
X = df_weather[['SUNHOUR']]
model_from_sun = LinearRegression().fit(X, y)
df_weather['predict_from_sun'] = model_from_sun.predict(X)
r2_sun = model_from_sun.score(X, y)

# Second regression: MIN_TEMPERATURE_C only
X = df_weather[['MIN_TEMPERATURE_C']]
model_from_min = LinearRegression().fit(X, y)
df_weather['predict_from_min'] = model_from_min.predict(X)
r2_min = model_from_min.score(X, y)

# Third regression: MIN_TEMPERATURE_C and SUNHOUR together
X = df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']]
model_from_both = LinearRegression().fit(X, y)
df_weather['predict_from_both'] = model_from_both.predict(X)
r2_both = model_from_both.score(X, y)

# `reg` is kept as an alias of the two-feature model so that any cell still
# referring to `reg` keeps working.
reg = model_from_both

# R2 score for each model (computed on the same rows used for fitting)
print("R2 score from SUNHOUR:", r2_sun)
print("R2 score from MIN_TEMPERATURE_C:", r2_min)
print("R2 score from both:", r2_both)

R2 score from SUNHOUR: 0.47654554059087306
R2 score from MIN_TEMPERATURE_C: 0.7689396999057355
R2 score from both: 0.8674787980774968
