In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
# Remote location of the weather2019.csv dataset used by this notebook.
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather2019.csv"

# Download and parse the CSV, then preview its first five rows.
df_weather = pd.read_csv(filepath_or_buffer=link)
df_weather.head(n=5)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WINDTEMP_MAX_C,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,3,116,143,176,0,1,5.1,very bad,1,1
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,3,119,116,116,0,1,8.7,very bad,1,2
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,-4,116,116,116,0,1,8.7,very bad,1,3
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,-4,116,116,122,0,1,5.1,very bad,1,4
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,-2,143,116,116,0,1,8.7,very bad,1,5


Last time, you did a multivariate linear regression. But how can you be sure this multivariate linear regression is better than a univariate one? You have to measure it!

First regression
Let's begin with a first linear regression: create a new column 'predict_from_sun' with the prediction of MAX temperature from the SUNHOUR variable.

In [15]:
# Fit a one-feature linear model: SUNHOUR -> MAX_TEMPERATURE_C.
# fit() hands back the estimator itself, so construction and fitting are chained.
model_from_sun = LinearRegression().fit(
    df_weather[['SUNHOUR']],          # explanatory variable (double brackets keep it 2-D)
    df_weather['MAX_TEMPERATURE_C'],  # target
)

# Store the model's predictions for every row in a new column.
df_weather['predict_from_sun'] = model_from_sun.predict(df_weather[['SUNHOUR']])

R2 score
The best possible R2 score is 1, reached when our predictions match reality perfectly. Let's see what our R2 score is:

In [5]:
# R2 of the SUNHOUR model, computed on the same rows it was fitted on.
model_from_sun.score(X=df_weather[['SUNHOUR']], y=df_weather['MAX_TEMPERATURE_C'])

0.47654554059087306

Let's continue with 2 other regressions
Second regression: create a new column 'predict_from_min' with the prediction of MAX temperature from the MIN temperature variable
Third regression: create a new column 'predict_from_both' with the prediction of MAX temperature from both variables (MIN temperature and Sunhours)

In [6]:
# First regression
reg = LinearRegression()
reg.fit(df_weather[['SUNHOUR']], df_weather['MAX_TEMPERATURE_C'])
df_weather['predict_from_sun'] = reg.predict(df_weather[['SUNHOUR']])

# Second regression
reg = LinearRegression()
reg.fit(df_weather[['MIN_TEMPERATURE_C']], df_weather['MAX_TEMPERATURE_C'])
df_weather['predict_from_min'] = reg.predict(df_weather[['MIN_TEMPERATURE_C']])

# Third regression
reg = LinearRegression()
reg.fit(df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']], df_weather['MAX_TEMPERATURE_C'])
df_weather['predict_from_both'] = reg.predict(df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']])

# R2 score for each model
X = df_weather[['SUNHOUR']]
y = df_weather['MAX_TEMPERATURE_C']
model_from_sun = LinearRegression().fit(X, y)
r2_sun = model_from_sun.score(X, y)

X = df_weather[['MIN_TEMPERATURE_C']]
y = df_weather['MAX_TEMPERATURE_C']
model_from_min = LinearRegression().fit(X, y)
r2_min = model_from_min.score(X, y)

X = df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']]
y = df_weather['MAX_TEMPERATURE_C']
model_from_both = LinearRegression().fit(X, y)
r2_both = model_from_both.score(X, y)

print("R2 score from SUNHOUR:", r2_sun)
print("R2 score from MIN_TEMPERATURE_C:", r2_min)
print("R2 score from both:", r2_both)

R2 score from SUNHOUR: 0.47654554059087306
R2 score from MIN_TEMPERATURE_C: 0.7689396999057355
R2 score from both: 0.8674787980774968


# model_from_both has a better score than model_from_min,
# this makes sense as model_from_both is fitted with more explanatory variables than model_from_min.

Calculate the R2 score of the 2 new predictions
Be careful : if you still use the same "X" name, you will overwrite it.

Which model has the best score ? Do you think it's logical ?

In [7]:
# Second regression
X = df_weather[['MIN_TEMPERATURE_C']]
y = df_weather['MAX_TEMPERATURE_C']
model_from_min = LinearRegression().fit(X, y)
r2_min = model_from_min.score(X, y)

# Third regression
X = df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']]
y = df_weather['MAX_TEMPERATURE_C']
model_from_both = LinearRegression().fit(X, y)
r2_both = model_from_both.score(X, y)

print("R2 score from MIN_TEMPERATURE_C:", r2_min)
print("R2 score from both:", r2_both)

R2 score from MIN_TEMPERATURE_C: 0.7689396999057355
R2 score from both: 0.8674787980774968


Train Test Split
One of the biggest problems in machine learning is overfitting.

To be sure that the machine didn't simply memorize the results, we use the Train Test Split methodology. We keep some data aside (often 25% of our initial dataset). Then we train our model on the remaining 75% (the "Train set"). Afterwards, we can calculate a score on the "Test set".

In [8]:
from sklearn.model_selection import train_test_split

# Target (1-D) and explanatory variable (2-D frame).
y = df_weather['MAX_TEMPERATURE_C']
X = df_weather[['SUNHOUR']]

# Split X and y into four pieces: X/y for the train set and X/y for the test set.
# train_size=0.75 keeps 75% of the rows for training and the rest for testing;
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is   :", len(X_train))
print("The length of the test dataset is    :", len(X_test))

# The model only ever sees the train rows while fitting.
newmodel = LinearRegression()
newmodel.fit(X_train, y_train)

# Compare the score on the rows used for fitting with the score on held-out rows.
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))

The length of the initial dataset is : 365
The length of the train dataset is   : 273
The length of the test dataset is    : 92

Score for the Train dataset : 0.47243569075679914
Score for the Test dataset : 0.4749360350733982


Both scores are very close: there is no overfitting, well done!
What happens if we don't randomize our dataset? Here, the model learns only from the first 9 months.

In [9]:
# Just read and execute the code below
from sklearn.model_selection import train_test_split

# Target (1-D) and explanatory variable (2-D frame).
y = df_weather['MAX_TEMPERATURE_C']
X = df_weather[['MIN_TEMPERATURE_C']]

# train_size=0.75 keeps 75% of the rows for training and the rest for testing.
# shuffle=False turns the random split off, so rows keep their original order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    train_size=0.75,
)


# The model only ever sees the train rows while fitting.
newmodel = LinearRegression()
newmodel.fit(X_train, y_train)

# Compare the score on the rows used for fitting with the score on held-out rows.
print("\nScore for the Train dataset :", newmodel.score(X_train, y_train))
print("Score for the Test dataset :", newmodel.score(X_test, y_test))


Score for the Train dataset : 0.7875765302008688
Score for the Test dataset : 0.03610833322378593


There is overfitting!
Indeed, the model gets a good score on the Train dataset, because it learned from winter / spring / summer data. But it gets a bad score on the fall data, which it never saw during training...

Let's play !
Train a new model with all numeric variables (without your target of course) and try to have a better score than previously.

Remember to randomly split your dataset before training your model.

Display the Test score.

In [11]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Read the data again: a fresh frame does not carry the predict_* columns
# that earlier cells added to df_weather.
link = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather2019.csv"
df_weather = pd.read_csv(link)

# Keep every int64/float64 column except the target itself.
# NOTE(review): this also keeps 'Unnamed: 0', which looks like a saved row
# index - confirm it is wanted as a feature.
numeric_variables = (
    df_weather
    .select_dtypes(include=["int64", "float64"])
    .drop(columns="MAX_TEMPERATURE_C")
)

# Random 75% / 25% split, with a fixed random_state for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    numeric_variables,
    df_weather["MAX_TEMPERATURE_C"],
    train_size=0.75,
    random_state=42,
)

# Fit on the train rows only.
model = LinearRegression()
model.fit(X_train, y_train)

# R2 on the held-out rows.
test_score = model.score(X_test, y_test)

print("Test score:", test_score)

Test score: 0.9953728575100915
