# Execute the code below

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
# URL of the weather2019.csv exercise file hosted on GitHub
link = "https://raw.githubusercontent.com/LucaSainteCroix/teaching-resources/main/exercises-data/weather2019.csv"
# Download the CSV and load it into the dataframe used by every cell below
df_weather = pd.read_csv(filepath_or_buffer=link)

# Scoring and metrics
Last time, you did a multivariate linear regression. But how can you be sure this multivariate linear regression is better than a univariate one? You have to measure it!


## First regression
Let's begin with a first linear regression: create a new column `'predict_from_sun'` with the prediction of the MAX temperature from the SUNHOUR variable.

In [2]:
# Your code here :
# Target: the value the model has to predict
y = df_weather['MAX_TEMPERATURE_C']
# Explanatory variable (predictor), kept as a one-column dataframe
X = df_weather[['SUNHOUR']]
# Create a LinearRegression and fit it on the predictor and the target so it can learn from them
model_from_sun = LinearRegression().fit(X=X, y=y)

In [3]:
# Store the predictions of the fitted model (computed from the same predictor X) in a new column
df_weather['predict_from_sun'] = model_from_sun.predict(X=X)

# Display the first rows to check that the column has been added
df_weather.head(n=5)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,predict_from_sun
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,116,143,176,0,1,5.1,very bad,1,1,11.396823
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,119,116,116,0,1,8.7,very bad,1,2,16.020019
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,116,116,0,1,8.7,very bad,1,3,16.020019
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,116,116,122,0,1,5.1,very bad,1,4,11.396823
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,143,116,116,0,1,8.7,very bad,1,5,16.020019


## R2 score
The best possible R2 score is 1, which is reached when our prediction matches reality perfectly. Let's see what our R2 score is:

In [4]:
# Change the name of the model if necessary

# The .score() method of a fitted LinearRegression model returns the R2 score
# of its predictions for X measured against the true values y.

R2_score_model_from_sun = model_from_sun.score(X, y)
# The label now spells the model name correctly (it used to read "mode_from_sun")
print(f'R2 score for model_from_sun is : {round(R2_score_model_from_sun, 4)}')

R2 score for mode_from_sun is : 0.4765


## Let's continue with 2 other regressions
- Second regression: create a new column 'predict_from_min' with the prediction of the MAX temperature from the MIN temperature variable
- Third regression: create a new column 'predict_from_both' with the prediction of the MAX temperature from both variables (MIN temperature and Sunhours)

In [5]:
# Your code here :

# second regression
# Same target as before, with the minimum temperature as the only predictor
y2 = df_weather['MAX_TEMPERATURE_C']
X2 = df_weather[['MIN_TEMPERATURE_C']]

# The model name reflects the explanatory variable it is fitted on
model_from_min = LinearRegression().fit(X=X2, y=y2)

In [6]:
# Store the predictions of the MIN-temperature model in their own column
df_weather['predict_from_min'] = model_from_min.predict(X=X2)

# Display the first rows to check the new column
df_weather.head(n=5)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,predict_from_sun,predict_from_min
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,143,176,0,1,5.1,very bad,1,1,11.396823,10.579999
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,116,116,0,1,8.7,very bad,1,2,16.020019,11.802741
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,116,0,1,8.7,very bad,1,3,16.020019,5.689031
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,116,122,0,1,5.1,very bad,1,4,11.396823,4.466289
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,116,116,0,1,8.7,very bad,1,5,16.020019,4.466289


In [7]:
# third regression :
# Same target again, this time explained by two variables at once
y3 = df_weather['MAX_TEMPERATURE_C']
X3 = df_weather[['MIN_TEMPERATURE_C', 'SUNHOUR']]

# Fit a multivariate linear regression on both predictors
model_from_both = LinearRegression().fit(X=X3, y=y3)

In [8]:
# Store the predictions of the two-variable model in their own column
df_weather['predict_from_both'] = model_from_both.predict(X=X3)

# Display the first rows to check the new column
df_weather.head(n=5)

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,...,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,MONTH,DAY,predict_from_sun,predict_from_min,predict_from_both
0,2019-01-01,9,4,10,4,7,8,0.2,94,9.0,...,176,0,1,5.1,very bad,1,1,11.396823,10.579999,8.980922
1,2019-01-02,8,5,18,7,7,5,0.0,90,9.0,...,116,0,1,8.7,very bad,1,2,16.020019,11.802741,12.353602
2,2019-01-03,6,0,18,0,4,3,0.0,88,10.0,...,116,0,1,8.7,very bad,1,3,16.020019,5.689031,7.410233
3,2019-01-04,5,-1,15,-1,4,3,0.0,91,10.0,...,122,0,1,5.1,very bad,1,4,11.396823,4.466289,4.037552
4,2019-01-05,6,-1,8,-1,4,3,0.0,91,8.0,...,116,0,1,8.7,very bad,1,5,16.020019,4.466289,6.421559


## Calculate the R2 score of the 2 new predictions
Be careful : if you still use the same "X" name, you will overwrite it.

Which model has the best score? Do you think it's logical?

In [9]:
# Your code here :

# R2 score of model_from_min, computed on the data it was fitted with
model_from_min.score(X=X2, y=y2)

0.7689396999057355

In [10]:
# R2 score of model_from_both, computed on the data it was fitted with
model_from_both.score(X=X3, y=y3)

0.8674787980774968

In [11]:
# model_from_both has a better score than model_from_min.
# This makes sense: model_from_both is fitted with one more explanatory variable (SUNHOUR), and on the data
# used for fitting, adding a variable can never lower the R2 score of a linear regression.

# Train Test Split
One of the biggest problems in machine learning is **overfitting**.



To be sure that the machine didn't just memorize the result, we use the Train Test Split methodology. We keep some data separate (often 25% of our initial dataset). Then we train our model on the remaining 75% (the "Train set").
Afterwards, we can calculate a score on the "Test set".

Let's do that !

In [12]:
# Just read and execute the code below
from sklearn.model_selection import train_test_split

# Target and single explanatory variable
y = df_weather['MAX_TEMPERATURE_C']
X = df_weather[['SUNHOUR']]

# Split the variables "X" and the target "y" into 4 datasets:
# X and y for the train set, X and y for the test set.
# The default train size (75%) is kept; the rest goes to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
print("The length of the initial dataset is :", len(X))
print("The length of the train dataset is   :", len(X_train))
print("The length of the test dataset is    :", len(X_test))

# Train the model on the train dataset only.
newmodel = LinearRegression()
newmodel.fit(X_train, y_train)

# Compare the score on the data the model has seen with the score on the data it has not seen:
print("\nScore for the Train dataset :", newmodel.score(X=X_train, y=y_train))
print("Score for the Test dataset :", newmodel.score(X=X_test, y=y_test))


The length of the initial dataset is : 365
The length of the train dataset is   : 273
The length of the test dataset is    : 92

Score for the Train dataset : 0.47243569075679914
Score for the Test dataset : 0.4749360350733982


## Both scores are very close: there is no overfitting, well done!

What happens if we don't randomize our dataset? Here, the model learns only from the first 9 months.

In [13]:
# Just read and execute the code below
from sklearn.model_selection import train_test_split

# Target and single explanatory variable
y = df_weather['MAX_TEMPERATURE_C']
X = df_weather[['MIN_TEMPERATURE_C']]

# The train set keeps its default size (75%); the rest goes to the test set.
# shuffle=False turns the random split off.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
)


# Train the model on the train dataset only.
newmodel = LinearRegression()
newmodel.fit(X_train, y_train)

# Compare the score on the data the model has seen with the score on the data it has not seen:
print("\nScore for the Train dataset :", newmodel.score(X=X_train, y=y_train))
print("Score for the Test dataset :", newmodel.score(X=X_test, y=y_test))


Score for the Train dataset : 0.7875765302008688
Score for the Test dataset : 0.03610833322378626


## There is overfitting!
Indeed, the model gets a good score on the Train dataset, because it learned from the winter / spring / summer data. But it gets a bad score on the autumn data, which it has never seen...

# Let's play !
Train a new model with all numeric variables (without your target of course) and try to have a better score than previously.

Remember to randomly split your dataset before training your model.

Display the Test score.

In [14]:
# Your code here :
from sklearn.model_selection import train_test_split

# Candidate explanatory variables (predictors): every numeric column of the dataframe
numeric_columns = df_weather.select_dtypes(include='number')

# Columns that must not be used as predictors: the target itself and every prediction
# column derived from it. Matching on the 'predict_' prefix instead of a hard-coded list
# keeps this cell working when an earlier prediction cell was skipped, and keeps the
# target from leaking in if another 'predict_*' column is added later.
# NOTE(review): 'Unnamed: 0' looks like a row counter coming from the CSV and is still
# part of X - consider dropping it as well (the scores would have to be recomputed).
leaky_columns = [
    column for column in numeric_columns.columns
    if column == 'MAX_TEMPERATURE_C' or str(column).startswith('predict_')
]
X = numeric_columns.drop(columns=leaky_columns)
y = df_weather['MAX_TEMPERATURE_C']

# split datasets randomly in 4 datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)  # train size is 75% (default value)

# fit a model of Linear Regression with the train datasets
model_train = LinearRegression().fit(X_train, y_train)

# score this model with both train datasets and test datasets :
print("\nScore for the Train dataset :", model_train.score(X_train, y_train))
print("Score for the Test dataset :", model_train.score(X_test, y_test))


Score for the Train dataset : 0.9933353831340123
Score for the Test dataset : 0.9953728575100915
