In [1]:
# import pandas 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

In [2]:
pdf = pd.read_csv('Weather_Data.csv')
pdf.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [3]:
pdf.nunique()

Date             3271
MinTemp           204
MaxTemp           247
Rainfall          208
Evaporation        82
Sunshine          137
WindGustDir        16
WindGustSpeed      44
WindDir9am         16
WindDir3pm         16
WindSpeed9am       26
WindSpeed3pm       29
Humidity9am        80
Humidity3pm        86
Pressure9am       361
Pressure3pm       369
Cloud9am           10
Cloud3pm            9
Temp9am           229
Temp3pm           240
RainToday           2
RainTomorrow        2
dtype: int64

In [4]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3271 non-null   object 
 1   MinTemp        3271 non-null   float64
 2   MaxTemp        3271 non-null   float64
 3   Rainfall       3271 non-null   float64
 4   Evaporation    3271 non-null   float64
 5   Sunshine       3271 non-null   float64
 6   WindGustDir    3271 non-null   object 
 7   WindGustSpeed  3271 non-null   int64  
 8   WindDir9am     3271 non-null   object 
 9   WindDir3pm     3271 non-null   object 
 10  WindSpeed9am   3271 non-null   int64  
 11  WindSpeed3pm   3271 non-null   int64  
 12  Humidity9am    3271 non-null   int64  
 13  Humidity3pm    3271 non-null   int64  
 14  Pressure9am    3271 non-null   float64
 15  Pressure3pm    3271 non-null   float64
 16  Cloud9am       3271 non-null   int64  
 17  Cloud3pm       3271 non-null   int64  
 18  Temp9am 

In [5]:
# Few Data preprocessing

# remove unwanted labels...

# remove Data column
pdf.drop('Date', axis=1, inplace=True)

pdf_processed = pd.get_dummies(data=pdf, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [6]:
pdf_processed.shape

(3271, 66)

In [7]:
pdf_processed['RainToday']

0       Yes
1       Yes
2       Yes
3       Yes
4       Yes
       ... 
3266     No
3267     No
3268     No
3269     No
3270     No
Name: RainToday, Length: 3271, dtype: object

In [8]:
# replace 'Yes/True' and 'No/False' with binary indicators.

pdf_processed = pdf_processed.replace(['Yes', 'No'], [1, 0])
pdf_processed = pdf_processed.replace([True, False], [1, 0])

In [9]:
pdf_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 66 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   MinTemp          3271 non-null   float64
 1   MaxTemp          3271 non-null   float64
 2   Rainfall         3271 non-null   float64
 3   Evaporation      3271 non-null   float64
 4   Sunshine         3271 non-null   float64
 5   WindGustSpeed    3271 non-null   int64  
 6   WindSpeed9am     3271 non-null   int64  
 7   WindSpeed3pm     3271 non-null   int64  
 8   Humidity9am      3271 non-null   int64  
 9   Humidity3pm      3271 non-null   int64  
 10  Pressure9am      3271 non-null   float64
 11  Pressure3pm      3271 non-null   float64
 12  Cloud9am         3271 non-null   int64  
 13  Cloud3pm         3271 non-null   int64  
 14  Temp9am          3271 non-null   float64
 15  Temp3pm          3271 non-null   float64
 16  RainToday        3271 non-null   int64  
 17  RainTomorrow  

In [10]:
# checking out the correlation
pdf_processed.corr(numeric_only=True)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
MinTemp,1.000000,0.771005,0.013995,0.569316,-0.063475,0.127792,-0.298040,0.225905,0.053946,0.273269,...,-0.006096,-0.057882,-0.075180,0.071836,0.025171,-0.131773,-0.074196,-0.165429,-0.129172,-0.135055
MaxTemp,0.771005,1.000000,-0.139306,0.510334,0.327422,0.073184,-0.349382,0.155542,-0.185382,-0.150741,...,0.058478,-0.001910,-0.151647,0.008442,-0.050886,-0.213966,-0.087241,-0.114668,-0.017097,-0.108176
Rainfall,0.013995,-0.139306,1.000000,-0.110343,-0.308962,0.149811,0.153696,0.039415,0.332636,0.305635,...,-0.020313,-0.017927,0.096411,0.044433,0.053711,0.112996,0.036072,-0.034273,-0.049959,0.020607
Evaporation,0.569316,0.510334,-0.110343,1.000000,0.175932,0.245426,-0.077532,0.292469,-0.395651,-0.102295,...,-0.016018,-0.047131,-0.051291,0.069001,0.031471,-0.101030,-0.051831,-0.062852,-0.072257,-0.007111
Sunshine,-0.063475,0.327422,-0.308962,0.175932,1.000000,-0.029032,-0.058410,0.177813,-0.490631,-0.586476,...,-0.013700,-0.004798,-0.115154,-0.065347,-0.091772,-0.168845,-0.059655,0.033344,0.066551,0.041486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WindDir3pm_SSW,-0.131773,-0.213966,0.112996,-0.101030,-0.168845,0.096481,0.137927,0.033670,0.082297,0.131597,...,-0.023685,-0.029608,-0.062162,-0.057801,-0.066055,1.000000,-0.024613,-0.055899,-0.053182,-0.040944
WindDir3pm_SW,-0.074196,-0.087241,0.036072,-0.051831,-0.059655,0.067767,0.077584,-0.031000,0.001275,-0.019017,...,-0.012215,-0.015270,-0.032058,-0.029809,-0.034066,-0.024613,1.000000,-0.028828,-0.027427,-0.021116
WindDir3pm_W,-0.165429,-0.114668,-0.034273,-0.062852,0.033344,0.100644,0.096074,0.096827,-0.134868,-0.235915,...,-0.027741,-0.034679,-0.072808,-0.067700,-0.077368,-0.055899,-0.028828,1.000000,-0.062290,-0.047956
WindDir3pm_WNW,-0.129172,-0.017097,-0.049959,-0.072257,0.066551,0.063259,0.055324,-0.031216,-0.139134,-0.277288,...,-0.026392,-0.032993,-0.069268,-0.064409,-0.073607,-0.053182,-0.027427,-0.062290,1.000000,-0.045625


In [11]:
# final part of preprocessing
y_data = pdf_processed['RainTomorrow']
x_data = pdf_processed.drop(columns='RainTomorrow', axis=1)

In [12]:
# segmenting Dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, shuffle=True)

In [20]:
x_test.shape

(655, 65)

In [21]:
# creating LR Model
model = linear_model.LinearRegression()

In [22]:
# training model
model.fit(x_train, y_train)

In [23]:
# determine loss of the model

prediction = model.predict(x_test)

loss = metrics.mean_absolute_error(y_test, prediction)

In [24]:
loss

0.25678071466111047