# Prediction of weather

In [49]:
import pandas as pd
# Load weather.csv into a DataFrame, using its DATE column as the row index.
weather= pd.read_csv("weather.csv",index_col="DATE")

In [31]:
# Fraction of missing values in each column (computed per column, not per row).
null_pct = weather.isna().sum() / len(weather)

# Keep only the columns in which fewer than 5% of the values are missing.
valid_columns = weather.columns[null_pct < 0.05]

In [32]:
null_pct

station    0.000000
name       0.000000
awnd       0.265101
fmtm       0.265101
pgtm       0.107274
prcp       0.000000
snow       0.000000
snwd       0.000000
tmax       0.000000
tmin       0.000000
wdf2       0.498626
wsf2       0.498626
target     0.000000
dtype: float64

In [33]:
valid_columns

Index(['station', 'name', 'prcp', 'snow', 'snwd', 'tmax', 'tmin', 'target'], dtype='object')

In [34]:
# Restrict the frame to the mostly-complete columns and lower-case their names
# in one step; the result is a new frame, independent of the one it came from.
weather = weather[valid_columns].rename(columns=str.lower)

In [35]:
# Convert the DATE index into datetimes so that date attributes such as
# .year become available on it.
weather.index=pd.to_datetime(weather.index)
# Display the year of every row as a quick check of the conversion.
weather.index.year

Int64Index([1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970, 1970,
            ...
            2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022],
           dtype='int64', name='DATE', length=19287)

In [36]:
# Label for each day: the maximum temperature recorded on the following row.
weather["target"] = weather["tmax"].shift(-1)

# Forward-fill any remaining gaps from the previous row.
# NOTE(review): shift(-1) leaves the final row without a target, so this fill
# gives the last row the previous row's target instead of a real next-day
# observation. Consider dropping that row rather than filling it.
weather = weather.ffill()

In [37]:
weather

Unnamed: 0_level_0,station,name,prcp,snow,snwd,tmax,tmin,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1970-01-01,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,28,22,31.0
1970-01-02,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,31,22,38.0
1970-01-03,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.02,0.0,0.0,38,25,31.0
1970-01-04,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,31,23,35.0
1970-01-05,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,35,21,36.0
...,...,...,...,...,...,...,...,...
2022-10-17,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.08,0.0,0.0,67,54,58.0
2022-10-18,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,58,48,56.0
2022-10-19,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,56,43,61.0
2022-10-20,USW00094789,"JFK INTERNATIONAL AIRPORT, NY US",0.00,0.0,0.0,61,44,64.0


In [38]:
from sklearn.linear_model import Ridge
# Ridge regression: alpha sets the strength of the L2 penalty that shrinks the
# coefficients, which helps when the predictors are collinear.
rr = Ridge(alpha=0.1)

# Every column except the label and the station/name text columns is a model input.
non_feature_columns = ["target", "station", "name"]
predictors = weather.columns[~weather.columns.isin(non_feature_columns)]

In [39]:
predictors #columns

Index(['prcp', 'snow', 'snwd', 'tmax', 'tmin'], dtype='object')

In [45]:
def backtest(weather, model, predictors, start=3650, step=90):
    """Evaluate ``model`` with a walk-forward backtest over ``weather``.

    For every cut point ``start, start + step, ...`` below the number of rows,
    the model is fitted on all rows before the cut and then used to predict
    the next ``step`` rows (fewer for the final window).

    Parameters
    ----------
    weather : pandas.DataFrame
        Frame containing the ``predictors`` columns and a ``"target"`` column.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every window, so it is left fitted on the last training slice.
    predictors : list-like
        Column labels used as model inputs.
    start : int, default 3650
        Positional row at which the first prediction window begins.
    step : int, default 90
        Number of rows predicted per window.

    Returns
    -------
    pandas.DataFrame
        One row per predicted row of ``weather`` (same index labels), with
        columns ``actual``, ``prediction`` and ``diff`` (absolute error).
    """

    def _score_window(cut):
        # Rows before the cut are history; the next `step` rows are held out.
        history = weather.iloc[:cut]
        held_out = weather.iloc[cut:cut + step]

        model.fit(history[predictors], history["target"])
        forecast = pd.Series(model.predict(held_out[predictors]), index=held_out.index)

        # Put the observed and predicted values side by side, then add the error.
        scored = pd.concat(
            [held_out["target"], forecast],
            axis=1,
            keys=["actual", "prediction"],
        )
        scored["diff"] = (scored["actual"] - scored["prediction"]).abs()
        return scored

    cut_points = range(start, weather.shape[0], step)
    return pd.concat([_score_window(cut) for cut in cut_points])


In [46]:
# Walk-forward evaluation: for each cut point i, rows before i form the training
# set and rows i .. i+step-1 (90 rows by default) form the test set.
predictions = backtest(weather=weather, model=rr, predictors=predictors)

In [47]:
predictions

Unnamed: 0_level_0,actual,prediction,diff
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1979-12-30,43.0,50.229324,7.229324
1979-12-31,42.0,43.673798,1.673798
1980-01-01,41.0,41.579150,0.579150
1980-01-02,36.0,43.961887,7.961887
1980-01-03,30.0,40.204726,10.204726
...,...,...,...
2022-10-17,58.0,67.543412,9.543412
2022-10-18,56.0,60.511771,4.511771
2022-10-19,61.0,57.030435,3.969565
2022-10-20,64.0,59.784804,4.215196


In [48]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the observed and predicted targets across all
# backtest windows.
mean_absolute_error(
    y_true=predictions["actual"],
    y_pred=predictions["prediction"],
)

5.13932667966084