In [1]:
# Reading in the initial data

import pandas as pd

weather = pd.read_csv("local_weather.csv", index_col="DATE")

In [2]:
weather.apply(pd.isnull).sum()/weather.shape[0]

STATION    0.000000
NAME       0.000000
ACMH       0.653360
ACSH       0.653360
AWND       0.522451
DAPR       0.999525
FMTM       0.870099
FRGT       0.999881
MDPR       0.999525
PGTM       0.495106
PRCP       0.016668
SNOW       0.324990
SNWD       0.317634
TAVG       0.879174
TMAX       0.000534
TMIN       0.000593
TSUN       0.931728
WDF1       0.653360
WDF2       0.522392
WDF5       0.527552
WDFG       0.746901
WSF1       0.653360
WSF2       0.522332
WSF5       0.527552
WSFG       0.746901
WT01       0.779939
WT02       0.980248
WT03       0.992941
WT04       0.999763
WT05       0.998339
WT07       0.999881
WT08       0.810368
WT09       0.999881
WT16       0.884038
WT18       0.999822
dtype: float64

In [3]:
core_weather = weather[["PRCP", "SNOW", "SNWD", "TMAX", "TMIN"]].copy()
core_weather.columns = ["precip", "snow", "snow_depth", "temp_max", "temp_min"]

In [4]:
core_weather.apply(pd.isnull).sum()

precip         281
snow          5479
snow_depth    5355
temp_max         9
temp_min        10
dtype: int64

In [5]:
core_weather["snow"].value_counts()

0.0    11379
1.0        1
Name: snow, dtype: int64

In [6]:
core_weather["snow_depth"].value_counts()

0.0    11504
Name: snow_depth, dtype: int64

In [7]:
del core_weather["snow"]

In [8]:
del core_weather["snow_depth"]

In [9]:
core_weather[pd.isnull(core_weather["precip"])]

Unnamed: 0_level_0,precip,temp_max,temp_min
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1983-10-29,,67.0,57.0
1983-10-30,,70.0,63.0
1983-10-31,,69.0,61.0
1983-11-12,,63.0,55.0
1983-11-13,,60.0,50.0
...,...,...,...
2013-12-15,,58.0,33.0
2016-05-01,,80.0,55.0
2016-05-02,,68.0,53.0
2016-05-08,,67.0,56.0


In [10]:
core_weather.loc["2013-12-15",:]

precip       NaN
temp_max    58.0
temp_min    33.0
Name: 2013-12-15, dtype: float64

In [11]:
core_weather["precip"].value_counts() / core_weather.shape[0]

0.00    0.810487
0.01    0.025980
0.02    0.011804
0.03    0.007236
0.04    0.006050
          ...   
1.19    0.000059
1.88    0.000059
2.39    0.000059
1.59    0.000059
2.37    0.000059
Name: precip, Length: 176, dtype: float64

In [12]:
core_weather["precip"] = core_weather["precip"].fillna(0)

In [13]:
core_weather.apply(pd.isnull).sum()

precip       0
temp_max     9
temp_min    10
dtype: int64

In [14]:
core_weather[pd.isnull(core_weather["temp_min"])]

Unnamed: 0_level_0,precip,temp_max,temp_min
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2004-11-20,0.0,,
2011-12-21,0.0,61.0,
2011-12-22,0.0,62.0,
2011-12-23,0.0,56.0,
2011-12-24,0.0,55.0,
2011-12-25,0.0,54.0,
2013-06-16,0.0,,
2020-08-29,0.0,,
2020-09-08,0.0,,
2020-09-09,0.0,,


In [15]:
core_weather.loc["2011-12-18":"2011-12-28"]

Unnamed: 0_level_0,precip,temp_max,temp_min
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-12-18,0.0,52.0,33.0
2011-12-19,0.0,55.0,35.0
2011-12-20,0.0,61.0,35.0
2011-12-21,0.0,61.0,
2011-12-22,0.0,62.0,
2011-12-23,0.0,56.0,
2011-12-24,0.0,55.0,
2011-12-25,0.0,54.0,
2011-12-26,0.0,50.0,32.0
2011-12-27,0.0,56.0,39.0


In [16]:
core_weather = core_weather.fillna(method="ffill")

In [17]:
core_weather.apply(pd.isnull).sum()

precip      0
temp_max    0
temp_min    0
dtype: int64

In [18]:
# Check for missing value defined in data documentation
core_weather.apply(lambda x: (x == 9999).sum())

precip      0
temp_max    0
temp_min    0
dtype: int64

In [19]:
core_weather.dtypes

precip      float64
temp_max    float64
temp_min    float64
dtype: object

In [20]:
core_weather.index

Index(['1960-01-01', '1960-01-02', '1960-01-03', '1960-01-04', '1960-01-05',
       '1960-01-06', '1960-01-07', '1960-01-08', '1960-01-09', '1960-01-10',
       ...
       '2022-01-19', '2022-01-20', '2022-01-21', '2022-01-22', '2022-01-23',
       '2022-01-24', '2022-01-25', '2022-01-26', '2022-01-27', '2022-01-28'],
      dtype='object', name='DATE', length=16859)

In [21]:
core_weather.index = pd.to_datetime(core_weather.index)

In [22]:
core_weather.index

DatetimeIndex(['1960-01-01', '1960-01-02', '1960-01-03', '1960-01-04',
               '1960-01-05', '1960-01-06', '1960-01-07', '1960-01-08',
               '1960-01-09', '1960-01-10',
               ...
               '2022-01-19', '2022-01-20', '2022-01-21', '2022-01-22',
               '2022-01-23', '2022-01-24', '2022-01-25', '2022-01-26',
               '2022-01-27', '2022-01-28'],
              dtype='datetime64[ns]', name='DATE', length=16859, freq=None)

In [23]:
core_weather.index.year

Int64Index([1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960,
            ...
            2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022],
           dtype='int64', name='DATE', length=16859)

In [24]:
core_weather[["temp_max", "temp_min"]].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1ad8af92b0>

In [25]:
core_weather.index.year.value_counts().sort_index()

1960    366
1961    365
1962    365
1963    365
1964    366
1965    365
1966    365
1967    365
1968    366
1969    365
1970    365
1971    365
1972    366
1973    365
1974    365
1975    365
1976    366
1977    365
1978    365
1979    365
1980    366
1983    184
1984    366
1985    365
1986    212
2000    365
2001    365
2002    365
2003    365
2004    366
2005    365
2006    365
2007    365
2008    366
2009    365
2010    365
2011    365
2012    365
2013    365
2014    365
2015    365
2016    366
2017    365
2018    365
2019    365
2020    366
2021    364
2022     28
Name: DATE, dtype: int64

In [26]:
core_weather["precip"].plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1ad8af92b0>

In [27]:
core_weather.groupby(core_weather.index.year).apply(lambda x: x["precip"].sum()).plot()

ValueError: view limit minimum -34851.15 is less than 1 and is an invalid Matplotlib date value. This often happens if you pass a non-datetime value to an axis that has datetime units

In [28]:
core_weather["target"] = core_weather.shift(-1)["temp_max"]

In [29]:
core_weather

Unnamed: 0_level_0,precip,temp_max,temp_min,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960-01-01,0.0,49.0,30.0,49.0
1960-01-02,0.0,49.0,29.0,54.0
1960-01-03,0.0,54.0,35.0,54.0
1960-01-04,0.0,54.0,36.0,55.0
1960-01-05,0.0,55.0,33.0,53.0
...,...,...,...,...
2022-01-24,0.0,60.0,39.0,57.0
2022-01-25,0.0,57.0,43.0,57.0
2022-01-26,0.0,57.0,41.0,67.0
2022-01-27,0.0,67.0,39.0,64.0


In [30]:
core_weather = core_weather.iloc[:-1,:].copy()

In [31]:
core_weather

Unnamed: 0_level_0,precip,temp_max,temp_min,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960-01-01,0.0,49.0,30.0,49.0
1960-01-02,0.0,49.0,29.0,54.0
1960-01-03,0.0,54.0,35.0,54.0
1960-01-04,0.0,54.0,36.0,55.0
1960-01-05,0.0,55.0,33.0,53.0
...,...,...,...,...
2022-01-23,0.0,60.0,41.0,60.0
2022-01-24,0.0,60.0,39.0,57.0
2022-01-25,0.0,57.0,43.0,57.0
2022-01-26,0.0,57.0,41.0,67.0


In [32]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [33]:
predictors = ["precip", "temp_max", "temp_min"]

In [34]:
train = core_weather.loc[:"2020-12-31"]
test = core_weather.loc["2021-01-01":]

In [35]:
train

Unnamed: 0_level_0,precip,temp_max,temp_min,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1960-01-01,0.00,49.0,30.0,49.0
1960-01-02,0.00,49.0,29.0,54.0
1960-01-03,0.00,54.0,35.0,54.0
1960-01-04,0.00,54.0,36.0,55.0
1960-01-05,0.00,55.0,33.0,53.0
...,...,...,...,...
2020-12-27,0.00,63.0,44.0,61.0
2020-12-28,0.10,61.0,42.0,60.0
2020-12-29,0.00,60.0,39.0,56.0
2020-12-30,0.07,56.0,36.0,62.0


In [36]:
test

Unnamed: 0_level_0,precip,temp_max,temp_min,target
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,0.00,60.0,40.0,57.0
2021-01-02,0.14,57.0,51.0,56.0
2021-01-03,0.00,56.0,49.0,62.0
2021-01-04,0.36,62.0,46.0,59.0
2021-01-05,0.00,59.0,42.0,59.0
...,...,...,...,...
2022-01-23,0.00,60.0,41.0,60.0
2022-01-24,0.00,60.0,39.0,57.0
2022-01-25,0.00,57.0,43.0,57.0
2022-01-26,0.00,57.0,41.0,67.0


In [37]:
reg.fit(train[predictors], train["target"])

Ridge(alpha=0.1)

In [38]:
predictions = reg.predict(test[predictors])

In [39]:
from sklearn.metrics import mean_squared_error

mean_squared_error(test["target"], predictions)

20.56066854811876

In [40]:
combined = pd.concat([test["target"], pd.Series(predictions, index=test.index)], axis=1)
combined.columns = ["actual", "predictions"]

In [41]:
combined

Unnamed: 0_level_0,actual,predictions
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,57.0,59.806024
2021-01-02,56.0,59.310181
2021-01-03,62.0,58.538685
2021-01-04,59.0,61.531814
2021-01-05,59.0,59.444266
...,...,...
2022-01-23,60.0,59.985714
2022-01-24,57.0,59.626333
2022-01-25,57.0,58.181680
2022-01-26,67.0,57.822299


In [42]:
combined.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1abc62e2e0>

In [43]:
reg.coef_

array([-2.20730384,  0.72113834,  0.17969047])

In [44]:
core_weather["month_max"] = core_weather["temp_max"].rolling(30).mean()

core_weather["month_day_max"] = core_weather["month_max"] / core_weather["temp_max"]

core_weather["max_min"] = core_weather["temp_max"] / core_weather["temp_min"]

In [46]:
core_weather = core_weather.iloc[30:,:].copy()

In [45]:
def create_predictions(predictors, core_weather, reg):
    train = core_weather.loc[:"2020-12-31"]
    test = core_weather.loc["2021-01-01":]

    reg.fit(train[predictors], train["target"])
    predictions = reg.predict(test[predictors])

    error = mean_squared_error(test["target"], predictions)
    
    combined = pd.concat([test["target"], pd.Series(predictions, index=test.index)], axis=1)
    combined.columns = ["actual", "predictions"]
    return error, combined

In [47]:
predictors = ["precip", "temp_max", "temp_min", "month_day_max", "max_min"]

error, combined = create_predictions(predictors, core_weather, reg)
error

20.170663808991097

In [48]:
combined.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f1abc601460>

In [51]:
core_weather["monthly_avg"] = core_weather["temp_max"].groupby(core_weather.index.month).apply(lambda x: x.expanding(1).mean())

In [None]:
core_weather["day_of_year_avg"] = core_weather["temp_max"].groupby(core_weather.index.day_of_year).apply(lambda x: x.expanding(1).mean())