# Shanghai Weather Dataset

## load full dataset

In [1]:
import pandas as pd
from parallel_pandas import ParallelPandas

ParallelPandas.initialize()

In [2]:
df_raw = pd.read_csv('archive/Shanghai AQI and Wheather 2014-2021.csv')

In [3]:
df_raw

Unnamed: 0,date,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,DewPointC,FeelsLikeC,HeatIndexC,...,cloudcover,humidity,precipMM,pressure,tempC,visibility,winddirDegree,windspeedKmph,AQI,AQI_Explained
0,2014-01-01,15.0,5.0,0.0,8.7,4.0,0.0,-1.0,11.0,12.0,...,0.0,43.0,0.0,1021.0,15.0,10.0,242.0,12.0,319.0,Hazardous
1,2014-01-02,14.0,7.0,0.0,8.7,4.0,2.0,4.0,11.0,13.0,...,3.0,60.0,0.0,1019.0,14.0,10.0,141.0,14.0,352.0,Hazardous
2,2014-01-03,16.0,9.0,0.0,8.7,4.0,10.0,3.0,11.0,12.0,...,26.0,55.0,0.0,1017.0,16.0,10.0,295.0,14.0,338.0,Hazardous
3,2014-01-04,10.0,4.0,0.0,5.5,2.0,17.0,3.0,7.0,9.0,...,24.0,68.0,0.1,1022.0,10.0,10.0,169.0,14.0,355.0,Hazardous
4,2014-01-05,10.0,3.0,0.0,8.7,3.0,24.0,3.0,9.0,9.0,...,12.0,66.0,0.0,1024.0,10.0,10.0,117.0,6.0,343.0,Hazardous
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2497,2021-01-26,12.0,10.0,0.0,4.0,3.0,88.0,7.0,10.0,11.0,...,91.0,77.0,1.1,1023.0,12.0,8.0,233.0,9.0,229.0,Very unhealthy
2498,2021-01-27,11.0,7.0,0.0,8.7,2.0,96.0,4.0,7.0,9.0,...,55.0,72.0,0.0,1024.0,11.0,10.0,49.0,11.0,200.0,Unhealthy
2499,2021-01-28,10.0,4.0,0.0,8.7,3.0,100.0,-5.0,5.0,8.0,...,16.0,44.0,0.0,1028.0,10.0,10.0,291.0,19.0,309.0,Hazardous
2500,2021-01-29,9.0,6.0,0.0,8.7,3.0,90.0,-9.0,5.0,7.0,...,0.0,32.0,0.0,1030.0,9.0,10.0,188.0,10.0,320.0,Hazardous


## task definition
given weather data, predict today's temperature(avg)

## build complex features:
1. time window(temp)
+ past 1-day avg/high
+ past 3 days avg/high
+ past 7 days avg/high

In [4]:
DEFAULT_FEATURE_VALUE = -999.0

build time window features

In [5]:
df_raw['tempC_avg_today'] = df_raw.p_apply(lambda l: (l.maxtempC - l.mintempC) / 2, axis=1)

<LAMBDA> DONE:   0%|          | 0/2502 [00:00<?, ?it/s]

In [6]:
df_raw['temp_to_pred'] = DEFAULT_FEATURE_VALUE
for i in range(len(df_raw) - 1):
    df_raw.at[i, 'temp_to_pred'] = df_raw.iloc[i + 1]['tempC_avg_today']

In [7]:
def set_time_window_avg_feature(df, time_window: int, window_col_name: str, base_col_name: str):
    df[window_col_name] = DEFAULT_FEATURE_VALUE  # initialize
    for i in range(time_window, len(df)):
        df.at[i, window_col_name] = sum(df[base_col_name].iloc[i - time_window:i].tolist()) / time_window
    return df


def set_time_window_feature(df, time_window: int, window_col_name: str, base_col_name: str):
    df[window_col_name] = DEFAULT_FEATURE_VALUE  # initialize
    for i in range(time_window, len(df)):
        df.at[i, window_col_name] = df[base_col_name].iloc[i - time_window]
    return df

avg temperature

In [8]:
df_raw = set_time_window_avg_feature(df_raw, 1, 'tempC_avg_1days', 'tempC_avg_today')
df_raw = set_time_window_avg_feature(df_raw, 3, 'tempC_avg_3days', 'tempC_avg_today')
df_raw = set_time_window_avg_feature(df_raw, 7, 'tempC_avg_7days', 'tempC_avg_today')

high temperature

In [9]:
df_raw = set_time_window_feature(df_raw, 1, 'tempC_high_1days', 'maxtempC')
df_raw = set_time_window_feature(df_raw, 3, 'tempC_high_3days', 'maxtempC')
df_raw = set_time_window_feature(df_raw, 7, 'tempC_high_7days', 'maxtempC')

low temperature

In [10]:
df_raw = set_time_window_feature(df_raw, 1, 'tempC_low_1days', 'mintempC')
df_raw = set_time_window_feature(df_raw, 3, 'tempC_low_3days', 'mintempC')
df_raw = set_time_window_feature(df_raw, 7, 'tempC_low_7days', 'mintempC')

2. time feature
+ month

In [11]:
df_raw['month'] = df_raw.p_apply(lambda l: int(l.date[5:7]), axis=1)

<LAMBDA> DONE:   0%|          | 0/2502 [00:00<?, ?it/s]

In [12]:
df_raw = df_raw[6:-1]

## filter feature

In [13]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2495 entries, 6 to 2500
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               2495 non-null   object 
 1   maxtempC           2495 non-null   float64
 2   mintempC           2495 non-null   float64
 3   totalSnow_cm       2495 non-null   float64
 4   sunHour            2495 non-null   float64
 5   uvIndex            2495 non-null   float64
 6   moon_illumination  2495 non-null   float64
 7   DewPointC          2495 non-null   float64
 8   FeelsLikeC         2495 non-null   float64
 9   HeatIndexC         2495 non-null   float64
 10  WindChillC         2495 non-null   float64
 11  WindGustKmph       2495 non-null   float64
 12  cloudcover         2495 non-null   float64
 13  humidity           2495 non-null   float64
 14  precipMM           2495 non-null   float64
 15  pressure           2495 non-null   float64
 16  tempC              2495 

In [14]:
df_raw.drop(['date', 'AQI_Explained'], inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_raw.drop(['date','AQI_Explained'],inplace=True,axis=1)


In [15]:
df_raw

Unnamed: 0,maxtempC,mintempC,totalSnow_cm,sunHour,uvIndex,moon_illumination,DewPointC,FeelsLikeC,HeatIndexC,WindChillC,...,tempC_avg_1days,tempC_avg_3days,tempC_avg_7days,tempC_high_1days,tempC_high_3days,tempC_high_7days,tempC_low_1days,tempC_low_3days,tempC_low_7days,month
6,12.0,9.0,0.0,5.5,3.0,39.0,8.0,8.0,10.0,8.0,...,3.0,3.166667,-999.000000,11.0,10.0,-999.0,5.0,4.0,-999.0,1
7,9.0,4.0,0.0,4.0,2.0,46.0,5.0,4.0,8.0,4.0,...,1.5,2.666667,3.285714,12.0,10.0,15.0,9.0,3.0,5.0,1
8,5.0,2.0,0.0,8.7,2.0,53.0,-2.0,1.0,4.0,1.0,...,2.5,2.333333,2.928571,9.0,11.0,14.0,4.0,5.0,7.0,1
9,7.0,1.0,0.0,8.7,2.0,61.0,0.0,4.0,6.0,4.0,...,1.5,1.833333,2.642857,5.0,12.0,16.0,2.0,9.0,9.0,1
10,7.0,5.0,0.0,4.0,2.0,68.0,3.0,3.0,6.0,3.0,...,3.0,2.333333,2.571429,7.0,9.0,10.0,1.0,4.0,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2496,14.0,9.0,0.0,7.1,3.0,81.0,7.0,11.0,12.0,11.0,...,3.0,1.666667,2.857143,12.0,12.0,14.0,6.0,10.0,1.0,1
2497,12.0,10.0,0.0,4.0,3.0,88.0,7.0,10.0,11.0,10.0,...,2.5,2.166667,2.285714,14.0,9.0,10.0,9.0,7.0,6.0,1
2498,11.0,7.0,0.0,8.7,2.0,96.0,4.0,7.0,9.0,7.0,...,1.0,2.166667,2.142857,12.0,12.0,14.0,10.0,6.0,9.0,1
2499,10.0,4.0,0.0,8.7,3.0,100.0,-5.0,5.0,8.0,5.0,...,2.0,1.833333,2.071429,11.0,14.0,20.0,7.0,9.0,12.0,1


## split train and test

In [16]:
import numpy as np

In [17]:
cap_train = int(0.8 * len(df_raw))

In [18]:
df_train = df_raw[:cap_train]
df_test = df_raw[cap_train:]

In [19]:
train_x = np.array(df_train.drop(['temp_to_pred'], axis=1))
train_y = np.array(df_train["temp_to_pred"])

In [20]:
test_x = np.array(df_test.drop(['temp_to_pred'], axis=1))
test_y = np.array(df_test["temp_to_pred"])

# modeling

In [21]:
from lightgbm import LGBMRegressor

In [25]:
model = LGBMRegressor(num_leaves=2 ** 11 - 1)
model.fit(train_x, train_y)

In [26]:
out = model.predict(test_x)

In [28]:
test_y - out

array([-5.09787065e-01,  3.49180614e-01, -3.10752896e-01,  4.74955220e-01,
       -2.13360441e-01,  1.61801234e+00,  1.71186370e-01,  3.15597986e+00,
        1.23257317e+00, -8.72652230e-02,  1.78423143e+00,  2.23758243e-01,
        1.86685329e+00, -6.25815722e-01,  4.71052900e-02,  5.17687149e-01,
        2.39417781e+00,  7.67736812e-02, -6.33647974e-02, -3.30473863e-01,
        4.54840879e-01,  1.01197640e-01, -5.72041139e-01, -4.24273983e-02,
        1.52231764e-01, -4.97098685e-01, -4.88168097e-01, -2.72690060e-01,
        7.24508059e-01, -1.45034667e+00, -5.19590477e-01,  1.25516787e-01,
        1.72292089e+00,  1.57895099e+00,  2.79063104e-01,  4.43630709e-01,
        5.15654389e-02, -2.10612096e-04, -7.83798762e-01,  6.31329047e-01,
       -1.39172989e+00,  1.03817240e+00,  1.31374824e+00,  4.76087306e-01,
       -2.18173245e-01, -6.25412265e-03,  5.98325572e-01,  3.18691419e-01,
       -1.43602414e+00,  4.38397356e-01, -9.28264952e-01, -1.52876542e+00,
        5.99216914e-01,  

In [33]:
abs(test_y - out).mean()

1.010522832968564

save trained model

In [None]:
model.get_params()

In [31]:
import lightgbm as lgb

In [33]:
train_set = lgb.Dataset(train_x, train_y)
test_set = lgb.Dataset(test_x, test_y, reference=train_set)

In [36]:
trained = lgb.train(model.get_params(),
          train_set=train_set,
          valid_sets=test_set)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1602
[LightGBM] [Info] Number of data points in the train set: 1996, number of used features: 30
[LightGBM] [Info] Start training from score 3.288577
[1]	valid_0's l2: 1.67641
[2]	valid_0's l2: 1.6366
[3]	valid_0's l2: 1.61365
[4]	valid_0's l2: 1.59052
[5]	valid_0's l2: 1.55647
[6]	valid_0's l2: 1.53819
[7]	valid_0's l2: 1.53501
[8]	valid_0's l2: 1.54028
[9]	valid_0's l2: 1.53711
[10]	valid_0's l2: 1.5404
[11]	valid_0's l2: 1.53883
[12]	valid_0's l2: 1.54579
[13]	valid_0's l2: 1.54807
[14]	valid_0's l2: 1.55056
[15]	valid_0's l2: 1.56176
[16]	valid_0's l2: 1.57306
[17]	valid_0's l2: 1.58111
[18]	valid_0's l2: 1.59039
[19]	valid_0's l2: 1.59604
[20]	valid_0's l2: 1.59625
[21]	valid_0's l2: 1.60215
[22]	valid_0's l2: 1.60451
[23]	valid_0's l2: 1.61422
[24]	valid_0's l2: 1.6164
[25]	valid_0's l2: 1.61637
[26]	valid_0's l2: 1.61888
[27]	vali

In [37]:
out2 = trained.predict(test_x)

In [38]:
abs(test_y-out2)

array([5.09787065e-01, 3.49180614e-01, 3.10752896e-01, 4.74955220e-01,
       2.13360441e-01, 1.61801234e+00, 1.71186370e-01, 3.15597986e+00,
       1.23257317e+00, 8.72652230e-02, 1.78423143e+00, 2.23758243e-01,
       1.86685329e+00, 6.25815722e-01, 4.71052900e-02, 5.17687149e-01,
       2.39417781e+00, 7.67736812e-02, 6.33647974e-02, 3.30473863e-01,
       4.54840879e-01, 1.01197640e-01, 5.72041139e-01, 4.24273983e-02,
       1.52231764e-01, 4.97098685e-01, 4.88168097e-01, 2.72690060e-01,
       7.24508059e-01, 1.45034667e+00, 5.19590477e-01, 1.25516787e-01,
       1.72292089e+00, 1.57895099e+00, 2.79063104e-01, 4.43630709e-01,
       5.15654389e-02, 2.10612096e-04, 7.83798762e-01, 6.31329047e-01,
       1.39172989e+00, 1.03817240e+00, 1.31374824e+00, 4.76087306e-01,
       2.18173245e-01, 6.25412265e-03, 5.98325572e-01, 3.18691419e-01,
       1.43602414e+00, 4.38397356e-01, 9.28264952e-01, 1.52876542e+00,
       5.99216914e-01, 4.82681158e-01, 2.94698448e-01, 3.38152919e-01,
      

In [40]:
abs(test_y-out2).mean()

1.010522832968564

In [41]:
trained.save_model('model_v1.txt')

<lightgbm.basic.Booster at 0x7f8b68de9730>