In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# load the data; the first CSV column (index_col=0) becomes the row index
df=pd.read_csv('data.csv',index_col=0)

# Time Extraction
I replace the UTC timestamp with the month and hour extracted from it.

In [3]:
def time_extract(utc_time):
    '''
    Pull the month and hour out of a timestamp string.

    The month is read from characters 5-6 and the hour from characters 11-12
    (zero-based), which presumably corresponds to a 'YYYY-MM-DD HH:MM:SS'
    layout -- confirm against the data file.

    Returns a (month, hour) tuple of ints.
    '''
    month_field,hour_field=utc_time[5:7],utc_time[11:13]
    return int(month_field),int(hour_field)

In [4]:
# extract hour and month information
# Walk over the column values positionally so the result does not depend on
# the DataFrame index being exactly 0..n-1 (the index is read from the CSV).
n=len(df)
month,hour=np.zeros(n),np.zeros(n)
for i,stamp in enumerate(df['utc_time']):
    month[i],hour[i]=time_extract(stamp)
# turn both into (n, 1) column vectors so they can be hstack-ed with the other
# feature columns; reshape(-1,1) also works when the frame is empty
month=month.reshape(-1,1)
hour=hour.reshape(-1,1)

# print the results
print(month[:10].T)
print(hour[:10].T)


[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
[[16. 17. 18. 19. 20. 21. 22. 23.  0.  1.]]


# Categorize Wind Direction
I divide wind direction into five classes, using criteria similar to those in the [official description](https://biendata.com/competition/kdd_2018/data/) and the [UCI PM2.5 data](http://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data).

In [5]:
def preprocess_parsed_col(df, column='wind_direction'):
    '''
    Recode a wind-direction column into class labels.

    Every value of ``df[column]`` is passed through ``wind_categories`` and
    the column is overwritten with the results. The DataFrame that was passed
    in is modified and also returned.
    '''
    recoded = df[column].apply(wind_categories)
    df[column] = recoded
    return df

def wind_categories(x):
    '''
    Map a wind-direction reading to one of five class labels.

    The reading is first truncated to an integer with ``int(x)`` and then
    binned by angular degrees from true north:

        0..90    -> 'NE'
        91..180  -> 'SE'
        181..270 -> 'SW'
        271..360 -> 'NW'

    The special value 999017 maps to 'CV'.
    NOTE(review): 999017 looks like the dataset's "no direction" code and
    'CV' presumably means calm/variable -- confirm against the data description.

    Raises
    ------
    ValueError
        If the reading is outside 0..360 and is not 999017 (previously this
        surfaced as an UnboundLocalError), or if ``int(x)`` itself fails.
    '''
    no_direction_code = 999017
    x = int(x)
    if x == no_direction_code:
        return 'CV'
    # Angular degrees from True north
    if 0 <= x <= 90:
        return 'NE'
    if 90 < x <= 180:
        return 'SE'
    if 180 < x <= 270:
        return 'SW'
    if 270 < x <= 360:
        return 'NW'
    raise ValueError('unrecognized wind direction value: %r' % (x,))

In [6]:
# Recode wind_direction into class labels. preprocess_parsed_col overwrites the
# column on the DataFrame it is given, so re-running this cell without first
# reloading the data fails: wind_categories calls int() on the existing labels.
df=preprocess_parsed_col(df)

In [7]:
# number of rows in each wind-direction class after recoding
df['wind_direction'].value_counts()

SW    4619
SE    3553
NE    1855
NW    1607
CV       1
Name: wind_direction, dtype: int64

# One-Hot Encoding
Apply one-hot encoding to wind direction and weather.

In [8]:
from sklearn import preprocessing

In [9]:
# keep only the two categorical columns that are one-hot encoded below
df_sub=df.loc[:,['wind_direction','weather']]

In [10]:
# show how often each category occurs in the two columns to be encoded
print('Weather')
print(df_sub['weather'].value_counts())
print('')
print('Wind Direction')
print(df_sub['wind_direction'].value_counts())

Weather
Sunny/clear      8849
Haze             1304
Hail              687
Fog               617
Cloudy             82
Rain               55
Dust               14
Sleet              13
Snow                8
Light Rain          3
Sand                2
Thundershower       1
Name: weather, dtype: int64

Wind Direction
SW    4619
SE    3553
NE    1855
NW    1607
CV       1
Name: wind_direction, dtype: int64


In [11]:
def num_categories(df_input,colnm):
    # change character labels into numerical ones for further encoding
    ind_seq=list()
    labels=list(np.unique(df_input[colnm]))
    for i in range(len(df_input)):
        ind_seq.append(labels.index(df_input[colnm][i]))
    return(ind_seq)

# numeric label codes, one column per categorical feature:
# column 0 = weather, column 1 = wind direction (stored as floats)
weather_codes=num_categories(df_sub,'weather')
wind_codes=num_categories(df_sub,'wind_direction')
df_numlb=np.column_stack((weather_codes,wind_codes)).astype(float)

In [12]:
# one-hot encoding
enc = preprocessing.OneHotEncoder()
enc_array=enc.fit_transform(df_numlb)
enc_array=enc_array.toarray()

# Data Normalization

In [13]:
# numeric columns picked by position (0, 2, 3, 4, 7); the same positions are
# standardized in the next cell
df.iloc[:,[0,2,3,4,7]]

Unnamed: 0,PM2.5,humidity,pressure,temperature,wind_speed
0,82.558824,16.0,1024.60,-2.85,1.70
1,89.764706,18.5,1024.40,-3.45,1.40
2,94.029412,19.0,1024.10,-3.65,1.05
3,92.558824,21.5,1023.55,-3.80,1.10
4,92.176471,24.0,1022.75,-4.10,0.90
5,93.352941,25.0,1022.35,-4.45,0.80
6,92.147059,27.5,1021.90,-4.70,0.75
7,97.558824,29.0,1022.00,-5.50,0.70
8,98.545455,31.5,1022.15,-6.20,0.60
9,101.969697,27.5,1022.15,-3.75,0.70


In [14]:
# Standardize the numeric columns selected by position.
# NOTE(review): with default arguments preprocessing.scale should center each
# column to zero mean and scale it to unit variance -- confirm against the
# installed scikit-learn version.
sc_array=preprocessing.scale(df.iloc[:,[0,2,3,4,7]].values)
sc_array

array([[ 0.49237545, -1.31158826,  1.50796416, -1.33484601, -0.12597744],
       [ 0.63266203, -1.2111115 ,  1.48764176, -1.38667722, -0.40885155],
       [ 0.71568878, -1.19101615,  1.45715816, -1.4039543 , -0.73887134],
       ...,
       [-0.48676759,  0.73813757, -0.28548783,  0.34102985, -1.11603682],
       [-0.47130744,  0.59747011, -0.24992363,  0.47924642, -0.78601703],
       [-0.45928288,  0.05489563, -0.20927882,  0.8161493 , -0.83316271]])

# New Data For Neural Network

In [15]:
# Assemble the final matrix column-wise:
# scaled numeric columns | month | hour | one-hot weather and wind direction.
# month and hour are appended as raw values (they are not passed through scale).
final_array=np.hstack((sc_array,month,hour,enc_array))
print('The shape is:',final_array.shape)
final_array

The shape is: (11635, 24)


array([[ 0.49237545, -1.31158826,  1.50796416, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.63266203, -1.2111115 ,  1.48764176, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.71568878, -1.19101615,  1.45715816, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.48676759,  0.73813757, -0.28548783, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47130744,  0.59747011, -0.24992363, ...,  0.        ,
         1.        ,  0.        ],
       [-0.45928288,  0.05489563, -0.20927882, ...,  0.        ,
         1.        ,  0.        ]])

In [16]:
# write the assembled matrix to a plain-text file using np.savetxt defaults
np.savetxt('dat4nn.txt',final_array)