In [13]:
import pandas as pd
filtered_wind_data = pd.read_parquet('../data/windmodel_data.parquet')

all_means_df = pd.read_parquet('../data/all_means_df.pqt')

In [72]:
# check the na counts per column
na_counts = pd.DataFrame(
    [
        all_means_df[col].isna().value_counts()
        for col 
        in all_means_df.columns
    ],index=all_means_df.columns)
# check the na counts, should be zero everywhere
na_counts

Unnamed: 0,False,True
temp,1087,1
dwpt,1087,1
rhum,1087,1
prcp,1086,2
wdir,1087,1
wspd,1087,1
wpgt,375,713
pres,1085,3
tsun,319,769


In [14]:
too_high = 70
high_enough = 18
perfect_speed = 19

In [18]:
def categorize(mean, max):
  if max > too_high:
    cat = 3
  else:
    cat = int(mean>high_enough) + (mean>perfect_speed)
  return cat

In [3]:
def categorize_binary(mean):
    return (mean>high_enough)

In [15]:
filtered_wind_data.loc[:,'suitable'] = [categorize_binary(x)
                              for x
                              in filtered_wind_data['wspd_mean']
]

# df.drop(['wspd_median_mean','above_median_mean','hour_mean'],inplace=True,axis=1)
# df.set_index('id',inplace=True)
filtered_wind_data.suitable.value_counts()

False    209
True     133
Name: suitable, dtype: int64

In [18]:
filtered_wind_data.columns

Index(['id', 'hour_mean', 'temp_mean', 'dwpt_mean', 'rhum_mean', 'prcp_mean',
       'snow_mean', 'wdir_mean', 'wspd_mean', 'wpgt_mean', 'pres_mean',
       'tsun_mean', 'coco_mean', 'wspd_median_mean', 'above_median_mean',
       'wspd_max', 'suitable'],
      dtype='object')

In [19]:
filtered_wind_data.drop(['hour_mean','snow_mean','above_median_mean','wspd_median_mean','coco_mean'],axis=1,inplace=True)

In [20]:
filtered_wind_data

Unnamed: 0,id,temp_mean,dwpt_mean,rhum_mean,prcp_mean,wdir_mean,wspd_mean,wpgt_mean,pres_mean,tsun_mean,wspd_max,suitable
0,10004,10.451261,7.449866,82.171136,0.117534,206.769842,40.445665,53.387573,1011.236250,0.000000,90.8,True
1,10007,10.127578,6.887558,80.854592,0.146445,212.271240,39.625527,52.256785,1011.780119,6.000000,83.0,True
2,10015,9.815853,6.324004,79.548739,0.117322,207.985671,36.419412,49.797198,1012.139140,8.709530,81.4,True
3,10018,10.416622,7.136504,81.362685,0.144268,218.759507,30.651869,47.688045,1011.417852,,72.4,True
4,10020,9.980520,6.284137,78.926902,0.096464,222.283311,35.266428,51.367079,1011.330899,11.393763,78.8,True
...,...,...,...,...,...,...,...,...,...,...,...,...
337,EDBO0,10.700174,4.939533,71.727355,0.081431,210.433600,16.238745,,,,47.9,False
338,EDCE0,11.299701,4.989851,68.957264,0.070006,205.720700,15.745480,,1011.700000,20.216193,37.4,False
339,EDHK0,11.825259,7.167886,74.981166,,201.171599,21.042723,,1011.540667,,57.6,True
340,EDMG0,9.948297,4.835818,73.485027,0.101214,173.896123,14.728448,,,,52.6,False


In [21]:
model_ids = filtered_wind_data.id

In [23]:
all_means_df.head()

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,wdir,wspd,wpgt,pres,tsun
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10004,11.183591,8.360795,83.261821,0.080176,198.54831,30.126426,41.561255,1013.817822,0.0
10007,11.081906,7.946762,81.555184,0.105059,203.36173,29.454347,40.476545,1014.108209,3.0
10015,10.936814,7.477791,79.947582,0.089194,204.287527,26.965759,37.540909,1014.087925,12.65069
10018,10.798407,7.650422,82.24656,0.103887,204.459598,22.528988,36.471962,1013.564366,
10020,10.451094,6.789513,79.452712,0.074267,205.829421,26.275399,38.887773,1013.542898,12.665966


In [27]:
all_means_df.index = all_means_df.index.astype('str')

In [31]:
filtered_wind_data.set_index('id',inplace=True)
filtered_wind_data.index = filtered_wind_data.index.astype('str')

In [34]:
model_data_unfiltered = all_means_df.loc[model_ids]

In [36]:
model_data_unfiltered.columns

Index(['temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun'], dtype='object')

In [44]:
zscore_df = [
    (model_data_unfiltered[col] - model_data_unfiltered[col].mean()) / model_data_unfiltered[col].std(ddof=0)
    for col
    in model_data_unfiltered.columns
]

zscore_df = [
    series.to_frame()
    for series
    in zscore_df
]
# construct a df
zscore_df = pd.concat(zscore_df,axis=1)
# fill na with mean
zscore_df=zscore_df.fillna(0)
# rnadomize the order
zscore_df = zscore_df.sample(frac=1)

In [45]:
zscore_df

Unnamed: 0_level_0,temp,dwpt,rhum,prcp,wdir,wspd,wpgt,pres,tsun
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
D1639,0.259961,0.069890,-0.330297,-0.143661,-2.116394,-0.324526,0.000000,0.417643,0.662918
10859,0.025205,-0.001900,-0.019368,-0.466180,-0.547070,-0.259482,0.000000,0.511231,0.000000
10312,0.516816,0.636215,-0.005788,-0.120535,-0.753699,-0.263184,-0.176103,-0.101334,-0.740718
10438,0.061794,0.284229,0.296204,-0.151332,1.242861,-0.565185,-0.623552,0.533008,0.000000
10490,0.466929,-0.071290,-1.016155,-0.226715,0.232344,-0.371499,-0.502235,0.051325,-0.188882
...,...,...,...,...,...,...,...,...,...
10384,0.816924,-0.016594,-1.714530,-0.220842,0.355529,-0.004451,-0.058367,0.015689,-0.133209
10513,0.876567,0.497604,-0.931609,-0.077402,-0.575077,-0.152667,-0.393308,0.270957,-0.625449
10736,0.292926,0.284248,0.129175,-0.139082,-1.253787,-1.260655,-1.441968,0.834160,0.469504
10433,-0.108090,0.130469,0.410147,-0.072537,0.690872,-0.159471,-0.201299,0.108507,-0.659238


In [41]:
# set y-column (suitable)
y = (filtered_wind_data.suitable)

In [46]:
na_counts = pd.DataFrame(
    [
        zscore_df[col].isna().value_counts()
        for col 
        in zscore_df.columns
    ],index=zscore_df.columns)
# check the na counts, should be zero everywhere
na_counts

In [49]:
from sklearn.model_selection import train_test_split

In [73]:
# set the x columns
x = zscore_df.drop(['wspd','wpgt','tsun'],axis=1)

In [88]:
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=.2,
    stratify=y,
    random_state=1
)
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3, random_state=1)
from sklearn.metrics import accuracy_score, confusion_matrix
dt.fit(xtrain,ytrain)
ypred = dt.predict(xtest)
confusion_matrix(ytest,ypred)
accuracy_score(ytest,ypred)

0.5797101449275363

In [89]:
cm = pd.DataFrame(confusion_matrix(ytest,ypred))
cm.index.name = 'true'
cm.columns.name = 'pred'
cm

pred,0,1
true,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,6
1,23,4


In [90]:
pd.Series(dt.feature_importances_,index=xtrain.columns)

temp    0.382730
dwpt    0.000000
rhum    0.000000
prcp    0.121208
wdir    0.241480
pres    0.254583
dtype: float64