In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math
import string
import re

from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  ## decision tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb

warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('train.csv')
train_end_idx = len(df_train) 
df_test = pd.read_csv('test.csv')
df_test['RainToday'] = np.zeros((len(df_test),))

# 為了處理方便，把 'train.csv' 和 'test.csv' 合併起來，'test.csv'的 RainToday 欄位用 0 補起來。
# 以 train_end_idx 作為 'train.csv' 和 'test.csv' 分界列，
df = pd.concat([df_train, df_test], sort=False)


In [3]:
df.info()
df = df.drop(columns=['Date'],axis=1)
df.shape
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70000 entries, 0 to 34843
Data columns (total 22 columns):
Date             55943 non-null object
Location         55883 non-null object
MinTemp          56082 non-null float64
MaxTemp          56174 non-null float64
Evaporation      56019 non-null float64
Sunshine         55954 non-null float64
WindGustDir      52611 non-null object
WindGustSpeed    56166 non-null float64
WindDir9am       52126 non-null object
WindDir3pm       54552 non-null object
WindSpeed9am     55971 non-null float64
WindSpeed3pm     56072 non-null float64
Humidity9am      55944 non-null float64
Humidity3pm      55944 non-null float64
Pressure9am      56036 non-null float64
Pressure3pm      55896 non-null float64
Cloud9am         56090 non-null float64
Cloud3pm         56064 non-null float64
Temp9am          55970 non-null float64
Temp3pm          56063 non-null float64
RainToday        70000 non-null float64
RISK_MM          56129 non-null float64
dtypes: float64(

Unnamed: 0,Location,MinTemp,MaxTemp,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,,19.5,,5.69496,0.5,ENE,61.0,ENE,ENE,31.0,...,86.641375,41.810006,1020.4,1021.9,,7.0,,19.6,0.0,
1,Canberra,2.1,15.7,,5.7,E,26.0,,SE,9.0,...,53.275715,76.598048,1023.7,1020.8,4.0,1.0,9.0,14.1,0.0,0.0
2,Woomera,20.9,36.1,5.69496,,S,39.0,S,SW,31.0,...,15.471574,52.163254,1017.1,1014.9,4.167266,4.27812,24.2,33.9,0.0,0.0
3,Tuggeranong,11.5,23.9,5.69496,7.993227,NNE,35.0,W,,7.0,...,46.442398,67.664733,1010.2,,4.167266,4.27812,13.8,21.8,1.0,0.2
4,Hobart,8.2,20.5,3.2,,N,69.0,N,WNW,,...,51.830889,51.77024,,1001.2,7.0,7.0,,17.6,0.0,0.6


In [4]:
# df['Date'] = pd.to_datetime(df['Date'])
# df['Year'] = df['Date'].dt.year
# df['Month'] = df['Date'].dt.month
# df['Day'] = df['Date'].dt.day
# df.drop('Date', axis=1, inplace = True)
# df.head()

In [5]:
categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :', categorical)

# pd.get_dummies(df.Location, drop_first=True).head()
# pd.get_dummies(df.WindGustDir, drop_first=True, dummy_na=True).head()
# pd.get_dummies(df.WindDir9am, drop_first=True, dummy_na=True).head()
# pd.get_dummies(df.WindDir3pm, drop_first=True, dummy_na=True).head()

There are 4 categorical variables

The categorical variables are : ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']


In [6]:
numerical = [var for var in df.columns if df[var].dtype!='O']

print('There are {} numerical variables\n'.format(len(numerical)))

print('The numerical variables are :', numerical)

df[numerical].isnull().sum()

There are 17 numerical variables

The numerical variables are : ['MinTemp', 'MaxTemp', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RISK_MM']


MinTemp          13918
MaxTemp          13826
Evaporation      13981
Sunshine         14046
WindGustSpeed    13834
WindSpeed9am     14029
WindSpeed3pm     13928
Humidity9am      14056
Humidity3pm      14056
Pressure9am      13964
Pressure3pm      14104
Cloud9am         13910
Cloud3pm         13936
Temp9am          14030
Temp3pm          13937
RainToday            0
RISK_MM          13871
dtype: int64

In [7]:
print(round(df[numerical].describe()),2)

       MinTemp  MaxTemp  Evaporation  Sunshine  WindGustSpeed  WindSpeed9am  \
count  56082.0  56174.0      56019.0   55954.0        56166.0       55971.0   
mean      12.0     24.0          6.0       8.0           39.0          14.0   
std        6.0      7.0          3.0       3.0           13.0           9.0   
min       -8.0     -4.0          0.0       0.0            7.0           0.0   
25%        8.0     18.0          4.0       8.0           31.0           7.0   
50%       12.0     23.0          6.0       8.0           39.0          13.0   
75%       17.0     29.0          6.0       9.0           46.0          19.0   
max       34.0     47.0         86.0      14.0          135.0         130.0   

       WindSpeed3pm  Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  \
count       56072.0      55944.0      55944.0      56036.0      55896.0   
mean           18.0         52.0         60.0       1018.0       1015.0   
std             9.0         18.0         11.0          7.0     

In [89]:
for df1 in [df]:
    for col in numerical:
        if col!='RainToday':
            col_median=df_train[col].median()
            df1[col].fillna(col_median, inplace=True) 

In [59]:
for df2 in [df]:
    df2['WindGustDir'].fillna(df_train['WindGustDir'].mode()[0], inplace=True)
    df2['WindDir9am'].fillna(df_train['WindDir9am'].mode()[0], inplace=True)
    df2['WindDir3pm'].fillna(df_train['WindDir3pm'].mode()[0], inplace=True)
    df2['Location'].fillna(df_train['Location'].mode()[0], inplace=True)
df[categorical].isnull().sum()

Location       0
WindGustDir    0
WindDir9am     0
WindDir3pm     0
dtype: int64

In [8]:
IQR = df.Evaporation.quantile(0.75) - df.Evaporation.quantile(0.25)
Lower_fence = df.Evaporation.quantile(0.25) - (IQR * 3)
Upper_fence = df.Evaporation.quantile(0.75) + (IQR * 3)
print('Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Evaporation outliers are values < 0.515118606532937 or > 9.579841857956085


In [9]:
IQR = df.WindSpeed9am.quantile(0.75) - df.WindSpeed9am.quantile(0.25)
Lower_fence = df.WindSpeed9am.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindSpeed9am.quantile(0.75) + (IQR * 3)
print('WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

WindSpeed9am outliers are values < -29.0 or > 55.0


In [10]:
IQR = df.WindGustSpeed.quantile(0.75) - df.WindGustSpeed.quantile(0.25)
Lower_fence = df.WindGustSpeed.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindGustSpeed.quantile(0.75) + (IQR * 3)
print('WindGustSpeed outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

WindGustSpeed outliers are values < -14.0 or > 91.0


In [8]:
def max_value(df3, variable, top):
    return np.where(df3[variable]>top, top, df3[variable])

for df3 in [df]:
    df3['WindGustSpeed'] = max_value(df3, 'WindGustSpeed', 60.0)
    df3['Evaporation'] = max_value(df3, 'Evaporation', 5.5)
    df3['WindSpeed9am'] = max_value(df3, 'WindSpeed9am', 30.0)
    df3['RISK_MM'] = max_value(df3, 'RISK_MM', 300.0)

In [8]:
df = pd.concat([df[numerical],
                     pd.get_dummies(df.Location), 
                     pd.get_dummies(df.WindGustDir),
                     pd.get_dummies(df.WindDir9am),
                     pd.get_dummies(df.WindDir3pm)], axis=1)

In [9]:
from sklearn import preprocessing
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df.iloc[4:10]

Unnamed: 0,MinTemp,MaxTemp,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,NNW,NW,S,SE,SSE,SSW,SW,W,WNW,WSW
4,0.389549,0.47451,0.037123,,0.484375,,,0.518309,0.517702,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.425178,0.545098,0.066067,,0.1875,0.084615,0.24359,0.498169,0.584702,0.677996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.337292,0.323529,0.011601,0.510345,,0.215385,0.333333,0.708797,0.596722,0.794275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.36342,0.376471,0.027842,0.551257,0.21875,0.115385,0.166667,0.391384,0.681234,0.697674,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.586698,0.570588,0.066067,0.551257,,0.2,,0.639832,0.600083,0.352415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,,0.343137,0.054524,0.358621,0.453125,0.215385,0.423077,0.58974,0.689603,0.533095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# train

In [66]:
# 將非數值欄位拿掉
#df = df.drop(columns = [col for col in df.columns if df[col].dtype == np.object])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns = ['RainToday']).values[:train_end_idx, :],
    df['RainToday'].values[:train_end_idx], test_size=0.4)
X_ans = df.drop(columns = ['RainToday']).values[train_end_idx:, :]

In [18]:
from imblearn.over_sampling import SMOTE, ADASYN
# X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)
# X_resampled, y_resampled = ADASYN().fit_resample(X_train, y_train)
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X_train, y_train)

In [10]:
import  xgboost as xgb
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, f1_score

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.04
params['max_depth'] = 8
params['learning_rate'] = 0.01

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=100, verbose_eval=10)
y_pred_decision = bst.predict(xgb.DMatrix(X_test))

print(y_pred_decision)

i = 0
for item in y_pred_decision:
    if item > 0.25:
        y_pred_decision[i] = 1
    else :
        y_pred_decision[i] = 0
    i+=1
print('Accuracy: %f' % accuracy_score(y_test, y_pred_decision))
print('f1-score: %f' % f1_score(y_test, y_pred_decision))

[0]	train-logloss:0.686038	valid-logloss:0.686324
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[10]	train-logloss:0.62205	valid-logloss:0.625137
[20]	train-logloss:0.568627	valid-logloss:0.574458
[30]	train-logloss:0.523496	valid-logloss:0.53189
[40]	train-logloss:0.484853	valid-logloss:0.495741
[50]	train-logloss:0.451511	valid-logloss:0.464884
[60]	train-logloss:0.422667	valid-logloss:0.438346
[70]	train-logloss:0.397656	valid-logloss:0.415457
[80]	train-logloss:0.375635	valid-logloss:0.395612
[90]	train-logloss:0.356099	valid-logloss:0.378422
[100]	train-logloss:0.338667	valid-logloss:0.363421
[110]	train-logloss:0.323062	valid-logloss:0.35032
[120]	train-logloss:0.309205	valid-logloss:0.338756
[130]	train-logloss:0.296979	valid-logloss:0.328688
[140]	train-logloss:0.2857	valid-logloss:0.319607
[150]	train-logloss:0.275666	valid-logloss:0.311653
[160]	train-logloss:0.266758	val

In [19]:
y_pred_decision = bst.predict(xgb.DMatrix(X_test))

print(y_pred_decision)

i = 0
for item in y_pred_decision:
    if item > 0.206:
        y_pred_decision[i] = 1
    else :
        y_pred_decision[i] = 0
    i+=1
print('Accuracy: %f' % accuracy_score(y_test, y_pred_decision))
print('f1-score: %f' % f1_score(y_test, y_pred_decision))

[0.23873436 0.01336598 0.2163492  ... 0.01759693 0.00546538 0.00488592]
Accuracy: 0.868662
f1-score: 0.460730


In [19]:
import  xgboost as xgb
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score, f1_score

xgtrain = xgb.DMatrix(X_resampled, label=y_resampled)
clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 10,
                n_estimators=10,
                learning_rate=0.05, 
                subsample=1.0,
                colsample_bytree=0.5,
                min_child_weight = 3,
                seed=1301)
xgb_param = clf.get_xgb_params()

print ('Start cross validation')
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10, metrics=['auc'],
     early_stopping_rounds=50, stratified=True, seed=1301)
print('Best number of trees = {}'.format(cvresult.shape[0]))
clf.set_params(n_estimators=cvresult.shape[0])
print('Fit on the trainingsdata')
clf.fit(X_resampled, y_resampled, eval_metric='auc')
print('Predict the probabilities based on features in the test set')
pred = clf.predict(X_test, ntree_limit=cvresult.shape[0])
print(pred)

Start cross validation


KeyboardInterrupt: 

In [25]:
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV 

param = {'max_depth': 10,
         'learning_rate ': 0.01,
         'silent': 1,
         'objective': 'binary:logistic',
         "eval_metric":"auc",
         "scale_pos_weight":10,
         "subsample":0.8,
         "min_child_weight":1,
}

xgb_model = XGBClassifier(param)

test_params = {
 'max_depth':[4,8,12]
}

model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
model.fit(X_resampled,y_resampled)
print (model.best_params_)
#Accuracy: 0.888025
#f1-score: 0.297442

{'max_depth': 12}


In [20]:
import  xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score

param = {'max_depth': 50,
         'learning_rate ': 0.01,
         'silent': 1,
         'objective': 'binary:logistic',
         "eval_metric":"auc",
         "scale_pos_weight":10,
         "subsample":0.8,
         "min_child_weight":1,
}

dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest  = xgb.DMatrix(X_test)

cv_res= xgb.cv(param,dtrain,num_boost_round=2000,early_stopping_rounds=30,nfold=10, metrics='auc',show_stdv=True)

#cv_res.shape[0]为最佳迭代次数
bst = xgb.train(param,dtrain,num_boost_round=cv_res.shape[0])

y_pre = bst.predict(dtest)

print(y_pre)

i = 0
for item in y_pre:
    if item > 0.5:
        y_pre[i] = 1
    else :
        y_pre[i] = 0
    i+=1
    
print('Accuracy: %f' % accuracy_score(y_test, y_pre))
print('f1-score: %f' % f1_score(y_test, y_pre))

#Accuracy: 0.879397
#f1-score: 0.370297

[9.4481715e-05 1.2543435e-03 1.9602671e-04 ... 1.2280121e-04 7.4517353e-05
 9.4927140e-03]
Accuracy: 0.885750
f1-score: 0.357333


In [17]:
from kmeans_smote import KMeansSMOTE
kmeans_smote = KMeansSMOTE(
    kmeans_args={
        'n_clusters': 100
    },
    smote_args={
       'k_neighbors': 10
    }
)
sm = KMeansSMOTE(random_state=0)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

#train tree model
 
model = DecisionTreeClassifier(criterion='gini',max_depth=10000)
model.fit(X_resampled,y_resampled)

#predict
y_pred_decision = model.predict(X_test)
print('Accuracy: %f' % accuracy_score(y_test, y_pred_decision))
print('f1-score: %f' % f1_score(y_test, y_pred_decision))

Accuracy: 0.767119
f1-score: 0.235706


In [18]:
from sklearn.metrics import accuracy_score, f1_score
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.04
params['max_depth'] = 100
params['learning_rate'] = 0.01

d_train = xgb.DMatrix(X_resampled, label=y_resampled)
d_valid = xgb.DMatrix(X_test, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 100, watchlist, early_stopping_rounds=100, verbose_eval=10)
y_pred_decision = bst.predict(xgb.DMatrix(X_test))
print(y_pred_decision)
i = 0
for item in y_pred_decision:
    if item > 0.5:
        y_pred_decision[i] = 1
    else :
        y_pred_decision[i] = 0
    i+=1

print('Accuracy: %f' % accuracy_score(y_test, y_pred_decision))
print('f1-score: %f' % f1_score(y_test, y_pred_decision))
# print("Accuracy: ", str(sum(y_test == (y_pred > 0.5))/y_test.shape[0]))

[0]	train-logloss:0.685296	valid-logloss:0.687378
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 100 rounds.
[10]	train-logloss:0.615135	valid-logloss:0.636261
[20]	train-logloss:0.555433	valid-logloss:0.593773
[30]	train-logloss:0.503889	valid-logloss:0.558079
[40]	train-logloss:0.458897	valid-logloss:0.527387
[50]	train-logloss:0.419337	valid-logloss:0.500425
[60]	train-logloss:0.384381	valid-logloss:0.476997
[70]	train-logloss:0.353069	valid-logloss:0.456206
[80]	train-logloss:0.324947	valid-logloss:0.43797
[90]	train-logloss:0.299763	valid-logloss:0.42196
[99]	train-logloss:0.279283	valid-logloss:0.40918
[0.1867793  0.36559418 0.19524628 ... 0.6295627  0.21509641 0.4645976 ]
Accuracy: 0.862994
f1-score: 0.336243


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


# instantiate the model
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(X_resampled, y_resampled)

y_pred_decision = model.predict(X_test)
print('Accuracy: %f' % accuracy_score(y_test, y_pred_decision))
print('f1-score: %f' % f1_score(y_test, y_pred_decision))

Accuracy: 0.759363
f1-score: 0.391075


In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

model = RandomForestClassifier(criterion='gini',n_estimators=10,random_state=1)
model.fit(X_resampled,y_resampled)

y_pred_forest = model.predict(X_test)
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred_forest))
print('f1-score: %f' % f1_score(y_test, y_pred_forest))

Accuracy: 0.8662
f1-score: 0.271672


In [20]:
# ans_pred = model.predict(X_ans)
ans_pred = bst.predict(xgb.DMatrix(X_ans))
i = 0
for item in ans_pred:
    if item > 0.206:
        ans_pred[i] = 1
    else :
        ans_pred[i] = 0
    i+=1
df_sap = pd.DataFrame(ans_pred.astype(int), columns = ['RainToday'])
df_sap.to_csv('myAns.csv',  index_label = 'Id')