In [89]:
import pandas as pd

In [90]:
df = pd.read_csv('./data/processed/carbon-v1.csv')
df.head()

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,...,temp-24,abshum-6,abshum-3,relhum-6,relhum-3,s1-6,s2-6,s3-6,s4-6,s5-6
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,...,13.1,0.7578,0.7578,46.0,46.0,1387.2,1087.8,1056.0,1742.8,1293.4
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,...,13.2,0.7255,0.7255,45.3,45.3,1279.1,888.2,1197.5,1449.9,1010.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,...,12.6,0.7502,0.7502,56.2,56.2,1331.9,929.6,1060.2,1586.1,1117.0
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,...,11.0,0.7867,0.0289,62.4,16.4,1321.0,929.0,1102.9,1536.5,1263.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,...,11.9,0.7888,0.0633,59.0,13.7,1272.0,852.7,1180.9,1415.5,1132.2


In [91]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ss = StandardScaler()
# df = ss.fit_transform(df)

targets = df[['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']].copy(deep=True)
df = df.drop(['date_time', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'], axis=1)


# ts = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
# targets = [df.pop(x) for x in ts]



In [92]:
train, test, train_y, test_y = train_test_split(df, targets, test_size=0.2)

In [93]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn import model_selection
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor


def test_models(train, test, train_y, test_y):
    dfs = []
    models = [
          ('LinReg', LinearRegression()), 
          ('KNN', KNeighborsRegressor()),
          ('RF', RandomForestRegressor()),
          ('SVM', MultiOutputRegressor(SVR())),
          ('XGB', MultiOutputRegressor(XGBRegressor())),
          ('RDG', Ridge())
        ]
    results = []
    names = []
    scoring = ['r2', 'neg_mean_squared_error']
    for name, model in models:
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, train, train_y, cv=kfold, scoring=scoring)
        clf = model.fit(train, train_y)
        y_pred = clf.predict(test)
        print(name)
        print(r2_score(test_y, y_pred))
        results.append(cv_results)
        names.append(name)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)

    final = pd.concat(dfs, ignore_index=True)
    return final
    

In [94]:
res = test_models(train, test, train_y, test_y)

LinReg
0.8516286224321696
KNN
0.8665088496377281
RF
0.9175063477324376
SVM
0.7492462254687142
XGB
0.9195970126523104
RDG
0.851624019596572


In [109]:
XGBRegressor()

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [110]:
from sklearn.model_selection import GridSearchCV

parameters = {'estimator__objective':['reg:linear'],
              'estimator__learning_rate': [.03], 
              'estimator__max_depth': [9],
              'estimator__min_child_weight': [5],
              'estimator__silent': [1],
              'estimator__gamme': [0, 3, 5],
              'estimator__gpu_id':[-1],
              'estimator__colsample_bylevel': [None, 0.5, 0.7],
              'estimator__tree_method': ['approx'],
              'estimator__subsample': [0.5, 0.7],
              'estimator__colsample_bytree': [0.5, 0.7],
              'estimator__n_estimators': [800, 1000]
              }
# parameters = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }


grid = GridSearchCV(MultiOutputRegressor(XGBRegressor()), cv=5, param_grid=parameters, scoring='r2', verbose=10)

grid.fit(train, train_y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5; 1/72] START estimator__colsample_bylevel=None, estimator__colsample_bytree=0.5, estimator__gamme=0, estimator__gpu_id=-1, estimator__learning_rate=0.03, estimator__max_depth=9, estimator__min_child_weight=5, estimator__n_estimators=800, estimator__objective=reg:linear, estimator__silent=1, estimator__subsample=0.5, estimator__tree_method=approx
[CV 1/5; 1/72] END estimator__colsample_bylevel=None, estimator__colsample_bytree=0.5, estimator__gamme=0, estimator__gpu_id=-1, estimator__learning_rate=0.03, estimator__max_depth=9, estimator__min_child_weight=5, estimator__n_estimators=800, estimator__objective=reg:linear, estimator__silent=1, estimator__subsample=0.5, estimator__tree_method=approx;, score=0.923 total time=  10.2s
[CV 2/5; 1/72] START estimator__colsample_bylevel=None, estimator__colsample_bytree=0.5, estimator__gamme=0, estimator__gpu_id=-1, estimator__learning_rate=0.03, estimator__max_depth=9, estimator

GridSearchCV(cv=5,
             estimator=MultiOutputRegressor(estimator=XGBRegressor(base_score=None,
                                                                   booster=None,
                                                                   colsample_bylevel=None,
                                                                   colsample_bynode=None,
                                                                   colsample_bytree=None,
                                                                   gamma=None,
                                                                   gpu_id=None,
                                                                   importance_type='gain',
                                                                   interaction_constraints=None,
                                                                   learning_rate=None,
                                                                   max_delta_step=None,
                       

In [111]:
print(grid.best_score_)
print(grid.best_estimator_)

0.9213711408258011
MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, gamma=None,
                                            gamme=0, gpu_id=-1,
                                            importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.03,
                                            max_delta_step=None, max_depth=9,
                                            min_child_weight=5, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=1000, n_jobs=None,
                                            num_parallel_tree=None,
                                            objective

In [112]:
model = grid.best_estimator_

In [113]:
model.score(test, test_y)

0.9347841800235107

In [114]:
final_model = grid.best_estimator_
final_model.fit(df, targets)

MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.7, gamma=None,
                                            gamme=0, gpu_id=-1,
                                            importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.03,
                                            max_delta_step=None, max_depth=9,
                                            min_child_weight=5, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=1000, n_jobs=None,
                                            num_parallel_tree=None,
                                            objective='reg:linear',
    

In [115]:
testdat = pd.read_csv('./data/processed/carbontest-v1.csv')
testdat = testdat.drop('date_time', axis=1)
predict = model.predict(testdat)

In [116]:
print(predict)

[[  1.54142     9.366296  361.4376   ]
 [  2.0105405  14.040433  466.9923   ]
 [  1.8814142  13.308816  450.10034  ]
 ...
 [  2.4312983  13.208817  339.4273   ]
 [  2.1076846  10.546712  252.47699  ]
 [  2.2240303  12.450686  244.50528  ]]


In [117]:
preddf = pd.DataFrame(predict, columns = ['a','b','c'])

In [118]:
ttest = pd.read_csv('./data/tpsj/test.csv')

In [119]:
submission = pd.DataFrame({
        'date_time': ttest.date_time,
        'target_carbon_monoxide': preddf.a,
        'target_benzene': preddf.b,
        'target_nitrogen_oxides': preddf.c
    })
submission.to_csv('./submissions/carbon3-Ps.csv', index=False)