In [6]:
import  pandas as  pd  # data processing, CSV file I/O (e.g. pd.read_csv)# data  
import numpy as np


In [10]:
# Load the training inputs; every row that contains at least one missing
# value is discarded immediately after reading.
train_features = pd.read_csv('data/dengue_features_train.csv').dropna()
train_labels = pd.read_csv('data/dengue_labels_train.csv').dropna()

# The test features are loaded as-is; their gaps are handled in a later cell.
test_features = pd.read_csv('data/dengue_features_test.csv')

In [11]:
# Attach each label row to its feature row. No `how` is passed, so pandas
# performs its default inner join on the three key columns.
sample_data = pd.merge(train_labels, train_features, on=['year', 'city', 'weekofyear'])

# `fillna(method='ffill')` is deprecated in current pandas releases;
# `DataFrame.ffill()` is the supported, equivalent spelling.
sample_data = sample_data.ffill()
sample_data_sj = sample_data[sample_data.city == 'sj'].copy()
sample_data_iq = sample_data[sample_data.city == 'iq'].copy()

# NOTE(review): the forward fill runs on the combined frame *before* it is
# split by city, so a gap at the start of one city's rows is filled from
# whatever rows precede it in the file -- confirm this is acceptable.
test_features = test_features.ffill()
test_data_sj = test_features[test_features.city == "sj"].copy()
test_data_iq = test_features[test_features.city == "iq"].copy()

In [12]:
# Columns removed from both per-city training frames. None of them is
# referenced by the regression formula used in `get_best_model`.
unused_training_columns = [
    'reanalysis_tdtr_k',
    'year',
    'ndvi_ne',
    'reanalysis_max_air_temp_k',
    'ndvi_se',
    'station_diur_temp_rng_c',
    'weekofyear',
    'ndvi_nw',
]

# A single drop per frame replaces the previous column-by-column in-place
# drops; a missing column now raises before anything has been removed.
sample_data_sj = sample_data_sj.drop(columns=unused_training_columns)
sample_data_iq = sample_data_iq.drop(columns=unused_training_columns)

In [13]:
# Columns removed from both per-city test frames (the same set that is
# removed from the training frames).
unused_test_columns = [
    'reanalysis_tdtr_k',
    'year',
    'ndvi_ne',
    'reanalysis_max_air_temp_k',
    'ndvi_se',
    'station_diur_temp_rng_c',
    'weekofyear',
    'ndvi_nw',
]

# A single drop per frame replaces the previous column-by-column in-place
# drops; a missing column now raises before anything has been removed.
test_data_sj = test_data_sj.drop(columns=unused_test_columns)
test_data_iq = test_data_iq.drop(columns=unused_test_columns)

In [14]:
# Number of leading rows (in existing row order) used to fit candidate
# models; every remaining row is held out for scoring.
SJ_SUBTRAIN_ROWS = 800
IQ_SUBTRAIN_ROWS = 400

# Complementary positional slices keep the two parts disjoint for any frame
# length. The previous `tail(len - cut)` form received a negative count when a
# frame had fewer rows than the cut, and `tail` then returns rows that are
# also in the sub-train part.
sj_train_subtrain = sample_data_sj.iloc[:SJ_SUBTRAIN_ROWS]
sj_train_subtest = sample_data_sj.iloc[SJ_SUBTRAIN_ROWS:]

iq_train_subtrain = sample_data_iq.iloc[:IQ_SUBTRAIN_ROWS]
iq_train_subtest = sample_data_iq.iloc[IQ_SUBTRAIN_ROWS:]

In [15]:
### Model generator
from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

from sklearn.cross_validation import train_test_split
import statsmodels.api as sm


from statsmodels.tools import eval_measures
import statsmodels.formula.api as smf

def get_best_model(train, test):
    """Pick the negative-binomial ``alpha`` with the lowest hold-out MAE, then refit.

    For every candidate ``alpha`` in a fixed log-spaced grid (1e-8 .. 1e-4) a
    GLM with a NegativeBinomial family is fitted on ``train`` and scored on
    ``test`` using the mean absolute error between the integer-cast
    predictions and ``test.total_cases``. The winning ``alpha`` and its score
    are printed, and a final model is fitted on ``train`` and ``test``
    concatenated.

    Parameters
    ----------
    train : pandas.DataFrame
        Rows used to fit each candidate model. Must contain ``total_cases``
        and every predictor named in the formula below.
    test : pandas.DataFrame
        Held-out rows used for scoring; same column requirements as ``train``.

    Returns
    -------
    The fitted results object returned by ``model.fit()`` for the GLM trained
    on the concatenation of ``train`` and ``test`` with the selected ``alpha``.

    Raises
    ------
    ValueError
        If no candidate produced a score that compares below infinity (for
        example every score is NaN), so no ``alpha`` could be selected.
    """
    # Step 1: specify the form of the model
    predictors = (
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "reanalysis_min_air_temp_k",
        "station_min_temp_c",
        "station_max_temp_c",
        "station_avg_temp_c",
    )
    model_formula = "total_cases ~ 1 + " + " + ".join(predictors)

    grid = 10 ** np.arange(-8, -3, dtype=np.float64)

    # Start from "nothing selected yet". The previous sentinel score of 1000
    # meant that when every candidate scored >= 1000 no alpha was chosen and
    # the placeholder (an empty list) was handed to the final refit.
    best_alpha = None
    best_score = np.inf

    # Step 2: Find the best hyper parameter, alpha
    for alpha in grid:
        model = smf.glm(formula=model_formula,
                        data=train,
                        family=sm.families.NegativeBinomial(alpha=alpha))

        results = model.fit()
        # astype(int) discards the fractional part of each prediction.
        predictions = results.predict(test).astype(int)
        score = eval_measures.meanabs(predictions, test.total_cases)

        # Strict comparison: on ties the earliest (smallest) alpha is kept.
        if score < best_score:
            best_alpha = alpha
            best_score = score

    if best_alpha is None:
        raise ValueError("no alpha in the grid produced a comparable hold-out score")

    print('best alpha = ', best_alpha)
    print('best score = ', best_score)

    # Step 3: refit on entire dataset
    full_dataset = pd.concat([train, test])
    model = smf.glm(formula=model_formula,
                    data=full_dataset,
                    family=sm.families.NegativeBinomial(alpha=best_alpha))

    fitted_model = model.fit()
    return fitted_model
    
# Fit one model per city: the alpha search uses the sub-train / sub-test
# pair, and the returned model has been refitted on both parts together.
sj_best_model = get_best_model(train=sj_train_subtrain, test=sj_train_subtest)
iq_best_model = get_best_model(train=iq_train_subtrain, test=iq_train_subtest)

best alpha =  1e-08
best score =  20.2767584098
best alpha =  1e-08
best score =  5.29166666667


In [17]:
# Score each city's test features with that city's model; astype(int)
# discards the fractional part of every predicted value.
sj_predictions = sj_best_model.predict(exog=test_data_sj).astype(int)
iq_predictions = iq_best_model.predict(exog=test_data_iq).astype(int)

In [18]:
# Start from the submission template and discard its existing
# `total_cases` column so it can be replaced with the model output.
submission = pd.read_csv("data/submission_format.csv").drop(columns='total_cases')

# Predictions are stacked 'sj' first, then 'iq', and assigned positionally.
# NOTE(review): assumes the template lists its rows in that same order -- confirm.
submission["total_cases"] = np.concatenate([sj_predictions, iq_predictions])

submission.to_csv("output/submission.csv", index=False)