In [10]:
import pandas as pd
import scipy as sc
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

feature_columns = ['city', 'weekofyear',
 'precipitation_amt_mm',
 'reanalysis_air_temp_k',
 'reanalysis_avg_temp_k',
 'reanalysis_dew_point_temp_k',
 'reanalysis_precip_amt_kg_per_m2',
 'reanalysis_relative_humidity_percent',
 'reanalysis_sat_precip_amt_mm',
 'reanalysis_specific_humidity_g_per_kg',
 'station_avg_temp_c']

#ndvi = density of green
#precipitation = rain amount

df = pd.read_csv('dengue_features_train.csv', usecols=feature_columns)
df['precipitation_amt_mm'] = df['precipitation_amt_mm'].rolling(window=5, min_periods=(1)).mean()
df['reanalysis_air_temp_k'] = df['reanalysis_air_temp_k'].rolling(window=5,  min_periods=(1)).median()
df['reanalysis_avg_temp_k'] = df['reanalysis_avg_temp_k'].rolling(window=5,  min_periods=(1)).mean()
df['reanalysis_dew_point_temp_k'] = df['reanalysis_dew_point_temp_k'].rolling(window=5,  min_periods=(1)).median()

df['reanalysis_precip_amt_kg_per_m2'] = df['reanalysis_precip_amt_kg_per_m2'].rolling(window=5,  min_periods=(1)).median()
df['reanalysis_relative_humidity_percent'] = df['reanalysis_relative_humidity_percent'].rolling(window=5,  min_periods=(1)).mean()
df['reanalysis_sat_precip_amt_mm'] = df['reanalysis_sat_precip_amt_mm'].rolling(window=5,  min_periods=(1)).median()
df['reanalysis_specific_humidity_g_per_kg'] = df['reanalysis_specific_humidity_g_per_kg'].rolling(window=5,  min_periods=(1)).median()
df['station_avg_temp_c'] = df['station_avg_temp_c'].rolling(window=5,  min_periods=(1)).mean()

# df = df.fillna(df.mean())
# df['ndvi_ne']=df['ndvi_ne'].fillna(df['ndvi_nw'])
# df['ndvi_nw']=df['ndvi_nw'].fillna(df['ndvi_se'])
# df['ndvi_se']=df['ndvi_se'].fillna(df['ndvi_sw'])
# df['ndvi_sw']=df['ndvi_sw'].fillna(df['ndvi_ne'])
df = df.fillna(df.mean())

data_sj = df[df['city']=='sj']
data_iq = df[df['city']=='iq']

data_sj = data_sj.drop('city', axis=1)
data_iq = data_iq.drop('city', axis=1)

label_columns = ['city', 'total_cases']
labels = pd.read_csv('dengue_labels_train.csv', usecols=label_columns)


labels_sj = labels[labels['city']=='sj']
labels_iq = labels[labels['city']=='iq']

sj_train_features, sj_test_features, sj_train_labels, sj_test_labels = train_test_split(
    data_sj, labels_sj['total_cases'], test_size=0.25, random_state=0, shuffle=False)

iq_train_features, iq_test_features, iq_train_labels, iq_test_labels = train_test_split(
    data_iq, labels_iq['total_cases'], test_size=0.25, random_state=0, shuffle=False)

In [33]:
# df['week_start_date'] = pd.to_datetime(df['week_start_date'])
# df['week_start_date'] = df['week_start_date'].dt.week

In [11]:
test_features = pd.read_csv('dengue_features_test.csv', usecols=feature_columns)

test_features['precipitation_amt_mm'] = test_features['precipitation_amt_mm'].rolling(window=5, min_periods=(1)).mean()
test_features['reanalysis_air_temp_k'] = test_features['reanalysis_air_temp_k'].rolling(window=5,  min_periods=(1)).median()
test_features['reanalysis_avg_temp_k'] = test_features['reanalysis_avg_temp_k'].rolling(window=5,  min_periods=(1)).mean()
test_features['reanalysis_dew_point_temp_k'] = test_features['reanalysis_dew_point_temp_k'].rolling(window=5,  min_periods=(1)).median()

test_features['reanalysis_precip_amt_kg_per_m2'] = test_features['reanalysis_precip_amt_kg_per_m2'].rolling(window=5,  min_periods=(1)).median()
test_features['reanalysis_relative_humidity_percent'] = test_features['reanalysis_relative_humidity_percent'].rolling(window=5,  min_periods=(1)).mean()
test_features['reanalysis_sat_precip_amt_mm'] = test_features['reanalysis_sat_precip_amt_mm'].rolling(window=5,  min_periods=(1)).median()
test_features['reanalysis_specific_humidity_g_per_kg'] = test_features['reanalysis_specific_humidity_g_per_kg'].rolling(window=5,  min_periods=(1)).median()
test_features['station_avg_temp_c'] = test_features['station_avg_temp_c'].rolling(window=5,  min_periods=(1)).mean()

test_features = test_features.fillna(test_features.mean())

test_features_sj = test_features[test_features['city']=='sj']
test_features_iq = test_features[test_features['city']=='iq']

test_features_sj = test_features_sj.drop('city', axis=1)
test_features_iq = test_features_iq.drop('city', axis=1)

In [12]:
import numpy as np
from sklearn import linear_model
lm1 = linear_model.LinearRegression()

# for sj
X = sj_train_features
y = sj_train_labels
model = lm1.fit(X,y)

predictions_sj = lm1.predict(test_features_sj).astype(int)
predictions_sj = predictions_sj.clip(min=0)

In [13]:
lm2 = linear_model.LinearRegression()
X1 = iq_train_features
y1 = iq_train_labels
model = lm2.fit(X1, y1)

predictions_iq = lm2.predict(test_features_iq).astype(int)
predictions_iq = predictions_iq.clip(min=0)

In [14]:
total_preditions = np.concatenate((predictions_sj, predictions_iq))
np.savetxt("foo.csv", total_preditions, delimiter=",")

In [15]:
sq_pred_val = lm1.predict(sj_test_features).astype(int)
iq_pred_val = lm2.predict(iq_test_features).astype(int)

iq_error = mean_absolute_error(iq_test_labels, iq_pred_val)
sj_error = mean_absolute_error(sj_test_labels, sq_pred_val)

In [16]:
sj_error, iq_error

(22.77777777777778, 9.007692307692308)

In [None]:
# --------------------------------------------- Random Forest ---------------------------------

In [66]:
# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor(n_estimators=1000, max_features='auto',
#                                  max_depth=10, min_samples_leaf=0.005,
#                                  criterion='mae', min_weight_fraction_leaf=0.1
#                                 , warm_start=True)
# model.fit(X,y)

# predictions_sj = model.predict(test_features_sj).astype(int)
# predictions_sj = predictions_sj.clip(min=0)
# numpy.savetxt("foo1.csv", predictions_sj, delimiter=",")

# model.fit(X1, y1)
# predictions_iq = model.predict(test_features_iq).astype(int)
# predictions_iq = predictions_iq.clip(min=0)
# numpy.savetxt("foo2.csv", predictions_iq, delimiter=",")