In [2]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

# Load dataset 
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sample = pd.read_csv('../input/sampleSubmission.csv')
weather = pd.read_csv('../input/weather.csv')

# Get labels
labels = train.WnvPresent.values

# Not using codesum for this benchmark
weather = weather.drop('CodeSum', axis=1)

# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

# Functions to extract month and day from dataset
# You can also use parse_dates of Pandas.
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]

train['month'] = train.Date.apply(create_month)
train['day'] = train.Date.apply(create_day)
test['month'] = test.Date.apply(create_month)
test['day'] = test.Date.apply(create_day)

# Add integer latitude/longitude columns
train['Lat_int'] = train.Latitude.apply(int)
train['Long_int'] = train.Longitude.apply(int)
test['Lat_int'] = test.Latitude.apply(int)
test['Long_int'] = test.Longitude.apply(int)

# drop address columns
train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1)
test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)

# Merge with weather data
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)

# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train['Species'].values) + list(test['Species'].values))
train['Species'] = lbl.transform(train['Species'].values)
test['Species'] = lbl.transform(test['Species'].values)

lbl.fit(list(train['Street'].values) + list(test['Street'].values))
train['Street'] = lbl.transform(train['Street'].values)
test['Street'] = lbl.transform(test['Street'].values)

lbl.fit(list(train['Trap'].values) + list(test['Trap'].values))
train['Trap'] = lbl.transform(train['Trap'].values)
test['Trap'] = lbl.transform(test['Trap'].values)

# drop columns with -1s
train = train.ix[:,(train != -1).any(axis=0)]
test = test.ix[:,(test != -1).any(axis=0)]

# Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000)
clf.fit(train, labels)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [2]:
import numpy as np
import csv
import sys
import math
import xgboost as xgb

Ntrain = 10506
Ntest = 116293
Nfea = 13
MISSING = 999.0
# Feature: Month, Week, Latitude, Longitude, NumMosq in Nearest Area, Near Dis, TMax, Tmin, Tavg, WaterBub, Dry, StnPressure

Xtrain = np.zeros((Ntrain, Nfea), dtype=np.float32)
Ytrain = []
Xtest = np.zeros((Ntest, Nfea), dtype=np.float32)

train_head = ""
spray_head = ""
weather_head = ""
weather_dic = {}
train_dic = {}


def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x))

# build weather dic
fi = csv.reader(open("../input/weather.csv"))
weather_head = fi.__next__()
for line in fi:
    # simply discard station 1
    if line[0] == '1':
        continue
    weather_dic[line[1]] = line

# build train dic
fi = csv.reader(open("../input/train.csv"))
train_head = fi.__next__()
for line in fi:
    idx = train_head.index("Date")
    date = line[idx].split('-')
    key = "%s-%d" % (date[1], int(date[2]) / 7)
    if key not in train_dic:
        train_dic[key] = []
    train_dic[key].append(line)

def gen_month(line, head=train_head):
    idx = head.index("Date")
    date = line[idx].split('-')
    return float(date[1])

def gen_week(line, head=train_head):
    idx = head.index("Date")
    date = line[idx].split('-')
    return int(date[1]) * 4 + int(date[2]) / 7

def gen_latitude(line, head=train_head):
    idx = head.index("Latitude")
    return float(line[idx])

def gen_longitude(line, head=train_head):
    idx = head.index("Longitude")
    return float(line[idx])

def gen_tmax(line, head=train_head):
    idx1 = weather_head.index("Tmax")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_tmin(line, head=train_head):
    idx1 = weather_head.index("Tmin")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_tavg(line, head=train_head):
    idx1 = weather_head.index("Tavg")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_water(line, head=train_head):
    idx1 = weather_head.index("DewPoint")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_snow(line, head=train_head):
    idx1 = weather_head.index("WetBulb")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_pressure(line, head=train_head):
    idx1 = weather_head.index("StnPressure")
    idx2 = head.index("Date")
    return float(weather_dic[line[idx2]][idx1])

def gen_moisq(line, head=train_head):
    idx = train_head.index("NumMosquitos")
    idx1 = head.index("Date")
    #idx2 = train_head.index("NumMosquitos")
    idx3 = head.index("Latitude")
    idx4 = head.index("Longitude")
    train_idx3 = train_head.index("Latitude")
    train_idx4 = train_head.index("Longitude")
    lati = float(line[idx3])
    logi = float(line[idx4])
    date = line[idx1].split('-')
    key = "%s-%d" % (date[1], int(date[2]) / 7)
    min_dis = MISSING
    sol = MISSING
    second_dis = MISSING
    sol2 = MISSING
    temp = []
    tmp = {}
    for line in train_dic[key]:
        dis = (float(line[train_idx3]) - lati) ** 2 + (float(line[train_idx4]) - logi) ** 2
        res = int(line[idx])
        temp.append((dis, res))
        if int(dis) not in tmp:
            tmp[int(dis)] = []
        tmp[int(dis)].append(res)
    temp = sorted(temp, key=lambda s:s[0])
    try:
        min_dis = temp[0][0]
        sol = sum(tmp[int(min_dis)])
        sol /= len(tmp[int(min_dis)]) * 1.0
        second_dis = min_dis
        for item in temp:
            if item[0] != second_dis:
                second_dis = item[0]
                break
        sol2 = sum(tmp[int(second_dis)])
        sol2 /= len(tmp[int(second_dis)]) * 1.0
    except:
        pass
    return (min_dis, sol, second_dis, sol2)

# build train
fi = csv.reader(open("../input/train.csv"))
fi.__next__()
i = 0

sum_wneg = 0.0
sum_wpos = 0.0
#print "make training data"
for line in fi:
    Xtrain[i][0] = gen_snow(line)
    Xtrain[i][1] = gen_tavg(line)
    Xtrain[i][2] = gen_tmax(line)
    Xtrain[i][3] = gen_tmin(line)
    Xtrain[i][4] = gen_week(line)
    #Xtrain[i][5] = gen_moisq(line)
    Xtrain[i][6] = gen_month(line)
    Xtrain[i][7] = gen_water(line)
    Xtrain[i][8] = gen_latitude(line)
    Xtrain[i][9] = gen_longitude(line)
    mos = gen_moisq(line)
    Xtrain[i][5] = mos[0]
    Xtrain[i][10] = mos[1]
    Xtrain[i][11] = mos[2]
    Xtrain[i][12] = mos[3]
    label = int(line[train_head.index("WnvPresent")])
    Ytrain.append(label)
    if label == 0:
        sum_wneg += 1.0
    else:
        sum_wpos += 1.0
    i += 1

#print "make test data"
ids = []
fi = csv.reader(open("../input/test.csv"))
test_head = fi.__next__()

i = 0

for line in fi:
    ids.append(line[0])
    Xtest[i][0] = gen_snow(line, test_head)
    Xtest[i][1] = gen_tavg(line, test_head)
    Xtest[i][2] = gen_tmax(line, test_head)
    Xtest[i][3] = gen_tmin(line, test_head)
    Xtest[i][4] = gen_week(line, test_head)
    #Xtrain[i][5] = gen_moisq(line)
    Xtest[i][6] = gen_month(line, test_head)
    Xtest[i][7] = gen_water(line, test_head)
    Xtest[i][8] = gen_latitude(line, test_head)
    Xtest[i][9] = gen_longitude(line, test_head)
    mos = gen_moisq(line, test_head)
    Xtest[i][5] = mos[0]
    Xtest[i][10] = mos[1]
    Xtest[i][11] = mos[2]
    Xtest[i][12] = mos[3]
    i += 1

#print "training"
dtrain = xgb.DMatrix(Xtrain, label=Ytrain, missing = MISSING)
dtest = xgb.DMatrix(Xtest, missing = MISSING)
param = {}
# use logistic regression loss, use raw prediction before logistic transformation
# since we only need the rank
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['eta'] = 0.1
param['max_depth'] = 7
param['eval_metric'] = 'auc'
param['silent'] = 1
param['min_child_weight'] = 100
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['nthread'] = 4

num_round = 50

#xgb.cv(param, dtrain, num_round, nfold=5)
bst = xgb.train(param, dtrain, num_round)

#print "testing"
ypred = bst.predict(dtest)

fo = csv.writer(open("submission_test.csv", "w"), lineterminator="\n")
fo.writerow(["Id","WnvPresent"])
i = 0
for item in ids:
    fo.writerow([ids[i], sigmoid(ypred[i])])
    i += 1


In [15]:
import pandas as pd
submit = pd.DataFrame({"Id": ids,"WnvPresent":ypred})
submit.to_csv("testsubmit.csv",index=False)

In [18]:
pd.read_csv("testsubmit.csv")
#samplesub = pd.read_csv('../input/sampleSubmission.csv')
#samplesub

Unnamed: 0,Id,WnvPresent
0,1,-3.968497
1,2,-3.968497
2,3,-3.968497
3,4,-3.968497
4,5,-3.968497
5,6,-3.968497
6,7,-3.968497
7,8,-3.968497
8,9,-4.061266
9,10,-4.061266
