In [1]:
import numpy as np
import pandas as pd

# Preprocessing

In [2]:
# Column names assigned to the header-less CSV files, in file order.
COLUMNS = [
    'station',
    'date',
    'feature',
    'value',
    'measurement',
    'quality',
    'source',
    'hour',
]

In [6]:
# Load the 2014 file; it has no header row, so the column names come from COLUMNS.
df = pd.read_csv(
    '../data/2014.csv',
    names=COLUMNS,
    header=None,
)

In [7]:
# Stack the 2015-2017 files underneath the frame loaded so far.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and every
# call copied the whole accumulated frame; a single pd.concat copies once.
# Like append without ignore_index, concat keeps each file's own row labels.
df = pd.concat(
    [df] + [
        pd.read_csv('../data/{}.csv'.format(year), header=None, names=COLUMNS)
        for year in (2015, 2016, 2017)
    ]
)

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Only keep features that are measured more often than 2.5% of the count of the most measured feature.
# Full candidate list (currently disabled):
#selected_features = ['AWND','PRCP','SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'WDF2']
# Currently restricted to the single feature 'TMIN'.
selected_features = ['TMIN']
# Open question: how should each feature be aggregated?
#selected_features_mean = ['AWND','PRCP','SNOW', 'SNWD', 'TAVG', 'WDF2']
#selected_features_max = 'TMAX'
#selected_features_min = 'TMIN'

In [None]:
# Keep only the rows whose 'feature' value is one of the selected features.
is_selected = df['feature'].isin(selected_features)
df = df[is_selected]

In [None]:
#df_sub = df[:100000]
#df[df['station']=='AE000041196'][df['feature']=='PRCP'].head(1000000)
#Some weather stations measure some values, such as PRCP, in extremely irregular patterns or don't measure them at all.
#Other features probably have very similar issues.

In [None]:
# Reshape to one row per (station, date) and one column per feature; when a
# (station, date, feature) combination occurs more than once, keep the minimum.
# The string 'min' selects pandas' built-in reduction; passing the np.min
# callable instead emits a FutureWarning on recent pandas versions.
df_pivot = df.pivot_table(
    index=['station', 'date'],
    columns='feature',
    values='value',
    aggfunc='min',
)

In [None]:
# Station metadata (location information): semicolon-separated file without a header row.
df_stations = pd.read_csv(
    '../data/ghcnd-stations.csv',
    sep=';',
    names=['station', 'lat', 'long', 'elev'],
    header=None,
)

In [None]:
# Use the station id as the row index of the station metadata.
df_stations = df_stations.set_index(keys='station')

In [None]:
#Deciding on average, min or max per feature and day.
#df_red = df_pivot['mean']
#df_red = df_red.drop(['TMAX', 'TMIN'], axis=1)
#df_red = df_red.join(df_pivot['amax']['TMAX'])
#df_red = df_red.join(df_pivot['amin']['TMIN'])
#df_red.head()

In [None]:
# Take the first 100000 (station, date) entries of the TMIN column and move
# station/date out of the index into ordinary columns.
df_time = df_pivot['TMIN'].head(100000).reset_index()
df_time.head()

In [None]:
# Distinct station ids in order of first appearance.
# `.values` on the column yields one entry per row, so any loop over it
# repeats the same per-station work once for every row of that station;
# `.unique()` keeps each station exactly once.
stations = df_time['station'].unique()

In [None]:
def derive_nth_day_feature(data, feature, N):
    """Add a lagged copy of `feature`, shifted by N rows within each station.

    A new column named "<feature>_<N>" is written into `data` in place. For
    each row it holds the value of `feature` found N rows earlier among the
    rows of the same station; rows with fewer than N predecessors in their
    station get NaN.

    The lag is positional (N rows back), not calendar-based: it equals
    "N days earlier" only if each station's rows are in date order with no
    missing days -- NOTE(review): confirm this holds for the input.

    The previous implementation looped `range(first_row + N, row_count)`,
    using the station's row count as an absolute end position, so only the
    first station was filled correctly. Grouping by station and shifting
    fixes that and no longer needs a module-level `stations` array.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'station' column and the `feature` column.
    feature : str
        Name of the column to lag.
    N : int
        Number of rows to look back.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with the new column added.
    """
    col_name = "{}_{}".format(feature, N)
    # shift() inside each station group never carries a value across a
    # station boundary, and the result stays aligned with data's index.
    data[col_name] = data.groupby('station')[feature].shift(N)
    return data

In [None]:
# Add the lag columns TMIN_1 .. TMIN_4; df_train is whatever the last call returns.
for N in range(1, 5):
    df_train = derive_nth_day_feature(data=df_time, feature='TMIN', N=N)

df_train.head()

In [None]:
# Add the lag columns TMIN_5 .. TMIN_9; df_train is whatever the last call returns.
for N in range(5, 10):
    df_train = derive_nth_day_feature(data=df_time, feature='TMIN', N=N)

df_train.head()

# Finally getting our hands dirty

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum, adagrad
from nolearn.lasagne import NeuralNet
from scipy.special import expit
import random
from sklearn.neural_network import BernoulliRBM
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from sklearn.manifold import LocallyLinearEmbedding, MDS
from sklearn.decomposition import TruncatedSVD
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.svm import LinearSVC, SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier

In [None]:
def load_train_data(path, n_stack=10000, n_rows=None):
    """Load, shuffle and scale the training CSV and add stacked-model features.

    The first column of the file is dropped, the last column is used as the
    label, and everything in between is the feature matrix. Features are
    transformed with log(1 + x) and standardised. The first `n_stack`
    shuffled rows train three classifiers (SVC, random forest, gradient
    boosting). For the remaining rows, the element-wise geometric mean of the
    three predicted class-probability matrices is appended as extra feature
    columns.

    Changes from the earlier version:
    * The upper row bound was a module-level `LINES` constant that is only
      defined in a later cell, so the first call raised NameError on a fresh
      kernel. It is now the `n_rows` parameter, defaulting to all rows.
    * The stacking slice was `0:9999`, which silently left row 9999 unused;
      it is now the full first `n_stack` rows.
    * `max_features='auto'` was removed from RandomForestClassifier in
      scikit-learn 1.3; 'sqrt' is the value 'auto' meant for classifiers.

    Parameters
    ----------
    path : str or file-like
        CSV file passed to `pd.read_csv`.
    n_stack : int, optional
        Number of leading (shuffled) rows reserved for fitting the three
        stacked classifiers.
    n_rows : int or None, optional
        Exclusive upper row bound of the returned slice; None means use
        every row after `n_stack`.

    Returns
    -------
    tuple
        (X, y, encoder, scaler, rbm1, rbm2, rbm3): X and y cover rows
        `n_stack:n_rows`; encoder is the fitted LabelEncoder; scaler is the
        fitted StandardScaler; rbm1/rbm2/rbm3 are the fitted SVC, random
        forest and gradient boosting models (names kept for the callers).
    """
    raw = pd.read_csv(path).values.copy()
    # Row shuffle in place; draws from numpy's global RNG, so seed it beforehand
    # for reproducible splits.
    np.random.shuffle(raw)
    X, labels = raw[:, 1:-1].astype(np.float32), raw[:, -1]

    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(np.log(1 + X))

    if n_rows is None:
        n_rows = len(X)
    X_stack, y_stack = X[:n_stack], y[:n_stack]
    X_rest, y_rest = X[n_stack:n_rows], y[n_stack:n_rows]

    rbm1 = SVC(C=100.0, gamma=0.1, probability=True, verbose=1).fit(X_stack, y_stack)
    rbm2 = RandomForestClassifier(
        n_estimators=300, criterion='entropy', max_features='sqrt',
        bootstrap=False, oob_score=False, n_jobs=1, verbose=1,
    ).fit(X_stack, y_stack)
    rbm3 = GradientBoostingClassifier(
        n_estimators=50, max_depth=11, subsample=0.8, min_samples_leaf=5, verbose=1,
    ).fit(X_stack, y_stack)

    # Geometric mean of the three probability matrices, appended column-wise.
    blended = np.power(
        rbm1.predict_proba(X_rest) * rbm2.predict_proba(X_rest) * rbm3.predict_proba(X_rest),
        (1 / 3.0),
    )
    X_rest = np.append(X_rest, blended, 1)
    return X_rest, y_rest, encoder, scaler, rbm1, rbm2, rbm3

def load_test_data(path, scaler, rbm1, rbm2, rbm3):
    """Load the test CSV and build the same feature layout as the training data.

    The first column is returned as string ids; all remaining columns are the
    features. Features go through log(1 + x) and the already-fitted `scaler`,
    then the element-wise geometric mean of the three models' predicted
    probabilities is appended as extra columns.

    Returns
    -------
    tuple
        (X, ids): the augmented feature matrix and the array of id strings.
    """
    raw = pd.read_csv(path).values.copy()
    ids = raw[:, 0].astype(str)
    features = raw[:, 1:].astype(np.float32)
    features = scaler.transform(np.log(1 + features))

    # Product of the three probability matrices, then cube root.
    joint = (
        rbm1.predict_proba(features)
        * rbm2.predict_proba(features)
        * rbm3.predict_proba(features)
    )
    augmented = np.append(features, np.power(joint, (1 / 3.0)), 1)
    return augmented, ids

In [None]:
# Load the cleaned training and test sets. The scaler and the three fitted
# models returned for the training data are reused to transform the test data.
(X, y, encoder, scaler,
 rbm1, rbm2, rbm3) = load_train_data('/data/train.csv')
X_test, ids = load_test_data('/data/test.csv', scaler, rbm1, rbm2, rbm3)

num_classes = len(encoder.classes_)
num_features = X.shape[1]

print(num_classes)
print(num_features)
print(X)

In [None]:
# Seed both Python's `random` module and NumPy's global random generator.
# NOTE(review): this cell sits below the cell that first calls load_train_data,
# so on a top-to-bottom run that first call happens before the seeds are set
# and before LINES exists. Consider moving this cell above it.
random.seed(21)
np.random.seed(21)

# Row-count constant.
# NOTE(review): presumably tied to the number of rows in the training file -- confirm.
LINES = 61877

In [None]:
def make_submission(y_prob, ids, encoder, name='/data/lasagneSeed21.csv'):
    """Write predicted class probabilities to a CSV submission file.

    The file starts with a header line "id,<class 1>,<class 2>,..." taken
    from `encoder.classes_`, followed by one line per sample: the sample id
    and then its probabilities, all comma-separated.

    Parameters
    ----------
    y_prob : iterable of array-like
        One row of class probabilities per sample; each row must provide
        `.tolist()` (e.g. a NumPy array row).
    ids : iterable
        Sample ids, in the same order as `y_prob`.
    encoder : object
        Anything exposing `classes_`, the class labels in column order.
    name : str, optional
        Destination path of the CSV file.
    """
    with open(name, 'w') as f:
        f.write('id,')
        f.write(','.join(str(label) for label in encoder.classes_))
        f.write('\n')
        for sample_id, probs in zip(ids, y_prob):
            # On Python 3 `map` returns an iterator, so the previous
            # `[id] + map(str, ...)` raised TypeError; build a real list.
            fields = [str(sample_id)] + [str(p) for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print("Wrote submission to file {}.".format(name))

In [None]:
# Layer stack: input -> dropout -> three dense layers (with dropout after the
# first two) -> dense output layer.
layers0 = [
    ('input', InputLayer),
    ('dropoutf', DropoutLayer),
    ('dense0', DenseLayer),
    ('dropout', DropoutLayer),
    ('dense1', DenseLayer),
    ('dropout2', DropoutLayer),
    ('dense2', DenseLayer),
    ('output', DenseLayer),
]

net0 = NeuralNet(
    layers=layers0,

    # Per-layer settings, keyed as "<layer name>_<parameter>".
    input_shape=(None, num_features),
    dropoutf_p=0.1,
    dense0_num_units=600,
    dropout_p=0.3,
    dense1_num_units=600,
    dropout2_p=0.1,
    dense2_num_units=600,

    # Softmax output with one unit per class.
    output_num_units=num_classes,
    output_nonlinearity=softmax,

    # Optimiser and training settings.
    #update=nesterov_momentum,
    update=adagrad,
    update_learning_rate=0.008,
    eval_size=0.2,
    verbose=1,
    max_epochs=20,
)

# Fit on the training data and predict class probabilities for the test set.
net0.fit(X, y)
y_prob = net0.predict_proba(X_test)

In [None]:
# Repeat the full load / fit / predict cycle num_runs more times and sum the
# test-set probabilities onto the y_prob computed by the first fit.
# `range` replaces the Python-2-only `xrange`, which raises NameError on Python 3.
num_runs = 50
for jj in range(num_runs):
    print(jj)
    X, y, encoder, scaler, rbm1, rbm2, rbm3 = load_train_data('/data/train.csv')
    X_test, ids = load_test_data('/data/test.csv', scaler, rbm1, rbm2, rbm3)
    num_classes = len(encoder.classes_)
    num_features = X.shape[1]
    net0.fit(X, y)
    y_prob = y_prob + net0.predict_proba(X_test)


# y_prob already held one prediction before the loop, hence num_runs + 1 terms.
y_prob = y_prob/(num_runs+1.0)
make_submission(y_prob, ids, encoder)