In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from fancyimpute import KNN

print "Loading data"

# training data
dfs = [pd.read_csv("../data/sub_challenge_1/data_obs_{}.txt".format(i), 
            header=0, index_col=0, sep="\t") for i in range(1, 11)]

# ground truth
ground_truth_table = pd.read_csv("../data/sub_challenge_1/data_true.txt", 
            header=0, index_col=0, sep="\t")

# conver from data frame ot numpy array
datas = [df.values for df in dfs]
ground_truth = ground_truth_table.values

In [63]:
def generate_training_data(data, ground_truth):

    nan_idx_i, nan_idx_j = np.where(np.isnan(data))
    
    y = ground_truth[nan_idx_i, nan_idx_j]
    
    knn = KNN(k=5, verbose=0)
    imputed_data = knn.complete(data)
    
    X = np.zeros((len(nan_idx_i), data.shape[1]))
    for idx, (i, j) in enumerate(zip(nan_idx_i, nan_index_j)):

        x = imputed_data[i]
        x[j] = 0

        X[idx] = x
    
    return X, y

In [77]:
np.arange(10) != 5

array([ True,  True,  True,  True,  True, False,  True,  True,  True,  True], dtype=bool)

In [93]:
i, j = 4, 6
rows = np.append(np.array([1,5]), i)
columns = np.append(np.array([3, 7]), j)

In [94]:
rows.sort()
columns.sort()
rows, columns

(array([1, 4, 5]), array([3, 6, 7]))

In [95]:
from itertools import product

In [96]:
X = np.random.rand(10, 10)

In [97]:
list(product(rows, columns))

[(1, 3), (1, 6), (1, 7), (4, 3), (4, 6), (4, 7), (5, 3), (5, 6), (5, 7)]

In [98]:
np.repeat(rows, len(columns))

array([1, 1, 1, 4, 4, 4, 5, 5, 5])

In [101]:
np.tile(columns, len(rows))

array([3, 6, 7, 3, 6, 7, 3, 6, 7])

In [91]:
X[list(product(rows, columns))]

IndexError: too many indices for array

In [160]:
def generate_training_data_from_ground_truth(ground_truth, num_patterns, grid_size=3):

    num_samples, num_features = ground_truth.shape
    
    idx_i = np.random.choice(num_samples, size=(num_patterns,))
    idx_j = np.random.choice(num_features, size=(num_patterns,))
    
    num_samples_per_target = (2 * grid_size + 1) ** 2 - 1
    
    X = np.zeros((num_patterns, num_samples_per_target))
    
    for i in range(num_patterns):
        
        rows = np.append(np.append(np.random.choice(np.where(np.arange(num_samples) != idx_i[i])[0], size=(grid_size,) ), 
                         idx_i[i]), np.random.choice(np.where(np.arange(num_samples) != idx_i[i])[0], size=(grid_size,) ))
        columns = np.append(np.append(np.random.choice(np.where(np.arange(num_features) != idx_j[i])[0], size=(grid_size,) ),
                            idx_j[i]), np.random.choice(np.where(np.arange(num_features) != idx_j[i])[0], size=(grid_size,) ))
                                   
        I = np.repeat(rows, len(columns))
        J = np.tile(columns, len(rows))
        
        
        x = ground_truth[I, J]
        mask = (I == idx_i[i]) & (J == idx_j[i])
        
        x = x[~mask]
        
        X[i] = x
        
    
    y = ground_truth[idx_i, idx_j]
    
    return X, y

In [161]:
def generate_testing_data(data, grid_size=3):
    
    num_samples, num_features = data.shape
    
    idx_i, idx_j = np.where(np.isnan(data))

    num_samples_per_target = (2 * grid_size + 1) ** 2 - 1
    
    num_patterns = len(idx_i)
    
    X = np.zeros((num_patterns, num_samples_per_target))
    
    for i in range(num_patterns):
        
        rows = np.append(np.append(np.random.choice(np.where(np.arange(num_samples) != idx_i[i])[0], size=(grid_size,) ), 
                         idx_i[i]), np.random.choice(np.where(np.arange(num_samples) != idx_i[i])[0], size=(grid_size,) ))
        columns = np.append(np.append(np.random.choice(np.where(np.arange(num_features) != idx_j[i])[0], size=(grid_size,) ),
                            idx_j[i]), np.random.choice(np.where(np.arange(num_features) != idx_j[i])[0], size=(grid_size,) ))
                                   
        I = np.repeat(rows, len(columns))
        J = np.tile(columns, len(rows))
        
        
        x = ground_truth[I, J]
        mask = (I == idx_i[i]) & (J == idx_j[i])
        
        x = x[~mask]
        
        X[i] = x
    
    return X

In [162]:
X, y = generate_training_data_from_ground_truth(ground_truth, num_patterns=10000000,)

In [157]:
# training_patterns = [generate_training_data(data, ground_truth) for data in datas]

In [158]:
# X_train, y_train = training_patternsining_patterns[0]
# for X, y in training_patterns[1:-1]:
#     X_train = np.append(X_train, X, axis=0)
#     y_train = np.append(y_train, y)
    
# X_test, y_test = training_patterns[-1]

In [None]:
regressor = RandomForestRegressor(n_estimators=100)
regressor.fit(X, y)

In [145]:
data=datas[0]

In [147]:
testing_data = generate_testing_data(data)

In [148]:
testing_data.shape

(190543, 24)

In [149]:
predictions = regressor.predict(testing_data)

In [150]:
predictions[:5]

array([ 23.80940521,  28.83581949,  29.81528266,  26.51793438,  18.00954556])

In [152]:
y_test = ground_truth[np.where(np.isnan(data))]

In [153]:
y_test[:5]

array([  0.       ,  29.7998622,   0.       ,   0.       ,   0.       ])

In [154]:
from sklearn.metrics import mean_squared_error as mse

In [155]:
def rmse (y_pred, y_true): 
    # rsme prediction and ground truth
    return np.sqrt(mse(y_true, y_pred))

In [156]:
rmse(predictions, y_test)

11.684528933187948