In [1]:
import csv
import numpy as np

# Value that later cells first swap for NaN and then for the column mean.
bad_val = np.float64(-999.0)
zero = np.float64(0.0)

# ------------------------------------------------
# User-tunable settings: change these variables if necessary.

# Input files.
train_path = 'train.csv'
test_path = 'test.csv'

# Output files.
train_cleaned_path = 'train_cleaned.csv'
test_cleaned_path = 'test_cleaned.csv'

# normalise() compares each column's standard deviation against this number:
# columns that do not exceed it are only mean-centred, not divided by the
# standard deviation (avoids numerical problems).
normalisation_threshold = 0.1
# ------------------------------------------------

In [2]:
#replace before with provided value in array
def replace(array, before, value):
    """Return a copy of ``array`` with every occurrence of ``before`` set to ``value``.

    Parameters
    ----------
    array : array_like
        Input data of any shape; it is not modified.
    before : scalar
        Value to look for. NaN is supported: because ``nan != nan``, a NaN
        ``before`` is matched with ``np.isnan`` instead of ``==``.
    value : scalar
        Replacement written wherever ``before`` was found.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``array``. Its dtype is the NumPy
        promotion of ``array`` and ``value``, so a float replacement put into
        an integer array is kept intact rather than truncated.
    """
    arr = np.asarray(array)

    # np.isnan rejects non-numeric scalars (e.g. strings); such a `before`
    # can never be NaN, so fall back to plain equality matching.
    try:
        match_nan = bool(np.isnan(before))
    except TypeError:
        match_nan = False

    mask = np.isnan(arr) if match_nan else (arr == before)

    # A boolean mask + np.where runs in C, works on empty input and does not
    # guess the output dtype from the first element the way np.vectorize does.
    return np.where(mask, value, arr)

def mapp(array, f):
    """Apply ``f`` to each element of a 1-D array and return the results.

    Parameters
    ----------
    array : numpy.ndarray
        One-dimensional input; its dtype is reused for the output.
    f : callable
        Function of a single element.

    Returns
    -------
    numpy.ndarray
        Array of ``len(array)`` results with dtype ``array.dtype``.
    """
    # `f` takes exactly one argument; the previous body also passed an
    # undefined name `value`, which raised NameError on any non-empty input.
    return np.fromiter((f(x) for x in array), array.dtype, count=len(array))
        
def avg(array):
    """Return the mean of all entries of ``array``, skipping NaN values."""
    nan_aware_mean = np.nanmean(array)
    return nan_aware_mean

def create_f64_arrays(csv, size):
    """Load the rows of a CSV reader into NumPy arrays.

    The first row yielded by ``csv`` is taken as the header. In every
    following row, field 0 is stored as an integer id, field 1 as a
    one-byte label and the remaining fields as float64 values.

    Parameters
    ----------
    csv : iterator of sequences of str
        Row source, e.g. a ``csv.reader`` (note: this parameter name shadows
        the ``csv`` module inside the function).
    size : int
        Number of data rows to allocate space for (header excluded).

    Returns
    -------
    tuple
        ``(header, index, letters, ret)``: the header row, an int64 array of
        ids, an ``'S1'`` byte-string array of labels (labels longer than one
        character are truncated) and a 2-D float64 array with
        ``len(header) - 2`` columns.

    Raises
    ------
    IndexError
        If the reader yields more than ``size`` data rows.

    Notes
    -----
    If the reader yields fewer than ``size`` rows, the returned arrays are
    trimmed to the rows actually read instead of exposing the uninitialised
    tail of the ``np.empty`` buffers.
    """
    header = next(csv)

    index = np.empty(size, dtype=np.int64)
    # 'S1' is what the removed-in-NumPy-2.0 alias np.string_ produced here.
    letters = np.empty(size, dtype='S1')
    ret = np.empty([size, len(header) - 2], dtype=np.float64)

    rows_read = 0
    for rows_read, row in enumerate(csv, start=1):
        pos = rows_read - 1
        index[pos] = int(row[0])
        letters[pos] = row[1]
        # Explicit str -> float64 conversion of the feature columns.
        ret[pos] = np.asarray(row[2:], dtype=np.float64)

    if rows_read < size:
        # Drop the never-written (garbage) tail of the buffers.
        index = index[:rows_read]
        letters = letters[:rows_read]
        ret = ret[:rows_read]

    return (header, index, letters, ret)
    
    
def to_f64_try(input):
    """Convert ``input`` to ``np.float64`` if possible.

    When the conversion raises ``ValueError`` the original object is handed
    back unchanged; any other exception propagates to the caller.
    """
    try:
        return np.float64(input)
    except ValueError:
        pass
    return input

    

In [3]:
# The csv readers are consumed lazily by a later cell, so the files must stay
# open for now. Keeping the handles in named variables (instead of anonymous
# open() calls) lets them be closed with .close() once loading is finished.
# newline='' is what the csv module documentation asks for on files handed to
# csv.reader, so quoted fields containing line breaks are parsed correctly.
test_file = open(test_path, 'r', newline='')
test = csv.reader(test_file, delimiter=',')

train_file = open(train_path, 'r', newline='')
train = csv.reader(train_file, delimiter=',')

In [4]:
# Number of data rows to allocate per file: the hard-coded totals minus one
# (presumably the line count of each file less its header line; verify
# against the actual files).
test_row_count = 568239 - 1
train_row_count = 250001 - 1

test_header, test_index, test_letters, test_data = create_f64_arrays(test, test_row_count)
train_header, train_index, train_letters, train_data = create_f64_arrays(train, train_row_count)

In [5]:
# Peek at the first feature column of the raw test data.
first_test_column = test_data[:, 0]
print(first_test_column)

[-999.     106.398  117.794 ...,  108.497   96.711   92.373]


In [6]:
# Swap every bad_val entry for NaN in both data sets so that NaN-aware
# statistics can skip those entries.
test_data_nan, train_data_nan = (
    replace(raw_data, bad_val, np.nan) for raw_data in (test_data, train_data)
)

In [7]:
# Fill the NaN entries (the former -999 values) of every column with the
# NaN-ignoring mean of that same column. The arrays are updated in place.
for col in range(test_data_nan.shape[1]):
    column = test_data_nan[:, col]
    test_data_nan[:, col] = replace(column, np.nan, avg(column))
    print(test_data_nan[:, col])

for col in range(train_data_nan.shape[1]):
    column = train_data_nan[:, col]
    train_data_nan[:, col] = replace(column, np.nan, avg(column))

[ 121.87172934  106.398       117.794      ...,  108.497        96.711
   92.373     ]
[ 79.589  67.49   56.226 ...,   9.837  20.006  80.109]
[ 23.916  87.949  96.358 ...,  65.149  66.942  77.619]
[  3.036  49.994   4.137 ...,  18.006  29.761   3.984]
[ 2.40501628  2.40501628  2.40501628 ...,  2.40501628  2.40501628
  2.40501628]
[ 372.35542865  372.35542865  372.35542865 ...,  372.35542865  372.35542865
  372.35542865]
[-0.83183271 -0.83183271 -0.83183271 ..., -0.83183271 -0.83183271
 -0.83183271]
[ 0.903  2.048  2.755 ...,  2.742  2.479  2.486]
[  3.036   2.679   4.137 ...,  18.006   2.739   3.984]
[  56.018  132.865   97.6   ...,   68.097  101.676   77.348]
[ 1.536  1.777  1.096 ...,  0.642  0.936  1.683]
[-1.404 -1.204 -1.408 ..., -0.836  1.41  -1.346]
[ 0.4585633  0.4585633  0.4585633 ...,  0.4585633  0.4585633  0.4585633]
[ 22.088  30.716  46.564 ...,  41.478  36.575  28.83 ]
[-0.54  -1.784 -0.298 ...,  1.057 -1.249  1.8  ]
[-0.609  3.054  3.079 ...,  1.025 -1.54  -1.138]
[ 33.93

In [8]:
def normalise(array, threshold=None):
    """Standardise the columns of a 2-D array.

    Every column is mean-centred. A column is additionally divided by its
    standard deviation only when that standard deviation is greater than
    ``threshold``; columns with a smaller (or NaN) standard deviation are
    left centred but unscaled, which avoids dividing by a value close to zero.

    Parameters
    ----------
    array : array_like
        Input data with samples in rows and features in columns. It is not
        modified.
    threshold : float, optional
        Minimum standard deviation a column needs in order to be scaled.
        Defaults to the module-level ``normalisation_threshold``.

    Returns
    -------
    numpy.ndarray
        New float64 array with the same shape as ``array``.
    """
    if threshold is None:
        threshold = normalisation_threshold

    data = np.asarray(array, dtype=np.float64)
    means = data.mean(axis=0)
    stds = data.std(axis=0)

    # Dividing by exactly 1.0 leaves low-variance columns merely centred.
    # A NaN std compares False and therefore also takes the 1.0 branch.
    divisors = np.where(stds > threshold, stds, 1.0)
    return (data - means) / divisors

# Normalise each imputed data set independently, using its own column statistics.
test_data_norm, train_data_norm = (
    normalise(imputed) for imputed in (test_data_nan, train_data_nan)
)

In [9]:
def write_new_csv(name, header, index, letters, data):
    """Write a header row followed by ``index, letter, *data_row`` lines to a CSV file.

    Parameters
    ----------
    name : str
        Path of the file to create (an existing file is overwritten).
    header : sequence
        Column names written as the first row.
    index : array_like
        One id per row.
    letters : array_like
        One label per row. Byte strings are decoded to text before writing.
    data : array_like
        Numeric values, one row per id; a 1-D input is treated as a single
        column.

    Raises
    ------
    ValueError
        If ``index``, ``letters`` and ``data`` do not have the same number
        of rows.

    Notes
    -----
    Numbers are written with their full string representation. Casting them
    to a 9-character string dtype cuts long values short and drops the
    exponent of values in scientific notation, silently changing their
    magnitude.
    """
    index_col = np.asarray(index).tolist()
    # astype(str) turns b's' into 's' instead of the literal text "b's'".
    letters_col = np.asarray(letters).astype(str).tolist()
    values = np.asarray(data)
    if values.ndim == 1:
        values = values[:, np.newaxis]

    if not (len(index_col) == len(letters_col) == len(values)):
        raise ValueError("index, letters and data must have the same number of rows")

    # The context manager guarantees the file is flushed and closed;
    # newline='' stops the csv module's line endings from being translated.
    with open(name, 'w', newline='') as fp:
        wr = csv.writer(fp, delimiter=',',
                        quotechar='|', quoting=csv.QUOTE_MINIMAL)
        wr.writerow(header)
        # Rows are converted one at a time to keep peak memory low.
        wr.writerows(
            [row_id, letter, *row.tolist()]
            for row_id, letter, row in zip(index_col, letters_col, values)
        )
    
    
    
# Persist the normalised data sets next to their original header, ids and labels.
for out_path, out_header, out_index, out_letters, out_data in (
    (test_cleaned_path, test_header, test_index, test_letters, test_data_norm),
    (train_cleaned_path, train_header, train_index, train_letters, train_data_norm),
):
    write_new_csv(out_path, out_header, out_index, out_letters, out_data)

In [10]:
# Sanity check: print the mean of each of the first 30 columns of the
# normalised test data.
for col in range(30):
    col_mean = np.average(test_data_norm[:, col])
    print(col_mean)

-3.94598691259e-16
-2.45834847107e-16
-6.11542316889e-16
5.75198523241e-19
-3.92110332431e-16
5.69971719269e-16
-1.457002868e-16
4.30586111909e-16
2.7868368451e-16
2.4641004563e-16
8.07028536739e-17
5.32746371363e-17
-9.01411111814e-16
2.67317261519e-16
-0.0119663943629
-0.0153522256519
5.74323221141e-17
-0.0188911706011
0.0520638271288
-1.90315685299e-17
-0.00798097804089
7.74642359017e-16
-2.50836573396e-17
-9.23068586645e-16
-0.000355455132343
-0.0217031457245
-1.01897668824e-16
-0.0043993786342
-0.0143666541749
-4.19907426282e-16
