In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import zipfile,time, os

import numpy as np

import pandas as pd

from joblib import Parallel, delayed

In [3]:
def split(i, p=','):
    """Parse one delimited text line into a list of floats.

    Parameters
    ----------
    i : str
        The raw line to parse.
    p : str, optional
        Field delimiter handed to ``str.split`` (default ``','``).

    Returns
    -------
    list of float
        One entry per field, in order. Any field that ``float()`` rejects
        with ``ValueError`` (for example an empty field) becomes NaN.
    """
    res = []
    for n in i.split(p):
        try:
            res.append(float(n))
        except ValueError:
            # ``np.NAN`` was removed in NumPy 2.0; the lowercase ``np.nan``
            # is the spelling available on every NumPy version.
            res.append(np.nan)
    return res

In [4]:
def read_file(filename, is_zip=False, fillna='mean', limit=None):
    '''Read a comma-separated data file (plain or zipped) into a DataFrame.

    The file content is split on whitespace into lines; the first line
    supplies the column names and every following line is parsed into floats
    by ``split``. The ``Id`` column becomes the index.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    is_zip : bool, optional
        Force zip handling. Files whose extension is ``.zip`` (any case) are
        treated as zip archives regardless; only the first archive member is
        read.
    fillna : {'mean', 'avg', 'average', 'median'}, None or other, optional
        How missing values are filled. ``'mean'``/``'avg'``/``'average'`` and
        ``'median'`` first use the statistic of the rows sharing the same
        ``Id`` and then the column-wide statistic for whatever is still
        missing. ``None`` returns the frame unfilled. Any other value is
        passed straight to ``DataFrame.fillna``.
    limit : int or None, optional
        If truthy, keep only the first ``limit`` whitespace-separated lines,
        the header line included.

    Returns
    -------
    pandas.DataFrame
        The parsed (and, unless ``fillna`` is None, filled) data indexed by
        ``Id``.
    '''
    print("Reading data ...")

    # Compare the extension case-insensitively so ``.ZIP`` is recognised too.
    ext = os.path.splitext(filename)[1].lower()

    start = time.time()
    if is_zip or ext == '.zip':
        with zipfile.ZipFile(filename) as f:
            str_data = f.read(f.namelist()[0])
    else:
        with open(filename) as f:
            str_data = f.read()

    # ZipFile.read returns bytes. Decode explicitly rather than through
    # ``pd.compat.to_str``, which is not available in current pandas.
    if isinstance(str_data, bytes):
        str_data = str_data.decode('utf-8')

    # Splitting on any whitespace means a field must not contain spaces.
    splitted_data = str_data.split()
    if limit:
        splitted_data = splitted_data[:limit]

    columns_names = splitted_data[0].split(',')
    struct_data = Parallel(n_jobs=-1)(delayed(split)(i) for i in splitted_data[1:])
    df = pd.DataFrame(struct_data, columns=columns_names).set_index('Id')
    # The backslash is escaped so the printed text stays exactly as before;
    # the former bare "\ " was an invalid escape sequence.
    print("Data read ...\\ \nTook: ", time.time() - start)

    if fillna is None:
        return df

    print("Filling NaN values with {}".format(fillna))

    if fillna in ("mean", "avg", "average"):
        # transform() broadcasts each Id's statistic back onto its own rows,
        # giving a frame aligned with df without a lookup by index label.
        per_id = df.groupby(level=0).transform("mean")
        filled = df.fillna(per_id).fillna(df.mean())
    elif fillna == "median":
        per_id = df.groupby(level=0).transform("median")
        filled = df.fillna(per_id).fillna(df.median())
    else:
        filled = df.fillna(fillna)

    return filled

In [5]:
# Load the training archive. limit=60000 keeps the header line plus at most
# 59999 data lines; missing values are filled with the default fillna='mean'.
df_train = read_file('data/train.zip', limit=60000)
# Display 5 randomly sampled rows. No random_state is passed, so the rows
# shown change from run to run.
df_train.sample(5)

Reading data ...
Data read ...\ 
Took:  8.887539386749268
Filling NaN values with mean


Unnamed: 0_level_0,minutes_past,radardist_km,Ref,Ref_5x5_10th,Ref_5x5_50th,Ref_5x5_90th,RefComposite,RefComposite_5x5_10th,RefComposite_5x5_50th,RefComposite_5x5_90th,...,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th,Expected
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2405.0,52.0,7.0,25.277778,24.166667,26.555556,27.35,7.0,23.954545,26.045455,6.5,...,1.048333,-0.078125,-0.089286,0.256944,1.125,0.507139,-2.51167,-0.050003,2.648745,1.524001
401.0,26.0,15.0,23.431635,20.588808,23.113441,26.131171,25.146985,22.701278,24.846279,27.692052,...,1.014645,0.604019,-0.581839,0.4248,2.026942,-0.027961,-3.440595,-0.444247,3.862786,0.508
2670.0,4.0,15.0,23.431635,20.588808,23.113441,26.131171,25.146985,22.701278,24.846279,27.692052,...,1.014645,0.604019,-0.581839,0.4248,2.026942,-0.027961,-3.440595,-0.444247,3.862786,14.000007
724.0,21.0,10.0,8.5,20.588808,23.113441,10.0,8.5,22.701278,8.5,10.0,...,0.952778,2.3125,-0.581839,0.4248,2.0,-0.027961,-3.440595,-0.444247,3.862786,0.02
3709.0,16.0,11.0,32.0,28.0,30.5,34.0,32.0,30.0,31.5,35.0,...,0.988333,1.0625,-0.25,0.625,3.125,2.110001,-3.520004,-0.350006,3.48999,0.254


In [6]:
# Write the filled training frame to a CSV file in the working directory.
df_train.to_csv("df_train.csv")

In [7]:
# Drop the name bound to the training frame — presumably to let its memory be
# reclaimed before the test set is loaded.
del df_train

In [8]:
# Load the test archive. limit=5000 keeps the header line plus at most
# 4999 data lines; missing values are filled with the default fillna='mean'.
df_test = read_file('data/test.zip', limit=5000)
# Display 5 randomly sampled rows. No random_state is passed, so the rows
# shown change from run to run.
df_test.sample(5)

Reading data ...
Data read ...\ 
Took:  4.028555393218994
Filling NaN values with mean


Unnamed: 0_level_0,minutes_past,radardist_km,Ref,Ref_5x5_10th,Ref_5x5_50th,Ref_5x5_90th,RefComposite,RefComposite_5x5_10th,RefComposite_5x5_50th,RefComposite_5x5_90th,...,RhoHV_5x5_50th,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
290.0,54.0,12.0,22.415189,19.801556,22.104916,25.006138,24.056393,21.990426,23.729332,26.315527,...,0.981306,1.01558,0.476188,-0.668302,0.243524,1.848817,0.331193,-3.232344,-0.392695,3.897392
331.0,24.0,8.0,9.166667,5.5,5.75,7.7,9.166667,5.5,5.75,7.7,...,0.955,1.051667,4.9375,-0.668302,2.6875,4.9375,0.331193,-3.232344,-0.392695,3.897392
291.0,23.0,12.0,12.5,19.801556,16.0,21.5,12.5,11.5,16.0,21.5,...,1.0,1.051667,1.5625,-0.75,-0.28125,1.5625,0.709991,-3.880005,-1.150002,2.169998
451.0,46.0,10.0,22.415189,19.801556,22.104916,25.006138,24.056393,21.990426,23.729332,26.315527,...,0.981306,1.01558,0.476188,-0.668302,0.243524,1.848817,0.331193,-3.232344,-0.392695,3.897392
3.0,3.0,11.0,22.5,15.0,25.5,34.5,29.5,15.0,27.5,36.5,...,0.951667,0.971667,0.5,-0.25,0.5625,2.375,1.409988,-5.320007,0.349991,3.869995


In [9]:
# Write the filled test frame to a CSV file in the working directory.
df_test.to_csv("df_test.csv")

In [10]:
# Drop the name bound to the test frame — presumably to let its memory be
# reclaimed now that it has been saved.
del df_test