In [1]:
import pandas as pd
import dask.dataframe as dd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
train_df = pd.read_csv('train.csv', chunksize=100000)

In [33]:
max_vals = {}
min_vals = {}
float_cols = set()

for chunk in train_df:
    chunk = chunk.select_dtypes(exclude='object')

    for column in chunk.columns:
        if chunk[column].dtype == np.float:
            float_cols.add(column)
            
        max_value = chunk[column].max()
        max_vals[column] = max(max_vals.get(column, max_value), max_value)

        min_value = chunk[column].min()
        min_vals[column] = min(min_vals.get(column, min_value), min_value)

  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
float_dtypes = [np.float16, np.float32, np.float64]
uint_dtypes = [np.uint8, np.uint16, np.uint32, np.uint64]
int_dtypes = [np.int8, np.int16, np.int32, np.int64]

float_finfo = [np.finfo(el) for el in float_dtypes]
uint_iinfo = [np.iinfo(el) for el in uint_dtypes]
int_iinfo = [np.iinfo(el) for el in int_dtypes]


def choose_type(column, dtypes, dtype_infos):
    for dtype, dtype_info in zip(dtypes, dtype_infos):
        if max_vals[column] <= dtype_info.max and \
           min_vals[column] >= dtype_info.min:
            return dtype

        
def get_suitable_dtype(column):
    if column in float_cols:
        return choose_type(column, float_dtypes, float_finfo)
    
    if min_vals[column] < 0:
        return choose_type(column, int_dtypes, int_iinfo)
    
    return choose_type(column, uint_dtypes, uint_iinfo)

In [37]:
suitable_dtypes = [(col, get_suitable_dtype(col)) for col in max_vals]
suitable_dtypes = dict(suitable_dtypes)

In [40]:
chunk.select_dtypes(include=np.object)

8900000
8900001
8900002
8900003
8900004
...
8921478
8921479
8921480
8921481
8921482


In [39]:
import pprint

pprint.pprint(suitable_dtypes)

{'AVProductStatesIdentifier': <class 'numpy.float32'>,
 'AVProductsEnabled': <class 'numpy.float16'>,
 'AVProductsInstalled': <class 'numpy.float16'>,
 'AutoSampleOptIn': <class 'numpy.uint8'>,
 'Census_FirmwareManufacturerIdentifier': <class 'numpy.float16'>,
 'Census_FirmwareVersionIdentifier': <class 'numpy.float32'>,
 'Census_HasOpticalDiskDrive': <class 'numpy.uint8'>,
 'Census_InternalBatteryNumberOfCharges': <class 'numpy.float32'>,
 'Census_InternalPrimaryDiagonalDisplaySizeInInches': <class 'numpy.float16'>,
 'Census_InternalPrimaryDisplayResolutionHorizontal': <class 'numpy.float16'>,
 'Census_InternalPrimaryDisplayResolutionVertical': <class 'numpy.float16'>,
 'Census_IsAlwaysOnAlwaysConnectedCapable': <class 'numpy.float16'>,
 'Census_IsFlightingInternal': <class 'numpy.float16'>,
 'Census_IsFlightsDisabled': <class 'numpy.float16'>,
 'Census_IsPenCapable': <class 'numpy.uint8'>,
 'Census_IsPortableOperatingSystem': <class 'numpy.uint8'>,
 'Census_IsSecureBootEnabled': <cla

In [51]:
train_df = pd.read_csv('train.csv', chunksize=100000)

chunk = next(train_df)

object_cols = list(chunk.select_dtypes(include=np.object).dtypes.index)

suitable_dtypes.update({el:np.object for el in object_cols})

  interactivity=interactivity, compiler=compiler, result=result)


In [53]:
import pickle

with open('suitable_dtypes.pkl', 'wb') as f:
    pickle.dump(suitable_dtypes, f)