In [1]:
import pickle
import pprint

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Output file that receives the pickled column -> dtype mapping.
# (Alternate name used previously: 'suitable_dtypes.pkl'.)
dtypes_name = 'cleaned_dtypes.pkl'

# Input CSV that is scanned to derive the mapping.
# (Alternate input used previously: 'train.csv'.)
dataset_name = 'cleaned_train.csv'

In [3]:
# Passing `chunksize` makes read_csv return an iterator of DataFrames
# (100000 rows each) instead of loading the whole file at once.
train_df = pd.read_csv(
    dataset_name,
    chunksize=100000,
)

In [4]:
# Running per-column extremes across all chunks, plus the set of columns that
# were parsed as floating point in at least one chunk.
max_vals = {}
min_vals = {}
float_cols = set()

for chunk in train_df:
    # Only non-object columns are range-scanned; `chunk` is deliberately
    # rebound so that it holds the filtered frame after the loop as well.
    chunk = chunk.select_dtypes(exclude='object')

    for column in chunk.columns:
        series = chunk[column]

        # `np.float` was removed in NumPy 1.24 (deprecated since 1.20);
        # use the pandas dtype predicate instead.
        if pd.api.types.is_float_dtype(series.dtype):
            float_cols.add(column)

        max_value = series.max()
        min_value = series.min()

        # Series.max()/min() skip NaN, so a NaN result means this chunk has no
        # valid value for the column. Feeding NaN into the running extremes
        # would stick (every comparison with NaN is False), so skip it.
        if pd.isna(max_value) or pd.isna(min_value):
            continue

        if column not in max_vals or max_value > max_vals[column]:
            max_vals[column] = max_value

        if column not in min_vals or min_value < min_vals[column]:
            min_vals[column] = min_value

In [5]:
# Candidate dtypes for each numeric family, listed from narrowest to widest.
int_dtypes = [np.int8, np.int16, np.int32, np.int64]
uint_dtypes = [np.uint8, np.uint16, np.uint32, np.uint64]
float_dtypes = [np.float16, np.float32, np.float64]

# Matching machine-limit descriptors (they expose `.min` / `.max`), kept in
# the same order as the dtype lists above so the two can be zipped together.
int_iinfo = list(map(np.iinfo, int_dtypes))
uint_iinfo = list(map(np.iinfo, uint_dtypes))
float_finfo = list(map(np.finfo, float_dtypes))


def choose_type(column, dtypes, dtype_infos):
    """Return the first dtype whose [min, max] range contains the column's values.

    Parameters
    ----------
    column : hashable
        Key into the module-level ``max_vals`` / ``min_vals`` dictionaries.
    dtypes : sequence
        Candidate dtypes, expected to be ordered from narrowest to widest.
    dtype_infos : sequence
        Objects exposing ``.min`` and ``.max``, aligned with ``dtypes``.

    Returns
    -------
    The first entry of ``dtypes`` that can hold both recorded extremes. If an
    extreme is NaN, the last (widest) candidate is returned.

    Raises
    ------
    ValueError
        If the recorded range does not fit any candidate.
    """
    upper = max_vals[column]
    lower = min_vals[column]

    # A NaN bound makes every range comparison False, which previously caused
    # an implicit `return None`. Fall back to the widest candidate instead.
    if pd.isna(upper) or pd.isna(lower):
        return dtypes[-1]

    for dtype, dtype_info in zip(dtypes, dtype_infos):
        if lower >= dtype_info.min and upper <= dtype_info.max:
            return dtype

    # Fail loudly rather than putting None into the dtype mapping.
    raise ValueError(
        'No candidate dtype can hold column {!r} with range [{}, {}]'.format(
            column, lower, upper
        )
    )

        
def get_suitable_dtype(column):
    """Pick the candidate family for ``column`` and delegate to ``choose_type``.

    Columns recorded in ``float_cols`` use the float candidates. Other columns
    use signed integers when their recorded minimum is negative, and unsigned
    integers otherwise.

    NOTE(review): float candidates are selected by value range only; float16
    has limited precision, so confirm that is acceptable for the float columns.
    """
    if column in float_cols:
        family = (float_dtypes, float_finfo)
    elif min_vals[column] < 0:
        family = (int_dtypes, int_iinfo)
    else:
        family = (uint_dtypes, uint_iinfo)

    return choose_type(column, *family)

In [6]:
# Map every scanned column to the dtype selected for its recorded value range.
suitable_dtypes = {col: get_suitable_dtype(col) for col in max_vals}

In [40]:
# Show the object-typed columns of the current `chunk`.
# `np.object` was removed in NumPy 1.24; the 'object' string selects the same columns.
chunk.select_dtypes(include='object')

8900000
8900001
8900002
8900003
8900004
...
8921478
8921479
8921480
8921481
8921482


In [7]:
# Pretty-print the column -> dtype mapping for a visual check.
# (`pprint` is imported in the notebook's top import cell.)
pprint.pprint(suitable_dtypes)

{'AVProductStatesIdentifier': <class 'numpy.uint32'>,
 'AVProductsEnabled_0': <class 'numpy.uint8'>,
 'AVProductsEnabled_1': <class 'numpy.uint8'>,
 'AVProductsEnabled_2': <class 'numpy.uint8'>,
 'AVProductsEnabled_3': <class 'numpy.uint8'>,
 'AVProductsEnabled_4': <class 'numpy.uint8'>,
 'AVProductsInstalled_1': <class 'numpy.uint8'>,
 'AVProductsInstalled_2': <class 'numpy.uint8'>,
 'AVProductsInstalled_3': <class 'numpy.uint8'>,
 'AVProductsInstalled_4': <class 'numpy.uint8'>,
 'AVProductsInstalled_5': <class 'numpy.uint8'>,
 'AppVersion_0': <class 'numpy.uint8'>,
 'AppVersion_1': <class 'numpy.uint16'>,
 'AppVersion_2': <class 'numpy.uint16'>,
 'AutoSampleOptIn': <class 'numpy.uint8'>,
 'AvSigVersion_0': <class 'numpy.uint8'>,
 'AvSigVersion_1': <class 'numpy.uint16'>,
 'AvSigVersion_2': <class 'numpy.uint16'>,
 'AvSigVersion_3': <class 'numpy.uint8'>,
 'Census_ActivationChannel_OEM:DM': <class 'numpy.uint8'>,
 'Census_ActivationChannel_OEM:NONSLP': <class 'numpy.uint8'>,
 'Census_

In [8]:
# Re-open the CSV and take only the first chunk; it is used to find the
# columns that pandas parses as object.
train_df = pd.read_csv(dataset_name, chunksize=100000)

chunk = next(train_df)

object_cols = chunk.select_dtypes(include='object').columns.tolist()

# `np.object` was removed in NumPy 1.24. It was an alias of the builtin
# `object`, so the stored dtype value is unchanged.
suitable_dtypes.update({el: object for el in object_cols})

In [9]:
# Persist the column -> dtype mapping to `dtypes_name`.
# (`pickle` is imported in the notebook's top import cell.)
with open(dtypes_name, 'wb') as f:
    pickle.dump(suitable_dtypes, f)