In [1]:
import numpy as np
import pandas as pd
pd.__version__

'2.2.2'

In [153]:

url = '../../assets/AmesHousing.csv'
df = pd.read_csv(url) #, engine='pyarrow', dtype_backend='pyarrow')

### Shrinking Numbers

Goals:

* Create a function, `shrink_ints`, to automatically convert suitable integer columns to smaller integer types (`uint8`, `uint16`, `uint32`) based on their range of values.
* Apply the `shrink_ints` function to the DataFrame to reduce memory usage while maintaining data integrity.
* Create a function, `clean_housing`, that combines the data cleaning steps for string columns, clipping values in the "Garage Yr Blt" column, and shrinking integer columns.


In [154]:
for size in [np.uint8, np.uint16, np.uint32]:
    print(np.iinfo(size))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for uint16
---------------------------------------------------------------
min = 0
max = 65535
---------------------------------------------------------------

Machine parameters for uint32
---------------------------------------------------------------
min = 0
max = 4294967295
---------------------------------------------------------------



In [187]:
for size in [np.float16, np.float32, np.float64]:
    print(np.finfo(size))

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
smallest_normal = 6.10352e-05   smallest_subnormal = 5.96046e-08
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
smallest_normal = 1.1754944e-38   smallest_subnormal = 1.4012985e-45
---------------------------------------------------------------

Machine parameters for float64
---

In [178]:
np.finfo(np.float64)

finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)

In [188]:

(
    df
    .select_dtypes(include=[np.float_])
    .memory_usage(deep=True)
    .sum()
)

257972

In [156]:
(
    df
    .select_dtypes(include=[np.int_])
    .memory_usage(deep=True)
    .sum()
)

656452

In [157]:
(
    df
    .select_dtypes(include=[np.int_])
    .describe()
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Order,2930.0,1465.5,845.9625,1.0,733.25,1465.5,2197.75,2930.0
PID,2930.0,714464500.0,188730800.0,526301100.0,528477000.0,535453620.0,907181100.0,1007100000.0
MS SubClass,2930.0,57.38737,42.63802,20.0,20.0,50.0,70.0,190.0
Lot Area,2930.0,10147.92,7880.018,1300.0,7440.25,9436.5,11555.25,215245.0
Overall Qual,2930.0,6.094881,1.411026,1.0,5.0,6.0,7.0,10.0
Overall Cond,2930.0,5.56314,1.111537,1.0,5.0,5.0,6.0,9.0
Year Built,2930.0,1971.356,30.24536,1872.0,1954.0,1973.0,2001.0,2010.0
Year Remod/Add,2930.0,1984.267,20.86029,1950.0,1965.0,1993.0,2004.0,2010.0
1st Flr SF,2930.0,1159.558,391.8909,334.0,876.25,1084.0,1384.0,5095.0
2nd Flr SF,2930.0,335.456,428.3957,0.0,0.0,0.0,703.75,2065.0


In [158]:
(
    df
    .select_dtypes(include=[np.float_])
    .describe()
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Lot Frontage,2440.0,69.22459,23.365335,21.0,58.0,68.0,80.0,313.0
Mas Vnr Area,2907.0,101.896801,179.112611,0.0,0.0,0.0,164.0,1600.0
BsmtFin SF 1,2929.0,442.629566,455.590839,0.0,0.0,370.0,734.0,5644.0
BsmtFin SF 2,2929.0,49.722431,169.168476,0.0,0.0,0.0,0.0,1526.0
Bsmt Unf SF,2929.0,559.262547,439.494153,0.0,219.0,466.0,802.0,2336.0
Total Bsmt SF,2929.0,1051.614544,440.615067,0.0,793.0,990.0,1302.0,6110.0
Bsmt Full Bath,2928.0,0.431352,0.52482,0.0,0.0,0.0,1.0,3.0
Bsmt Half Bath,2928.0,0.061134,0.245254,0.0,0.0,0.0,0.0,2.0
Garage Yr Blt,2771.0,1978.132443,25.528411,1895.0,1960.0,1979.0,2002.0,2207.0
Garage Cars,2929.0,1.766815,0.760566,0.0,1.0,2.0,2.0,5.0


In [169]:
def shrink_float(column: pd.Series):
    mapping = column.dtypes.name
    contains_nan = column.isna().any()
    max = column.max()
    min = column.min()
    if contains_nan:
        # fill na
        column = column.fillna(column.mean(), inplace=False).astype(np.float64)
        # return column.astype(mapping)
    if min >= np.finfo(np.float16).min and max <= np.finfo(np.float16).max:
        mapping = np.float16
    elif min >= np.finfo(np.float32).min and max <= np.finfo(np.float32).max:
        mapping = np.float32
    elif min >= np.finfo(np.float64).min and max <= np.finfo(np.float64).max:
        mapping = np.float64
    return column.astype(mapping)

In [170]:
def shrink_int(colum: pd.Series):
    mapping = colum.dtypes.name
    max_ = colum.max()
    min_ = colum.min()
    if min_ >= np.iinfo(np.uint8).min and max_ <= np.iinfo(np.uint8).max:
        mapping = np.uint8
    elif min_ >= np.iinfo(np.int16).min and max_ <= np.iinfo(np.int16).max:
        mapping = np.int16
    elif min_ >= np.iinfo(np.int32).min and max_ <= np.iinfo(np.int32).max:
        mapping = np.int32
    elif min_ >= np.iinfo(np.int64).min and max_ <= np.iinfo(np.int64).max:
        mapping = np.int64
    return colum.astype(mapping)


In [161]:
# df.select_dtypes(include=[np.int_]).apply(lambda col: shrink_int(col)).dtypes

In [162]:
(
    df
    .select_dtypes(include=[np.int_])
    .apply(lambda col: shrink_int(col))
    # .pipe(shrink_int)
    .memory_usage(deep=True)
    .sum()
)

152492

In [174]:
(152492/656452)*100

23.229725859621116

In [189]:
(
    df
    .select_dtypes(include=[np.float_])
    .apply(lambda col: shrink_float(col))
    # .pipe(shrink_int)
    .memory_usage(deep=True)
    .sum()
)

64592

In [190]:
(64592/257972)*100

25.03837625788845