In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/스타2/train.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/데이콘/스타2/test.csv')

In [5]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67091776 entries, 0 to 67091775
Data columns (total 7 columns):
game_id           int64
winner            int64
time              float64
player            int64
species           object
event             object
event_contents    object
dtypes: float64(1), int64(3), object(3)
memory usage: 3.5+ GB
None


In [0]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            #print("******************************")
            #print("Column: ",col)
            #print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            #print("dtype after: ",props[col].dtype)
            #print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [7]:
train, NAlist = reduce_mem_usage(train)
print("_________________")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)

Memory usage of properties dataframe is : 3583.0875244140625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  2367.3971557617188  MB
This is  66.07142972732312 % of the initial size
_________________
_________________

[]


In [8]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67091776 entries, 0 to 67091775
Data columns (total 7 columns):
game_id           float32
winner            float32
time              uint8
player            float32
species           object
event             object
event_contents    object
dtypes: float32(3), object(3), uint8(1)
memory usage: 2.3+ GB
None


In [0]:
# 저장

#train.to_pickle('train.pkl')
#train.to_csv('train.csv', index=False)

In [0]:
# Load data with Pickle

#%%time
#train = pd.read_pickle('train.pkl')

In [9]:
print("event unique values : {}".format(train.event.unique()))
print("species unique values : {}".format(train.species.unique()))

event unique values : ['Camera' 'Selection' 'Ability' 'Right Click' 'SetControlGroup'
 'GetControlGroup' 'AddToControlGroup' 'ControlGroup']
species unique values : ['T' 'P' 'Z']


In [10]:
event_dict = {i:j for (i,j) in zip(train.event.unique(), range(train.event.nunique()))}
print(event_dict)

{'Camera': 0, 'Selection': 1, 'Ability': 2, 'Right Click': 3, 'SetControlGroup': 4, 'GetControlGroup': 5, 'AddToControlGroup': 6, 'ControlGroup': 7}


In [0]:
train['event'] = train['event'].map(event_dict)

In [12]:
species_dict = {i:j for (i,j) in zip(train.species.unique(), range(train.species.nunique()))}
print(species_dict)

{'T': 0, 'P': 1, 'Z': 2}


In [0]:
train['species'] = train['species'].map(species_dict)

In [14]:
train, NAlist = reduce_mem_usage(train)
print("-"*30)
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("-"*30)
print("")
print(NAlist)

Memory usage of properties dataframe is : 2367.3971557617188  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  2047.4786376953125  MB
This is  86.48648718328499 % of the initial size
------------------------------
------------------------------

[]


In [15]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67091776 entries, 0 to 67091775
Data columns (total 7 columns):
game_id           float32
winner            float32
time              float32
player            float32
species           float32
event             float32
event_contents    object
dtypes: float32(6), object(1)
memory usage: 2.0+ GB
None


In [17]:
%%time
train.to_pickle('/content/drive/My Drive/Colab Notebooks/data/데이콘/스타2/train.pkl')
#train.to_csv('train.csv', index=False)

CPU times: user 20.9 s, sys: 4.05 s, total: 24.9 s
Wall time: 1min 23s


#Change test dataset

In [0]:
del train

In [19]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28714849 entries, 0 to 28714848
Data columns (total 6 columns):
game_id           int64
time              float64
player            int64
species           object
event             object
event_contents    object
dtypes: float64(1), int64(2), object(3)
memory usage: 1.3+ GB
None


In [0]:
test['event'] = test['event'].map(event_dict)
test['species'] = test['species'].map(species_dict)

In [21]:
test, NAlist = reduce_mem_usage(test)
print("-"*30)
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("-"*30)
print("")
print(NAlist)

Memory usage of properties dataframe is : 1314.4615936279297  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  684.6154718399048  MB
This is  52.083337783217985 % of the initial size
------------------------------
------------------------------

[]


In [22]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28714849 entries, 0 to 28714848
Data columns (total 6 columns):
game_id           float32
time              uint8
player            float32
species           float32
event             float32
event_contents    object
dtypes: float32(4), object(1), uint8(1)
memory usage: 684.6+ MB
None


In [23]:
%%time
test.to_pickle('/content/drive/My Drive/Colab Notebooks/data/데이콘/스타2/test.pkl')
#train.to_csv('train.csv', index=False)

CPU times: user 9.06 s, sys: 2.07 s, total: 11.1 s
Wall time: 17.6 s
