In [1]:
from dask import dataframe as dd
from datetime import datetime

In [2]:
def parse_datetime(dt):
    """Turn a raw timestamp string into an integer millisecond offset.

    The last four characters of ``dt`` are discarded before parsing
    (presumably a ' UTC' suffix -- confirm against the raw file). The rest is
    parsed as ``YYYY-MM-DD HH:MM:SS.ffffff``; when that fails, it is parsed
    again without the fractional part.

    Args:
        dt: Timestamp string with a four-character suffix.

    Returns:
        int: Epoch milliseconds of the parsed value minus 1648806250315.

    Raises:
        ValueError: If the string matches neither format.

    NOTE(review): the parsed datetime is naive, so ``timestamp()`` interprets
    it in the machine's local timezone; the absolute offset therefore depends
    on where the notebook runs.
    """
    full_format = '%Y-%m-%d %H:%M:%S.%f'
    trimmed = dt[:-4]

    try:
        seconds = datetime.strptime(trimmed, full_format).timestamp()
    except ValueError:
        # Slicing off the trailing '.%f' leaves the whole-second format.
        seconds = datetime.strptime(trimmed, full_format[:-3]).timestamp()

    # Scale to milliseconds, truncate to int, then shift by the fixed origin.
    return int(seconds * 1000.0) - 1648806250315

In [3]:
def parse_timestamp(timestamp):
    """Convert a millisecond offset back into a datetime truncated to the hour.

    The offset is taken relative to the same origin that ``parse_datetime``
    subtracts (1648806250315 ms).

    Args:
        timestamp: Millisecond offset (int, or a number convertible with
            ``int``).

    Returns:
        datetime: Naive local-time datetime (``datetime.fromtimestamp``) with
        minute, second and microsecond set to zero.
    """
    origin_ms = 1648806250315

    # Add the origin at full millisecond precision *before* dropping to whole
    # seconds. Truncating first and then adding 1648806250 s loses the
    # origin's 315 ms, which puts events from the first 315 ms of an hour
    # into the previous hour.
    seconds = (int(timestamp) + origin_ms) // 1000

    # Truncate to the hour directly instead of formatting and re-parsing.
    # fold=0 matches what strptime would have produced.
    return datetime.fromtimestamp(seconds).replace(
        minute=0, second=0, microsecond=0, fold=0
    )

## Оптимизиране на оригиналната информация. Премахване на колоната user_id

In [4]:
# Load the raw canvas history; the converter turns every timestamp string into
# an integer millisecond offset while the CSV is parsed.
ddf_core = dd.read_csv('..\\data\\raw\\2022_place_canvas_history.csv', converters={"timestamp": parse_datetime})

ddf_core = ddf_core.drop('user_id', axis=1)
# astype returns a new frame, so the result has to be assigned for the
# downcast to take effect (previously it was computed and thrown away).
# NOTE(review): uint32 assumes every offset is non-negative and below 2**32 ms.
ddf_core = ddf_core.astype({'timestamp': 'uint32'})
ddf_core = ddf_core.sort_values('timestamp')

ddf_core.head()

Unnamed: 0,timestamp,pixel_color,coordinate
97731,0,#7EED56,4242
97732,12356,#00A368,999999
97733,16311,#3690EA,4442
97734,21388,#D4D7D9,22
97735,34094,#3690EA,2323


In [5]:
# Write the trimmed, sorted frame to one CSV file, leaving the index out.
ddf_core.to_csv(
    '..\\data\\raw\\2022_place_canvas_history_core.csv',
    single_file=True,
    index=False,
)

['f:\\Дипломна работа\\code\\data\\raw\\2022_place_canvas_history_core.csv']

## Експортиране на колоната user_id в отделен файл

In [None]:
# Only user_id is exported here, so read just that column instead of parsing
# the whole file and dropping timestamp, pixel_color and coordinate afterwards.
# NOTE(review): assumes those three plus user_id are all the columns in the raw file.
ddf_users = dd.read_csv('..\\data\\raw\\2022_place_canvas_history.csv', usecols=['user_id'])

ddf_users.head()

In [None]:
# Write the user_id frame to one CSV file, leaving the index out.
ddf_users.to_csv(
    '..\\data\\raw\\2022_place_canvas_history_users.csv',
    single_file=True,
    index=False,
)

## Преобразуване на timestamp към datetime и експортиране на datetime колоната в отделен файл

In [10]:
# Only the timestamp offsets are needed for the datetime export, so read just
# that column instead of loading pixel_color and coordinate and dropping them.
# NOTE(review): assumes the core file holds only timestamp, pixel_color and coordinate.
ddf_date = dd.read_csv('..\\data\\raw\\2022_place_canvas_history_core.csv', usecols=['timestamp'])

ddf_date.head()

Unnamed: 0,timestamp
0,0
1,12356
2,16311
3,21388
4,34094


In [13]:
# Relabel the offset column, then replace each millisecond offset with the
# value parse_timestamp returns for it.
ddf_date = ddf_date.rename(columns={'timestamp': 'datetime'})
ddf_date = ddf_date.assign(datetime=ddf_date['datetime'].map(parse_timestamp))

ddf_date.head()

Unnamed: 0,datetime
0,2022-04-01 12:00:00
1,2022-04-01 12:00:00
2,2022-04-01 12:00:00
3,2022-04-01 12:00:00
4,2022-04-01 12:00:00


In [14]:
# Write the datetime column to one CSV file, leaving the index out.
ddf_date.to_csv(
    '..\\data\\raw\\2022_place_canvas_history_datetime.csv',
    single_file=True,
    index=False,
)

['f:\\Дипломна работа\\code\\data\\raw\\2022_place_canvas_history_datetime.csv']

## Завъртане на pixel_color данните, оставяйки datetime като индекс и преброяване на pixel_color видовете за всеки час

In [102]:
# Coordinates play no part in the colour counts, so only the two columns that
# are used get parsed (instead of reading everything and dropping 'coordinate').
# NOTE(review): assumes the core file holds only timestamp, pixel_color and coordinate.
ddf_pivot = dd.read_csv(
    '..\\data\\raw\\2022_place_canvas_history_core.csv',
    usecols=['timestamp', 'pixel_color'],
)
ddf_pivot = ddf_pivot.rename(columns={'timestamp': 'datetime'})

# Replace each millisecond offset with the value parse_timestamp returns for it.
ddf_pivot['datetime'] = ddf_pivot['datetime'].map(parse_timestamp)
# NOTE(review): no later visible cell reads this 'index' column -- confirm it is still needed.
ddf_pivot['index'] = ddf_pivot.index

ddf_pivot.head()

Unnamed: 0,datetime,pixel_color
0,2022-04-01 12:00:00,14
1,2022-04-01 12:00:00,3
2,2022-04-01 12:00:00,7
3,2022-04-01 12:00:00,21
4,2022-04-01 12:00:00,7


In [40]:
# Cast pixel_color to a categorical and make its categories known
# (cat.as_known()) before the pivot in the following cell.
ddf_pivot = ddf_pivot.assign(
    pixel_color=ddf_pivot['pixel_color'].astype('category').cat.as_known()
)
ddf_pivot.dtypes

datetime       datetime64[ns]
pixel_color          category
index                   int64
dtype: object

In [41]:
# Rows are the distinct datetime values, columns are the colours, and each
# cell is the number of entries with that colour.
ddf_pivot = ddf_pivot.pivot_table(
    values='pixel_color',
    columns='pixel_color',
    index='datetime',
    aggfunc='count',
)
ddf_pivot.head()

pixel_color,#000000,#00A368,#2450A4,#3690EA,#51E9F4,#7EED56,#811E9F,#898D90,#9C6926,#B44AC0,...,#BE0039,#FF3881,#00CCC0,#515252,#6D001A,#94B3FF,#DE107F,#E4ABFF,#FFB470,#FFF8B8
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-04-01 12:00:00,0.0,2.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-01 13:00:00,117511.0,21775.0,83172.0,15056.0,24328.0,21971.0,19421.0,7081.0,6169.0,5815.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-01 14:00:00,198737.0,33085.0,115221.0,20169.0,38124.0,30806.0,26811.0,12597.0,9609.0,7658.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-01 15:00:00,242980.0,35452.0,98686.0,25440.0,42816.0,33479.0,27869.0,17086.0,12235.0,10745.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-04-01 16:00:00,246588.0,40658.0,99270.0,29857.0,49147.0,37166.0,32077.0,20787.0,11657.0,9305.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Cast every count column to int32 in a single frame-wide call instead of
# looping over the columns and reassigning them one by one.
ddf_pivot = ddf_pivot.astype('int32')

ddf_pivot.dtypes

pixel_color
#000000    int32
#00A368    int32
#2450A4    int32
#3690EA    int32
#51E9F4    int32
#7EED56    int32
#811E9F    int32
#898D90    int32
#9C6926    int32
#B44AC0    int32
#D4D7D9    int32
#FF4500    int32
#FF99AA    int32
#FFA800    int32
#FFD635    int32
#FFFFFF    int32
#00756F    int32
#009EAA    int32
#00CC78    int32
#493AC1    int32
#6A5CFF    int32
#6D482F    int32
#BE0039    int32
#FF3881    int32
#00CCC0    int32
#515252    int32
#6D001A    int32
#94B3FF    int32
#DE107F    int32
#E4ABFF    int32
#FFB470    int32
#FFF8B8    int32
dtype: object

In [54]:
# Write the pivoted counts to one CSV file; the index is kept in the output.
ddf_pivot.to_csv(
    '..\\data\\raw\\2022_place_canvas_history_pivoted.csv',
    single_file=True,
)

['f:\\Дипломна работа\\code\\data\\raw\\2022_place_canvas_history_pivoted.csv']