In [1]:
%%capture
!pip install dask_cuda dask-cudf-cu12

In [2]:
%%capture
!pip install polars[gpu] -U --extra-index-url=https://pypi.nvidia.com

In [3]:
import cudf
import cupy as cp
import pandas as pd
import numpy as np
import polars as pl

In [4]:
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, progress
import dask_cudf
import dask.dataframe as dd

# Домашнее задание 1

Так как синтаксис очень похож на numpy, то ничего сложного в cupy нет.
Придумайте какую-нибудь функцию только с использованием numpy и не в одну операцию, чтобы она несла какой-то смысл. Что-то аналогичное функции из занятия l2norm_dec:

    def l2norm_dec(x):
        return cp.sqrt(cp.sum(cp.power(x, 2), axis=1))
        
Можно взять норму Фробениуса для матрицы.

Запишите эту же функцию при помощи cupy

Третья функция - cupy + декоратор @cp.fuse()

Сравните время выполнения=)

In [5]:
def frobenius_norm_np(x):
    """Return the Frobenius norm of ``x`` computed with NumPy.

    The norm is the square root of the sum of the squared entries, taken over
    all elements of ``x`` (no axis is specified), so the result is a scalar.
    """
    squares = np.power(x, 2)
    sum_of_squares = np.sum(squares)
    return np.sqrt(sum_of_squares)

In [6]:
@cp.fuse()
def frobenius_norm_cp(x):
    """Return the Frobenius norm of ``x`` computed with CuPy.

    Same formula as the NumPy version (square, sum over all elements, square
    root); the ``cp.fuse`` decorator asks CuPy to fuse these operations.
    """
    squares = cp.power(x, 2)
    sum_of_squares = cp.sum(squares)
    return cp.sqrt(sum_of_squares)

In [7]:
# Hand-written reduction kernel for the Frobenius norm: every element is squared (map),
# the squares are summed starting from 0 (reduce), and the square root of the total
# is written to the output (post-reduction map).
frobenius_norm_kernel = cp.ReductionKernel(
    in_params='T x',
    out_params='T y',
    map_expr='x * x',
    reduce_expr='a + b',
    post_map_expr='y = sqrt(a)',
    identity='0',
    name='frobenius_norm',
)

In [8]:
# Host-side sample input: the values 0..9 as float32, laid out as a 2x5 matrix.
# NOTE(review): the next cell rebinds `x` to a CuPy array, so this NumPy array is not the
# object the timing cells below receive.
x = np.reshape(np.arange(10, dtype=np.float32), (2, 5))

In [9]:
# Device-side sample input: the values 0..9 as float32, laid out as a 2x5 matrix.
# This rebinds `x`, replacing the NumPy array created in the previous cell.
x = cp.reshape(cp.arange(10, dtype=np.float32), (2, 5))

In [92]:
%timeit frobenius_norm_np(x)

264 µs ± 6.59 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [93]:
%timeit frobenius_norm_cp(x)

119 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [94]:
%timeit frobenius_norm_kernel(x)

104 µs ± 838 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# Домашнее задание 2

Сравните среднюю вероятность смерти мужчин и женщин по возрастным группам на основе столбца death_ind. То же самое проделайте для вероятности госпитализации, преобразовав переменную hosp_yn так же, как мы сделали с переменной death_yn.

In [14]:
# Load 'covid.gzip' with cuDF's Parquet reader (the file is read as Parquet despite its extension).
gdf = cudf.read_parquet('covid.gzip')

In [15]:
# Keep only complete rows: a row is removed if any of its columns is null.
gdf = gdf.dropna(axis=0, how='any')

In [16]:
# Binary death indicator: 1 where death_yn is exactly 'Yes', 0 otherwise.
# A vectorized comparison replaces the per-row lambda passed to Series.apply, which cuDF has to
# JIT-compile as a string UDF; the int64 cast keeps the same integer result type.
gdf['death_ind'] = (gdf['death_yn'] == 'Yes').astype('int64')

In [17]:
# Binary hospitalization indicator: 1 where hosp_yn is exactly 'Yes', 0 otherwise.
# A vectorized comparison replaces the per-row lambda passed to Series.apply, which cuDF has to
# JIT-compile as a string UDF; the int64 cast keeps the same integer result type.
gdf['hosp_ind'] = (gdf['hosp_yn'] == 'Yes').astype('int64')

In [18]:
# Mean of each 0/1 indicator per (sex, age_group) pair, i.e. the observed share of 'Yes' rows.
# reset_index turns the two group keys back into ordinary columns.
result_death_ind = (
    gdf.groupby(['sex', 'age_group'])['death_ind']
    .mean()
    .reset_index()
)

result_hosp_ind = (
    gdf.groupby(['sex', 'age_group'])['hosp_ind']
    .mean()
    .reset_index()
)

In [19]:
# Split each per-group table by sex. Within one split the `sex` column is constant, so it is
# dropped, and `age_group` becomes the index that the later join aligns on.
male_death_df = (
    result_death_ind[result_death_ind['sex'] == 'Male']
    .drop(columns='sex')
    .set_index('age_group')
)
female_death_df = (
    result_death_ind[result_death_ind['sex'] == 'Female']
    .drop(columns='sex')
    .set_index('age_group')
)

male_hosp_df = (
    result_hosp_ind[result_hosp_ind['sex'] == 'Male']
    .drop(columns='sex')
    .set_index('age_group')
)
female_hosp_df = (
    result_hosp_ind[result_hosp_ind['sex'] == 'Female']
    .drop(columns='sex')
    .set_index('age_group')
)

In [20]:
# Give every value column a sex-specific name so the male and female tables can be joined
# side by side without a column-name clash.
male_death_df = male_death_df.rename(columns=dict(death_ind='male_death_pr'))
female_death_df = female_death_df.rename(columns=dict(death_ind='female_death_pr'))

male_hosp_df = male_hosp_df.rename(columns=dict(hosp_ind='male_hosp_pr'))
female_hosp_df = female_hosp_df.rename(columns=dict(hosp_ind='female_hosp_pr'))

In [21]:
# Outer-join the male and female tables on their shared age_group index, then order the
# rows by that index.
combined_death_df = male_death_df.join(female_death_df, how='outer')
combined_death_df = combined_death_df.sort_index()

combined_hosp_df = male_hosp_df.join(female_hosp_df, how='outer')
combined_hosp_df = combined_hosp_df.sort_index()

In [22]:
# Display the mean death indicator per age group, male and female side by side.
combined_death_df

Unnamed: 0_level_0,male_death_pr,female_death_pr
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 - 9 Years,0.000257,0.000223
10 - 19 Years,0.000191,0.000134
20 - 29 Years,0.000725,0.000345
30 - 39 Years,0.00229,0.001076
40 - 49 Years,0.006098,0.002635
50 - 59 Years,0.015651,0.007524
60 - 69 Years,0.044501,0.025573
70 - 79 Years,0.114141,0.076018
80+ Years,0.267811,0.199618
Unknown,0.000355,0.000394


In [23]:
# Display the mean hospitalization indicator per age group, male and female side by side.
combined_hosp_df

Unnamed: 0_level_0,male_hosp_pr,female_hosp_pr
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 - 9 Years,0.0159,0.014042
10 - 19 Years,0.008124,0.009225
20 - 29 Years,0.013744,0.019351
30 - 39 Years,0.031092,0.030775
40 - 49 Years,0.054479,0.039122
50 - 59 Years,0.085752,0.061315
60 - 69 Years,0.143717,0.108478
70 - 79 Years,0.240449,0.189835
80+ Years,0.322185,0.224683
Unknown,0.005113,0.004532


# Домашнее задание 3

Аналогично заданию 2, но теперь при помощи dask_cudf

In [83]:
# Open 'covid.gzip' with dask_cudf's Parquet reader (the file is read as Parquet despite its extension).
dask_gdf = dask_cudf.read_parquet('covid.gzip')

In [84]:
# Keep only complete rows: a row is removed if any of its columns is null.
dask_gdf = dask_gdf.dropna(how='any')

In [85]:
# Binary indicators: 1 where the source column is exactly 'Yes', 0 otherwise.
# The comparison is vectorized on the dask_cudf collection, so no per-row Python lambda has to
# be applied inside map_partitions; the int64 cast keeps an integer result type.
dask_gdf['death_ind'] = (dask_gdf['death_yn'] == 'Yes').astype('int64')
dask_gdf['hosp_ind'] = (dask_gdf['hosp_yn'] == 'Yes').astype('int64')

In [86]:
# Mean of each 0/1 indicator per (sex, age_group) pair, i.e. the observed share of 'Yes' rows.
# reset_index turns the two group keys back into ordinary columns.
result_death_ind = (
    dask_gdf.groupby(['sex', 'age_group'])['death_ind']
    .mean()
    .reset_index()
)

result_hosp_ind = (
    dask_gdf.groupby(['sex', 'age_group'])['hosp_ind']
    .mean()
    .reset_index()
)

In [87]:
# Split each per-group table by sex. Within one split the `sex` column is constant, so it is
# dropped, and `age_group` becomes the index that the later join aligns on.
male_death_df = (
    result_death_ind[result_death_ind['sex'] == 'Male']
    .drop(columns='sex')
    .set_index('age_group')
)
female_death_df = (
    result_death_ind[result_death_ind['sex'] == 'Female']
    .drop(columns='sex')
    .set_index('age_group')
)

male_hosp_df = (
    result_hosp_ind[result_hosp_ind['sex'] == 'Male']
    .drop(columns='sex')
    .set_index('age_group')
)
female_hosp_df = (
    result_hosp_ind[result_hosp_ind['sex'] == 'Female']
    .drop(columns='sex')
    .set_index('age_group')
)

In [88]:
# Give every value column a sex-specific name so the male and female tables can be joined
# side by side without a column-name clash.
male_death_df = male_death_df.rename(columns=dict(death_ind='male_death_pr'))
female_death_df = female_death_df.rename(columns=dict(death_ind='female_death_pr'))

male_hosp_df = male_hosp_df.rename(columns=dict(hosp_ind='male_hosp_pr'))
female_hosp_df = female_hosp_df.rename(columns=dict(hosp_ind='female_hosp_pr'))

In [89]:
# Outer-join the male and female tables on their shared age_group index, then order the
# rows by age_group.
combined_death_df = male_death_df.join(female_death_df, how='outer')
combined_death_df = combined_death_df.sort_values(by='age_group')

combined_hosp_df = male_hosp_df.join(female_hosp_df, how='outer')
combined_hosp_df = combined_hosp_df.sort_values(by='age_group')

In [90]:
# .compute() materializes the lazy dask collection so the death-rate table can be displayed.
combined_death_df.compute()

Unnamed: 0_level_0,male_death_pr,female_death_pr
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 - 9 Years,0.000257,0.000223
10 - 19 Years,0.000191,0.000134
20 - 29 Years,0.000725,0.000345
30 - 39 Years,0.00229,0.001076
40 - 49 Years,0.006098,0.002635
50 - 59 Years,0.015651,0.007524
60 - 69 Years,0.044501,0.025573
70 - 79 Years,0.114141,0.076018
80+ Years,0.267811,0.199618
Unknown,0.000355,0.000394


In [91]:
# .compute() materializes the lazy dask collection so the hospitalization-rate table can be displayed.
combined_hosp_df.compute()

Unnamed: 0_level_0,male_hosp_pr,female_hosp_pr
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0 - 9 Years,0.0159,0.014042
10 - 19 Years,0.008124,0.009225
20 - 29 Years,0.013744,0.019351
30 - 39 Years,0.031092,0.030775
40 - 49 Years,0.054479,0.039122
50 - 59 Years,0.085752,0.061315
60 - 69 Years,0.143717,0.108478
70 - 79 Years,0.240449,0.189835
80+ Years,0.322185,0.224683
Unknown,0.005113,0.004532


# Домашнее задание 4

Аналогично заданию 2, но теперь Polars c GPU

In [34]:
# Lazily scan 'covid.gzip' as Parquet; this only builds a query, execution happens at .collect().
df_pl = pl.scan_parquet('covid.gzip')

In [35]:
# Engine settings passed to .collect(): run on GPU device 0, and raise an error instead of
# falling back to the CPU engine when a query cannot be executed on the GPU.
gpu_engine = pl.GPUEngine(device=0, raise_on_fail=True)

In [36]:
# Keep only complete rows: with no column subset given, a null in any column removes the row.
df_pl = df_pl.drop_nulls(subset=None)

In [37]:
# Binary death indicator: 1 where death_yn is exactly 'Yes', 0 otherwise.
# The keyword name gives the new column its name.
df_pl = df_pl.with_columns(
    death_ind=pl.when(pl.col('death_yn') == 'Yes').then(1).otherwise(0)
)

In [38]:
# Binary hospitalization indicator: 1 where hosp_yn is exactly 'Yes', 0 otherwise.
# The keyword name gives the new column its name.
df_pl = df_pl.with_columns(
    hosp_ind=pl.when(pl.col('hosp_yn') == 'Yes').then(1).otherwise(0)
)

In [39]:
# Lazy query: mean of death_ind per age group over the rows where sex is 'Male'.
male_death_pr = (
    df_pl
    .filter(pl.col('sex') == 'Male')
    .group_by('age_group')
    .agg(male_death_pr=pl.col('death_ind').mean())
)

In [40]:
# Lazy query: mean of death_ind per age group over the rows where sex is 'Female'.
female_death_pr = (
    df_pl
    .filter(pl.col('sex') == 'Female')
    .group_by('age_group')
    .agg(female_death_pr=pl.col('death_ind').mean())
)

In [41]:
# Full join of the male and female results on age_group, so a group present on only one
# side is still kept.
death_pr = male_death_pr.join(
    female_death_pr,
    how='full',
    on='age_group',
)

In [42]:
# A full join leaves `age_group` null on rows whose key exists only on the right-hand side.
# Fill it from `age_group_right` before that column is dropped, so such a row keeps its
# age-group label; then order the rows by age group.
death_pr = (
    death_pr
    .with_columns(pl.col('age_group').fill_null(pl.col('age_group_right')))
    .drop('age_group_right')
    .sort(by='age_group', descending=False)
)

In [43]:
# Execute the lazy death-rate query with the GPU engine settings and display the result.
death_pr.collect(engine=gpu_engine)

age_group,male_death_pr,female_death_pr
str,f64,f64
"""0 - 9 Years""",0.000257,0.000223
"""10 - 19 Years""",0.000191,0.000134
"""20 - 29 Years""",0.000725,0.000345
"""30 - 39 Years""",0.00229,0.001076
"""40 - 49 Years""",0.006098,0.002635
"""50 - 59 Years""",0.015651,0.007524
"""60 - 69 Years""",0.044501,0.025573
"""70 - 79 Years""",0.114141,0.076018
"""80+ Years""",0.267811,0.199618
"""Unknown""",0.000355,0.000394


In [44]:
# Lazy query: mean of hosp_ind per age group over the rows where sex is 'Male'.
male_hosp_pr = (
    df_pl
    .filter(pl.col('sex') == 'Male')
    .group_by('age_group')
    .agg(male_hosp_pr=pl.col('hosp_ind').mean())
)

In [45]:
# Lazy query: mean of hosp_ind per age group over the rows where sex is 'Female'.
female_hosp_pr = (
    df_pl
    .filter(pl.col('sex') == 'Female')
    .group_by('age_group')
    .agg(female_hosp_pr=pl.col('hosp_ind').mean())
)

In [46]:
# Full join of the male and female results on age_group, so a group present on only one
# side is still kept.
hosp_pr = male_hosp_pr.join(
    female_hosp_pr,
    how='full',
    on='age_group',
)

In [47]:
# A full join leaves `age_group` null on rows whose key exists only on the right-hand side.
# Fill it from `age_group_right` before that column is dropped, so such a row keeps its
# age-group label; then order the rows by age group.
hosp_pr = (
    hosp_pr
    .with_columns(pl.col('age_group').fill_null(pl.col('age_group_right')))
    .drop('age_group_right')
    .sort(by='age_group', descending=False)
)

In [48]:
# Execute the lazy hospitalization-rate query with the GPU engine settings and display the result.
hosp_pr.collect(engine=gpu_engine)

age_group,male_hosp_pr,female_hosp_pr
str,f64,f64
"""0 - 9 Years""",0.0159,0.014042
"""10 - 19 Years""",0.008124,0.009225
"""20 - 29 Years""",0.013744,0.019351
"""30 - 39 Years""",0.031092,0.030775
"""40 - 49 Years""",0.054479,0.039122
"""50 - 59 Years""",0.085752,0.061315
"""60 - 69 Years""",0.143717,0.108478
"""70 - 79 Years""",0.240449,0.189835
"""80+ Years""",0.322185,0.224683
"""Unknown""",0.005113,0.004532
