# Здесь будут разные полезности <hr>

In [21]:
import pandas as pd
import numpy as np

<hr>

### Column names preparation

In [2]:
import re

In [28]:
# CamelCase column names used to build the demo DataFrame.
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 
           'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

def create_df(number_of_rows, columns):
    """Build a demo DataFrame filled with random 'A'/'B' labels.

    Each name in ``columns`` becomes one column holding ``number_of_rows``
    values. A value is 'A' when its uniform draw from ``np.random.rand`` is
    below 0.5 and 'B' otherwise. Columns are generated in the order given.
    """
    return pd.DataFrame({
        name: np.where(np.random.rand(number_of_rows) < 0.5, 'A', 'B')
        for name in columns
    })

# 50-row demo frame with one random 'A'/'B' column per entry in `columns`.
df = create_df(50, columns)

In [14]:
# Sample input: an upper-case run, a space, then a capitalised word.
example = 'DDD Pregnancies'

def transform_colnames(colname):
    """Convert a CamelCase / mixed-case column name to snake_case.

    The name is split into tokens, which are joined with underscores:

    * a capitalised or lower-case word is lower-cased
      (``'BloodPressure'`` -> ``'blood_pressure'``);
    * an all-capital run is kept as written (``'BMI'`` -> ``'BMI'``), and
      its last capital is left for the following word when that word starts
      there (``'HTMLParser'`` -> ``'HTML_parser'``);
    * a lone capital is kept (``'PlanA'`` -> ``'plan_A'``);
    * digits stay attached to the token they follow
      (``'Feature1'`` -> ``'feature1'``).

    Any character that is not an ASCII letter or digit (spaces, underscores,
    punctuation, ...) only acts as a separator and is not copied to the
    result.

    Parameters
    ----------
    colname : str
        Original column name.

    Returns
    -------
    str
        The snake_case name; an empty string when ``colname`` contains no
        ASCII letters or digits.
    """
    token_pattern = re.compile(
        r'[A-Z][a-z]+\d*'        # capitalised word: "Blood", "Top10"
        r'|[a-z]+\d*'            # lower-case word: "age", "x2"
        r'|[A-Z]+(?![a-z])\d*'   # acronym or lone capital: "BMI", "A", "MP3"
        r'|\d+'                  # digits with no preceding letters
    )
    tokens = token_pattern.findall(colname)
    # Upper-case tokens (acronyms) are preserved; everything else is lowered.
    return '_'.join(
        token if token.isupper() else token.lower() for token in tokens
    )


# Run the helper on the sample name defined above.
transform_colnames(example)

'DDD_pregnancies'

### Timer

In [3]:
import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    The duration is measured with ``time.perf_counter`` and printed as
    ``[<name>] done in <seconds> s`` once the block finishes. If the block
    raises, the exception propagates and nothing is printed.
    """
    started_at = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started_at
    print(f"[{name}] done in {elapsed:2f} s")

In [4]:
def fibo(n):
    """Return the n-th Fibonacci number (fibo(0) == 0, fibo(1) == 1).

    Uses plain double recursion with no memoisation, so each call above 1
    spawns two further calls.
    """
    if n == 0:
        return 0
    if n == 1:
        return 1
    return fibo(n - 1) + fibo(n - 2)

In [5]:
# Time the naive recursive Fibonacci for n = 35.
with timer('Fibonacci'):
    fibo(35)
    

[Fibonacci] done in 2.944506 s


 <hr>

### 'Stringfield' information

In [6]:
from ast import literal_eval

# A Python list stored as text.
stringfield_info = "[1, 2, 'txt', [4, 5]]"

# literal_eval parses the text as a Python literal and returns the object.
result = literal_eval(stringfield_info)

Use <b>literal_eval</b> to parse DataFrame columns whose values are Python literals stored as strings:

In [7]:
# for column in df_columns:
#     df[column] = df[column].apply(literal_eval)

<hr>

### Remove "$" char 

In [8]:
def create_df(number_of_rows):
    """Build a demo DataFrame with '$'-suffixed price strings.

    Columns:
      * ``price``   - a random integer drawn as ``int(100 * rand)``,
        rendered as text with a trailing '$' (for example ``'54$'``);
      * ``feature`` - all zeros.
    """
    whole_prices = (np.random.rand(number_of_rows) * 100).astype(int)
    df = pd.DataFrame({
        'price': whole_prices,
        'feature': np.zeros(number_of_rows),
    })
    # Replace the numeric column with its text form plus the currency sign.
    return df.assign(price=df['price'].astype(str) + '$')

# Small 5-row sample so the result can be inspected by eye.
data = create_df(5)
data

Unnamed: 0,price,feature
0,1$,0.0
1,50$,0.0
2,58$,0.0
3,84$,0.0
4,93$,0.0


In [9]:
# Method 1: remove every '$' with str.replace, then cast to int.
with timer('method_1'):
    data['corr_price_1'] = data['price'].apply(lambda x: x.replace('$', '')).astype('int')

[method_1] done in 0.001362 s


In [10]:
# Method 2: drop the last character by slicing, then cast to int.
with timer('method_2'):
    data['corr_price_2'] = data['price'].apply(lambda x: x[:-1]).astype('int')

[method_2] done in 0.001601 s


In [11]:
# Show the frame with both cleaned price columns.
data

Unnamed: 0,price,feature,corr_price_1,corr_price_2
0,1$,0.0,1,1
1,50$,0.0,50,50
2,58$,0.0,58,58
3,84$,0.0,84,84
4,93$,0.0,93,93


<hr>

### Binarize data

In [12]:
def create_df(number_of_rows):
    """Build a demo DataFrame with a random two-level categorical column.

    Columns:
      * ``type``    - 'A' where a uniform draw from ``np.random.rand`` is
        below 0.5, otherwise 'B';
      * ``feature`` - all zeros.
    """
    draws = np.random.rand(number_of_rows)
    labels = np.where(draws < 0.5, 'A', 'B')
    return pd.DataFrame({
        'type': labels,
        'feature': np.zeros(number_of_rows),
    })

# Build a 10,000,000-row frame for the timing cells that follow.
bin_data = create_df(10000000)
bin_data.head()

Unnamed: 0,type,feature
0,B,0.0
1,A,0.0
2,B,0.0
3,A,0.0
4,A,0.0


In [13]:
# Method 1: boolean comparison cast to int (True -> 1, False -> 0).
with timer('method_1'):
    bin_data['bin_type_1'] = (bin_data['type'] == 'A').astype('int')

[method_1] done in 0.398258 s


In [14]:
# Method 2: np.where picks 1 where the type is 'A' and 0 elsewhere.
with timer('method_2'):
    bin_data['bin_type_2'] = np.where(bin_data['type'] == 'A', 1, 0)

[method_2] done in 0.395373 s


In [15]:
# Method 3: one-hot encode with get_dummies and keep the 'A' indicator column.
with timer('method_3'):
    bin_data['bin_type_3'] = pd.get_dummies(bin_data['type'])['A']

[method_3] done in 0.452171 s


In [16]:
# Preview the three binarised columns side by side.
bin_data.head()

Unnamed: 0,type,feature,bin_type_1,bin_type_2,bin_type_3
0,B,0.0,0,0,0
1,A,0.0,1,1,1
2,B,0.0,0,0,0
3,A,0.0,1,1,1
4,A,0.0,1,1,1


<hr>

### Split string-data

In [17]:
def create_df(number_of_rows):
    """Build a demo DataFrame with one 'A/B' column of slash-joined integers.

    Two independent integer series are drawn as ``int(100 * rand)``; each
    row of the single ``'A/B'`` column is their text forms joined by '/',
    for example ``'54/42'``.
    """
    # Draw order matters for reproducibility: left-hand values first.
    left = (100 * np.random.rand(number_of_rows)).astype('int')
    right = (100 * np.random.rand(number_of_rows)).astype('int')
    joined = pd.Series(left).astype('str') + '/' + pd.Series(right).astype('str')
    return pd.DataFrame({'A/B': joined})

# Build a 10,000,000-row frame of 'A/B' strings to split.
split_data = create_df(10000000)
split_data.head()

Unnamed: 0,A/B
0,33/30
1,14/39
2,18/83
3,85/57
4,87/94


In [18]:
# Split each 'A/B' string on '/'; every row becomes a list of its parts.
tmp = split_data['A/B'].str.split('/')
tmp

0          [33, 30]
1          [14, 39]
2          [18, 83]
3          [85, 57]
4          [87, 94]
             ...   
9999995    [23, 33]
9999996    [78, 54]
9999997     [4, 35]
9999998    [49, 52]
9999999     [68, 4]
Name: A/B, Length: 10000000, dtype: object

<hr>

## Info 

In [34]:
def get_info(df):
    """Summarise every column of ``df`` in a small report frame.

    Before counting, any cell whose text form is exactly ``'nan'`` (ignoring
    case and surrounding whitespace) is treated as missing, so placeholder
    strings such as ``'NaN'`` are counted together with real NaN values.
    Values that merely contain those letters (``'banana'``, ``'Nancy'``)
    are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``col_name``,
        ``dtype``, ``missing_values``, ``unique_vals`` (distinct non-missing
        values) and ``total_vals`` (number of rows in ``df``).
    """
    def _nan_text_to_missing(value):
        # Exact match only: a substring test would also wipe real values.
        return np.nan if str(value).strip().lower() == 'nan' else value

    # Per-column Series.map instead of the deprecated DataFrame.applymap.
    edited_df = df.apply(lambda column: column.map(_nan_text_to_missing))

    info_df = pd.DataFrame()
    info_df['col_name'] = edited_df.columns.to_list()
    info_df['dtype'] = edited_df.dtypes.to_list()
    info_df['missing_values'] = edited_df.isna().sum().to_list()
    info_df['unique_vals'] = edited_df.nunique().to_list()
    info_df['total_vals'] = edited_df.shape[0]
    return info_df

### Download zip-file from yandex-disk

In [1]:
import requests
from urllib.parse import urlencode
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO

def get_zip_yandex_disk(public_key):
    """Download a publicly shared Yandex Disk zip archive into memory.

    The function asks the ``cloud-api.yandex.net`` public-resources download
    endpoint for a download link (the ``href`` field of its JSON reply),
    fetches that link, and opens the payload as a zip archive backed by an
    in-memory buffer. The names of the archive members are printed.

    Parameters
    ----------
    public_key : str
        Value sent as the ``public_key`` query parameter of the API call.

    Returns
    -------
    zipfile.ZipFile
        The opened archive; the caller is responsible for closing it.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
    url = base_url + urlencode(dict(public_key=public_key))
    response = requests.get(url)
    # Fail with the HTTP status instead of a KeyError on the missing 'href'.
    response.raise_for_status()
    download_link = response.json()['href']
    # Read the payload inside `with` so the HTTP response is always closed.
    with urlopen(download_link) as download_response:
        archive_bytes = download_response.read()
    zip_file = ZipFile(BytesIO(archive_bytes))
    file_names = zip_file.namelist()
    print(f'zip file contains {file_names}')
    return zip_file