In [1]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def read_csv_pgbar(csv_path, index_col=None, chunksize=1024, read_first=None, usecols=None):
    """Read a tab-separated file chunk by chunk while showing a tqdm progress bar.

    Parameters
    ----------
    csv_path : str
        Path of the tab-separated file. The first line is counted as a header
        when sizing the progress bar.
    index_col, usecols
        Forwarded unchanged to ``pandas.read_csv``.
    chunksize : int
        Number of rows per chunk handed back by ``pandas.read_csv``.
    read_first : int or None
        Maximum number of chunks to read. ``None`` (the default) reads the
        whole file.

    Returns
    -------
    pandas.DataFrame
        All chunks that were read, concatenated along the row axis.
    """
    # Count the data rows (every line except the header) for the bar total.
    # The context manager closes the handle instead of leaking it.
    with open(csv_path, 'r') as line_source:
        rows = max(sum(1 for _ in line_source) - 1, 0)

    chunk_list = []
    with tqdm(total=rows, desc=f'Reading {csv_path}') as pbar:
        reader = pd.read_csv(csv_path, index_col=index_col, chunksize=chunksize, sep='\t', usecols=usecols)
        for n_read, chunk in enumerate(reader, start=1):
            chunk_list.append(chunk)
            pbar.update(len(chunk))
            # Stop once exactly `read_first` chunks have been collected.
            if read_first is not None and n_read >= read_first:
                break
    return pd.concat(chunk_list, axis=0)

Create client_feat.csv

In [3]:
# Load the train/test feature table from its tab-separated export and
# re-save it as comma-separated text without the positional index.
feat_traintest = read_csv_pgbar(csv_path='FINAL_FEATURES_TRAINTEST.tsv')
feat_traintest.to_csv(path_or_buf='client_feat.csv', index=False)

Reading FINAL_FEATURES_TRAINTEST.tsv: 100%|██████████| 159288/159288 [00:14<00:00, 11185.94it/s]


Create full_seq.csv

In [None]:
# Both tables are indexed by their first column.
targets_traintest = read_csv_pgbar('FINAL_TARGETS_DATES_TRAINTEST.tsv', index_col=0)
all_seq_traintest = read_csv_pgbar('FINAL_ALL_SEQUENCES_TRAINTEST.tsv', index_col=0)

# Turn each stored SEQUENCE string into one space-separated string of its items.
# Building the column in a single pass avoids a per-row `.loc` read/write.
# NOTE(review): eval() executes whatever expression the file contains. If
# SEQUENCE only ever holds list literals, ast.literal_eval would be the safe
# replacement -- confirm the file format before switching.
all_seq_traintest['seq_list'] = [
    ' '.join(eval(raw_sequence))
    for raw_sequence in tqdm(all_seq_traintest['SEQUENCE'])
]

# Left join keeps every target row; rows without a sequence get a missing seq_list.
seq_target = pd.merge(targets_traintest, all_seq_traintest, left_index=True, right_index=True, how='left')

# Prefix every sequence with the start token; rows with no sequence get the
# token alone. Missing values are detected with pd.isna rather than by
# comparing their string form to 'nan'.
seq_target['seq'] = [
    'start_state' if pd.isna(joined) else 'start_state ' + joined
    for joined in tqdm(seq_target['seq_list'])
]

# reset_index() turns the index back into a column so it can be selected by
# name (presumably CLIENT_ID is the index name read from the TSV -- confirm).
seq_target.reset_index()[['CLIENT_ID', 'seq']].to_csv('full_seq.csv', index=False)

Create target.csv

In [None]:
# Write the client id / target pairs to target.csv without the positional index.
(
    targets_traintest
    .reset_index()
    .loc[:, ['CLIENT_ID', 'TARGET']]
    .to_csv('target.csv', index=False)
)

Create av_friends.csv

In [None]:
def _write_client_totals(out_file, client_id, feature_sums, n_friends):
    """Write one output row: client id, integer-cast feature sums, friend count."""
    fields = [client_id] + feature_sums.astype('int').tolist() + [n_friends]
    out_file.write('\t'.join(str(field) for field in fields) + '\n')


# Number of feature values between the first and last field of every input row.
n_features = 1014

# Stream the friends file and collapse consecutive rows that share a client id
# into one row of summed features plus the number of rows that were summed.
# This only groups correctly if each client's rows are contiguous in the input.
with open('FINAL_FEATURES_FRIENDS.tsv', mode='r') as rfile, \
        open('friends.tsv', mode='w') as wfile, \
        tqdm(total=9974289) as pbar:
    # NOTE(review): the header is copied verbatim, so its last name still
    # labels the input's final field while output rows carry the friend count
    # in that position -- confirm downstream readers expect this.
    wfile.write(rfile.readline())

    prev_client_id = None  # None means no client has been seen yet
    n_friends = 0
    row_data = np.zeros(n_features)

    for line in rfile:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        row = list(map(float, line.split('\t')))
        client_id = int(row[0])

        if client_id != prev_client_id:
            # A new client starts: flush the finished one. Nothing is written
            # before the first client, so no placeholder row is emitted.
            if prev_client_id is not None:
                _write_client_totals(wfile, prev_client_id, row_data, n_friends)
            prev_client_id = client_id
            n_friends = 0
            row_data = np.zeros(n_features)

        # Accumulate everything between the first and last field.
        row_data = row_data + np.array(row[1:-1])
        n_friends += 1
        pbar.update(1)

    # Flush the final client, which never triggers the client-change branch.
    if prev_client_id is not None:
        _write_client_totals(wfile, prev_client_id, row_data, n_friends)

In [None]:
# Per-client friend totals produced by the aggregation cell above.
feat_friends = read_csv_pgbar(csv_path='friends.tsv')

# Client features, with a constant N_FRIENDS column of 1 appended to every row.
feat_client = pd.read_csv('client_feat.csv').assign(N_FRIENDS=1)

feat_client.head(5)

In [None]:
# Stack the client rows and the friend-total rows into one table.
total_friends = pd.concat([feat_client, feat_friends])

# Sum every feat_client column except the first, grouped by CLIENT_ID.
agg_rule = {col: 'sum' for col in feat_client.columns[1:]}
aggr_total_friends = total_friends.groupby('CLIENT_ID').agg(agg_rule)

# Divide each summed column by the last aggregated column, row by row, and
# round to two decimals. The result is built as a new frame, so
# aggr_total_friends is left untouched and no float values are written in
# place into the summed columns.
feature_sums = aggr_total_friends.iloc[:, :-1]
group_sizes = aggr_total_friends.iloc[:, -1]
av_total_friends = pd.concat(
    [feature_sums.div(group_sizes, axis=0).round(2), aggr_total_friends.iloc[:, -1:]],
    axis=1,
)

In [None]:
# Restore the group key as a column, drop the last column by position, and
# write the result to av_friends.csv without the positional index.
(
    av_total_friends
    .reset_index()
    .iloc[:, :-1]
    .to_csv('av_friends.csv', index=False)
)