In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
# Directory, relative to the notebook's working directory, that holds the
# input CSV files read by read_preprocessed_financial_data.
data_folder = './data/'

In [3]:
def _hms_token_to_seconds(stamp):
    """Convert the ``H:M:S`` part of a ``'<prefix> H:M:S'`` string to seconds.

    Only the second space-separated token is used; the prefix is discarded
    (presumably a day counter -- TODO confirm against the data description).
    """
    parts = stamp.split(' ')[1].split(':')
    return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])


def _fill_with_most_frequent(column):
    """Return ``column`` with missing values replaced by its most frequent value.

    A column without any non-missing value is returned unchanged, because it
    has no most frequent value to fill with.
    """
    counts = column.value_counts()  # sorted by descending frequency
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


def read_preprocessed_financial_data(data_folder):
    """Load the raw CSV tables and return one preprocessed transaction frame.

    Processing steps:

    1. Read ``transactions.csv`` and ``gender_train.csv`` (comma separated)
       and ``tr_types.csv`` and ``tr_mcc_codes.csv`` (semicolon separated)
       from ``data_folder``.
    2. Outer-merge them on ``customer_id``, ``mcc_code`` and ``tr_type``.
    3. Keep only the rows that have a ``gender`` value.
    4. Replace ``term_id`` with integer labels from ``LabelEncoder``.
    5. Replace ``tr_datetime`` with the seconds encoded by its ``H:M:S`` part.
    6. Fill missing ``mcc_description`` / ``tr_description`` values with the
       most frequent value of the respective column.

    Parameters
    ----------
    data_folder : str or os.PathLike
        Directory containing the four CSV files. A trailing path separator
        is optional.

    Returns
    -------
    pandas.DataFrame
        The merged and preprocessed frame. The original index labels of the
        merged frame are kept (the index is not reset after filtering).
    """
    def read_table(filename, sep):
        # os.path.join keeps working when data_folder lacks a trailing slash.
        return pd.read_csv(os.path.join(data_folder, filename), sep=sep)

    transactions = read_table('transactions.csv', sep=',')
    tr_types = read_table('tr_types.csv', sep=';')
    tr_mcc_codes = read_table('tr_mcc_codes.csv', sep=';')
    gender_train = read_table('gender_train.csv', sep=',')

    # NOTE(review): the outer joins also add a row for every lookup entry
    # (MCC code / transaction type) that no transaction uses; those rows have
    # no gender and are removed by the filter below. A customer present in
    # gender_train but without transactions would survive the filter with a
    # missing tr_datetime and fail in the time conversion. Consider
    # how='inner' for gender_train and how='left' for the lookup tables; the
    # join types are left unchanged here to keep row order and dtypes as-is.
    df = pd.merge(transactions, gender_train, on='customer_id', how='outer')
    df = pd.merge(df, tr_mcc_codes, on='mcc_code', how='outer')
    df = pd.merge(df, tr_types, on='tr_type', how='outer')

    # Only rows with a known gender are kept.
    df = df[df['gender'].notna()]

    le = LabelEncoder()
    df['term_id'] = le.fit_transform(df['term_id'])

    df['tr_datetime'] = df['tr_datetime'].apply(_hms_token_to_seconds)

    df['mcc_description'] = _fill_with_most_frequent(df['mcc_description'])
    df['tr_description'] = _fill_with_most_frequent(df['tr_description'])

    return df

In [4]:
# Build the preprocessed transaction frame from the CSV files in data_folder.
df = read_preprocessed_financial_data(data_folder)
df.shape # (rows, columns); the recorded run produced (3751083, 9)

(3751083, 9)

In [5]:
# Preview the first five rows of the preprocessed frame.
df.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,gender,mcc_description,tr_description
0,39026145.0,37406,4814.0,1030,-2245.92,311690,1.0,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ
1,39026145.0,25711,4814.0,1030,-5614.79,311690,1.0,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ
2,39026145.0,25570,4814.0,1030,-1122.96,311690,1.0,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ
3,39026145.0,31743,4814.0,1030,-2245.92,311690,1.0,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ
4,39026145.0,51128,4814.0,1030,-2245.92,311690,1.0,"Звонки с использованием телефонов, считывающих...",Оплата услуги. Банкоматы СБ РФ


In [6]:
# Fit one vectorizer per text column. A TfidfVectorizer learns its vocabulary
# in fit_transform, so reusing a single instance for both columns would
# overwrite the vocabulary learned for mcc_description and make tfidf_mcc
# impossible to map back to terms or to reproduce on new data.
mcc_vectorizer = TfidfVectorizer()
tfidf_mcc = mcc_vectorizer.fit_transform(df['mcc_description'])

transaction_vectorizer = TfidfVectorizer()
tfidf_transaction = transaction_vectorizer.fit_transform(df['tr_description'])

# Backward-compatible alias: `vectorizer` used to end up fitted on
# tr_description, so later cells that reference it keep the same state.
vectorizer = transaction_vectorizer

tfidf_mcc, tfidf_transaction

(<3751083x499 sparse matrix of type '<class 'numpy.float64'>'
 	with 17524726 stored elements in Compressed Sparse Row format>,
 <3751083x67 sparse matrix of type '<class 'numpy.float64'>'
 	with 23300180 stored elements in Compressed Sparse Row format>)