In [13]:
%matplotlib inline
import pandas as pd
import matplotlib
import time
import numpy as np
import matplotlib.pyplot as plt
TRADE_DATA = "../data/Trade.csv"

In [14]:
df_trade = pd.read_csv(TRADE_DATA)
df_trade.columns
df_trade.describe()
df_trade.head()

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0
1,20170329,2574,14562,Buy,1656487.0,0.0,Unknown,1.0
2,20170418,2574,4747,Buy,939673.0,0.0,Unknown,1.0
3,20170310,2574,9885,Sell,708082.0,0.0,Unknown,1.0
4,20161116,2574,8885,Buy,1147709.0,0.0,Unknown,1.0


In [None]:
starting_date = 20160101
ending_date = 201603010
seq_label = 7
seq_features = 56

# Delete Holding Values
df_tra = df_trade[df_trade["TradeStatus"] != "Holding"]

# Drop useless columns
df_tra = df_tra.drop(["TradeStatus", "NotionalEUR", "Price"], axis=1)

# Get Dummies for BuySell feature
df_tra = df_tra.get_dummies(df_tra, columns=["BuySell"])

# Reorder the DF?

# Reorder Trade by Date
df_tra = df_tra.sort_values("TradeDateKey", ascending=True)

print("CREATING DICTIONARY")
print(df_tra.head(5))
print(df_tra.describe())
# Create a dictionary to store {date : value}
dictionary_date = {}
i = 0
for row in df_tra["TradeDateKey"].unique():
    dictionary_date[row]=i
    i = i+1
    
print("DATA PREPROCESSING")
print(df_tra.head(5))
print(df_tra.describe())

# Transform DateKey into a column from 0 to 1000
df_tra["TradeDateKey"] = df_tra["TradeDateKey"].apply(lambda x: dictionary_date[x])

print("FIRST PREPROCESSING DONE: ")
print(df_tra.head(5))
print(df_tra.describe())

def generate_labels(positive_samples, negative_samples):
    
    # Concatanate Negative and Positive Samples
    test_set = pd.concat([positive_samples, negative_samples])
    test_set = test_set.drop(["TradeDateKey"], axis=1)

    # Unique Values
    test_set = test_set.groupby(['CustomerIdx', 'IsinIdx']).sum()
    test_set = test_set.reset_index(level=['CustomerIdx', 'IsinIdx'])
    test_set["CustomerInterest"] = test_set["CustomerInterest"].apply(lambda x: 1 if x > 1 else x)
    
    return test_set

def generate_customer_features(date, customer_id):
    # Take the customer history
    df = df_tra[df_tra["CustomerIdx"] == customer_id]
    
    # Take One Week
    df = df[(df["TradeDateKey"] >= date) & (df["TradeDateKey"] < date + seq_label)]
        
    # Number of Interactions during the period
    num_int = len(df.index)
    
    return num_int

def generate_bond_features(date, bond_id):
    # Take the customer history
    df = df_tra[df_tra["IsinIdx"] == bond_id]
    
    # Take One Week
    df = df[(df["TradeDateKey"] >= date) & (df["TradeDateKey"] < date + seq_label)]
    
    # Number of Interactions during the period
    num_int = len(df.index)
    
    return num_int

def generate_customer_bond_features(date, customer_id, bond_id):
    # Take the customer-bond history
    df = df_tra[(df_tra["CustomerIdx"] == customer_id) & (df_tra["IsinIdx"] == bond_id)]
    
    # Take One Week
    df = df[(df["TradeDateKey"] >= date) & (df["TradeDateKey"] < date + seq_label)]
    
    # Number of Interactions during the period
    num_int = len(df.index)
    
    return num_int

def generate_sequence(date, sample):
    """
        generate a single sequence of shape
        (8, feeatures)
    """
    customer_id = sample[0]
    bond_id = sample[1]
    sequence = []
    for i in range(date-seq_features, date, seq_label):
        cust_feature = generate_customer_features(i, customer_id)
        bond_feature = generate_bond_features(i, bond_id)
        cust_bond_feature = generate_customer_bond_features(i, customer_id, bond_id)
        sample = np.array([cust_feature, bond_feature, cust_bond_feature])
        sequence.append(sample)
        
    sequence = np.asarray(sequence)
    return sequence
    
def generate_set(date, df):
    """
        generate a numpy array with shape:
        (samples, 8, features)
    """
    features = []
    labels = []
    for sample in df:
        feature = generate_sequence(date, sample)
        features.append(feature)
        labels.append(sample[-1])
   
    features = np.asarray(features)
    labels = np.asarray(labels)
    return features, labels

def generate_negative_samples(df):
    # Negative Samples
    negative_samples = df[df["TradeDateKey"] >= 0]
    negative_samples = negative_samples.groupby(["CustomerIdx", "IsinIdx"]).count()
    negative_samples = negative_samples[negative_samples["CustomerInterest"] <= 1]
    negative_samples["CustomerInterest"] = negative_samples["CustomerInterest"].apply(lambda x: 0 if x > 0 else x)
    negative_samples = negative_samples.reset_index(level=['CustomerIdx', 'IsinIdx'])
    return negative_samples

data = []
negative_samples = generate_negative_samples(df_tra)
print("GENERATED GENERAL NEGATIVE SAMPLES")

for i in range(starting_date, ending_date - seq_label):
    positive_samples = df_tra[(df_tra["TradeDateKey"] >= i) & (df_tra["TradeDateKey"] < i + seq_label)]
    labels = generate_labels(positive_samples, negative_samples)
    data.append((i, labels))

print("GENERATED DATA")
print(len(data))
# split train test with indeces
train = data[0:-2]
val = data[-2]
test = data[-1]

print("Some Statistics About The Weekly interactions.")
for i, df in train:
    print(df.shape)

print("GENERATING TRAIN SET...")
train_samples = []
for i, df in train:
    X, y = generate_set(i, df.values)
    print(X.shape)
    print(y.shape)
    train_samples.append((X, y))

print("GENERATED TRAIN SET")
# concatanate train or train them separately (I can also train on the last one and that's it.)
# (samples, 8, features)
# shuffle and train


    

CREATING DICTIONARY
         TradeDateKey  CustomerIdx  IsinIdx BuySell  CustomerInterest
1861377      20160104         1758     4937     Buy               1.0
1100790      20160104         2273     2890     Buy               1.0
281091       20160104          939    15328    Sell               1.0
840342       20160104         3283     7860     Buy               1.0
840310       20160104         1875    22836    Sell               1.0
       TradeDateKey   CustomerIdx       IsinIdx  CustomerInterest
count  2.211929e+06  2.211929e+06  2.211929e+06         2211929.0
mean   2.016819e+07  1.906291e+03  1.444043e+04               1.0
std    6.948648e+03  9.086160e+02  7.682489e+03               0.0
min    2.016010e+07  0.000000e+00  0.000000e+00               1.0
25%    2.016082e+07  1.145000e+03  7.699000e+03               1.0
50%    2.017032e+07  2.103000e+03  1.463500e+04               1.0
75%    2.017101e+07  2.574000e+03  2.176700e+04               1.0
max    2.018042e+07  3.470000e+0

In [None]:
# X_test score:
X_test, y_test = generate_set(i, test.values)
# Transform X_test
# Predict X_test

# Creating submission file
# Load challenge file
# Transform DataFrame
date = 20180423
for sample in data:
    X = generate_sequence(date, sample)
    X = normalize_sequence(X)
    y = model.predict(X)

In [None]:
# Create a dictionary {(date, customer) : [features]}
# Delete Holding Values
df = df_trade[df_trade["TradeStatus"] != "Holding"]
# Drop useless columns
df = df.drop(["TradeStatus", "NotionalEUR", "Price"], axis=1)
# Get Dummies for BuySell feature
df = df.get_dummies(df_tra, columns=["BuySell"])
# Reorder Trade by Date
df = df.sort_values("TradeDateKey", ascending=True)
print("CREATING DICTIONARY")
print(df.head(5))
print(df.describe())
# Create a dictionary to store {date : value}
dictionary_date = {}
i = 0
for row in df["TradeDateKey"].unique():
    dictionary_date[row]=i
    i = i+1
    
print("DATA PREPROCESSING")
print(df.head(5))
print(df.describe())

# Transform DateKey into a column from 0 to ...
df["TradeDateKey"] = df["TradeDateKey"].apply(lambda x: dictionary_date[x])

print("FIRST PREPROCESSING DONE: ")
print(df.head(5))
print(df.describe())

df = df.groupby("CustomerIdx", "TradeDateKey").count()
df["NumInt"] = df["TradeDateKey"]
num_int = num_int.reset_index()




In [51]:
max_date = df_trade["TradeDateKey"].max()

num_int = df_trade.groupby('IsinIdx').count()
num_int["NumInt"] = num_int["TradeDateKey"]
num_int = num_int.reset_index()
num_int = num_int[["IsinIdx", "NumInt"]]
print(num_int.head())
print(num_int.describe())

# Last interactions
last_int = df_trade.groupby('IsinIdx').max()
last_int["LastInt"] = last_int["TradeDateKey"].apply(lambda x: max_date - x)
# Reset Idx
last_int = last_int.reset_index()
last_int = last_int[["IsinIdx", "LastInt"]]
print(last_int.head())
print(last_int.describe())

# Merge
df = pd.merge(num_int, last_int, on=['IsinIdx'], how='left')
print(df.head())
print(df.describe())

# Last month Interactions
last_month_df = df_trade[df_trade["TradeDateKey"] >= max_date -30]
last_month_df = last_month_df.groupby('IsinIdx').count()
last_month_df["LastMonthInt"] = last_month_df["TradeDateKey"]
# Reset Idx
last_month_df = last_month_df.reset_index()
last_month_df = last_month_df[["IsinIdx", "LastMonthInt"]]
print(last_month_df.head())

# Merge
df = pd.merge(df, last_month_df, on=['IsinIdx'], how='left')
print(df.head())
print(df.describe())

# Last Week Interactions
last_week_df = df_trade[df_trade["TradeDateKey"] >= max_date -7]
last_week_df = last_week_df.groupby('IsinIdx').count()
last_week_df["LastWeekInt"] = last_week_df["TradeDateKey"]
# Reset Idx
last_week_df = last_week_df.reset_index()
last_week_df = last_week_df[["IsinIdx", "LastWeekInt"]]
print(last_week_df.head())

# Merge
df = pd.merge(df, last_week_df, on=['IsinIdx'], how='left')
print(df.head())
print(df.describe())

# Last 2 Week Interactions
last_2_week_df = df_trade[df_trade["TradeDateKey"] >= max_date -15]
last_2_week_df = last_2_week_df.groupby('IsinIdx').count()
last_2_week_df["Last2WeekInt"] = last_2_week_df["TradeDateKey"]
# Reset Idx
last_2_week_df = last_2_week_df.reset_index()
last_2_week_df = last_2_week_df[["IsinIdx", "Last2WeekInt"]]
print(last_2_week_df.head())

# Merge
df = pd.merge(df, last_2_week_df, on=['IsinIdx'], how='left')
print(df.head())
print(df.describe())

# Last 2 Month Interactions
last_2_month_df = df_trade[df_trade["TradeDateKey"] >= max_date -60]
last_2_month_df = last_2_month_df.groupby('IsinIdx').count()
last_2_month_df["Last2MonthInt"] = last_2_month_df["TradeDateKey"]
# Reset Idx
last_2_month_df = last_2_month_df.reset_index()
last_2_month_df = last_2_month_df[["IsinIdx", "Last2MonthInt"]]
print(last_2_month_df.head())

# Merge
df = pd.merge(df, last_2_month_df, on=['IsinIdx'], how='left')
print(df.head())
print(df.describe())

# Fill NAN with Zeros
df.fillna(0, inplace=True)

print(df.head())
print(df.describe())

   IsinIdx  NumInt
0        0      32
1        1     595
2        2      75
3        3    1467
4        4      15
            IsinIdx        NumInt
count  27305.000000  27305.000000
mean   13695.877312    247.647720
std     7902.381870    279.575663
min        0.000000      1.000000
25%     6855.000000     63.000000
50%    13700.000000    151.000000
75%    20542.000000    333.000000
max    27394.000000   3248.000000
   IsinIdx  LastInt
0        0     9507
1        1       91
2        2       91
3        3      194
4        4       91
            IsinIdx       LastInt
count  27305.000000  27305.000000
mean   13695.877312   1908.650906
std     7902.381870   4508.013246
min        0.000000      0.000000
25%     6855.000000      4.000000
50%    13700.000000     17.000000
75%    20542.000000    121.000000
max    27394.000000  20321.000000
   IsinIdx  NumInt  LastInt
0        0      32     9507
1        1     595       91
2        2      75       91
3        3    1467      194
4        4    

In [65]:
# Transform it into a dictionary
from itertools import islice
def take(n, iterable):
    return list(islice(iterable, n))
def print_dictionary(dictionary=None):
    n_items = take(10, dictionary.items())
    for key, val in n_items:
        print(key, val)
    return
# One solution
df['Features'] = list(zip(df['NumInt'], df['LastInt'], df["LastMonthInt"], df["LastWeekInt"], df["Last2WeekInt"], df["Last2MonthInt"]))
df_dict = df.groupby("IsinIdx")["Features"].apply(list).to_dict()
print_dictionary(df_dict)

0 [(32, 9507, 0.0, 0.0, 0.0, 0.0)]
1 [(595, 91, 0.0, 0.0, 0.0, 0.0)]
2 [(75, 91, 0.0, 0.0, 0.0, 0.0)]
3 [(1467, 194, 0.0, 0.0, 0.0, 0.0)]
4 [(15, 91, 0.0, 0.0, 0.0, 0.0)]
5 [(133, 291, 0.0, 0.0, 0.0, 0.0)]
6 [(188, 91, 0.0, 0.0, 0.0, 0.0)]
7 [(1447, 3, 11.0, 7.0, 10.0, 11.0)]
8 [(120, 9391, 0.0, 0.0, 0.0, 0.0)]
9 [(9, 20015, 0.0, 0.0, 0.0, 0.0)]


In [10]:
df_trade.sort_values("TradeDateKey", ascending=True).tail(3)

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest
5675819,20180422,3032,6499,Buy,817906.0,,Holding,0.0
5678090,20180422,3032,15303,Buy,1615037.0,,Holding,0.0
5678094,20180422,3032,15308,Buy,2658687.0,,Holding,0.0


In [16]:
df_trade[(df_trade["TradeDateKey"] >= 20180328) & (df_trade["TradeDateKey"] < 20180405)].count()

TradeDateKey        145271
CustomerIdx         145271
IsinIdx             145271
BuySell             145271
NotionalEUR         145271
Price                19782
TradeStatus         145271
CustomerInterest    145271
dtype: int64

In [116]:
# Filter the data
df_trade = df_trade[df_trade["TradeDateKey"] >= 20171008]
# Delete Holding Values
df_trade = df_trade[df_trade["TradeStatus"] != "Holding"]

# Drop Useless Columns
df_trade = df_trade.drop(["TradeStatus", "NotionalEUR", "Price"], axis=1)
df_trade = df_trade.sort_values("TradeDateKey", ascending=True)

In [121]:
# Positive Samples
positive_samples = df_trade[(df_trade["TradeDateKey"] >= 20180328) & (df_trade["TradeDateKey"] < 20180405)]
# positive_samples = positive_samples.groupby(["CustomerIdx", "IsinIdx"]).count()["CustomerInterest"].reset_index(level=['CustomerIdx', 'IsinIdx'])
# positive_samples["CustomerInterest"] = positive_samples["CustomerInterest"].apply(lambda x: 0 if x > 1 else x)

# Negative Samples
negative_samples = df_trade
# negative_samples = negative_samples.groupby(["CustomerIdx", "IsinIdx"]).count()["CustomerInterest"].reset_index(level=['CustomerIdx', 'IsinIdx'])

# negative_samples["CustomerInterest"] = negative_samples["CustomerInterest"].apply(lambda x: 0 if x > 0 else x)
negative_samples = negative_samples.groupby(["CustomerIdx", "IsinIdx"]).count()
negative_samples = negative_samples[negative_samples["CustomerInterest"] <= 2]
negative_samples["CustomerInterest"] = negative_samples["CustomerInterest"].apply(lambda x: 0 if x > 0 else x)
print(negative_samples.describe())
print(negative_samples.head())
negative_samples = negative_samples.reset_index(level=['CustomerIdx', 'IsinIdx'])
print(negative_samples.describe())
print(negative_samples.head())


# print( # ["CustomerInterest"].apply(lambda x: )
# negative_samples = negative_samples.sort_values("CustomerIdx", ascending=True)

test_set = pd.concat([positive_samples, negative_samples])
# test_set = test_set.sort_values("CustomerIdx", ascending=True)
# test_set.groupby(["CustomerIdx", "IsinIdx"]).sum().reset_index(level=['CustomerIdx', 'IsinIdx'])
print(test_set.describe())
test_set.sort_values("CustomerIdx", ascending=True).head(10)

        TradeDateKey        BuySell  CustomerInterest
count  219606.000000  219606.000000          219606.0
mean        1.233596       1.233596               0.0
std         0.423119       0.423119               0.0
min         1.000000       1.000000               0.0
25%         1.000000       1.000000               0.0
50%         1.000000       1.000000               0.0
75%         1.000000       1.000000               0.0
max         2.000000       2.000000               0.0
                     TradeDateKey  BuySell  CustomerInterest
CustomerIdx IsinIdx                                         
0           21856               2        2                 0
            24944               1        1                 0
            25992               2        2                 0
            26726               1        1                 0
            27045               2        2                 0
         CustomerIdx        IsinIdx   TradeDateKey        BuySell  \
count  219606.0000

Unnamed: 0,BuySell,CustomerIdx,CustomerInterest,IsinIdx,TradeDateKey
4,2,0,0,27045,2
0,2,0,0,21856,2
1,1,0,0,24944,1
2,2,0,0,25992,2
3,1,0,0,26726,1
5,1,0,0,27086,1
6,1,0,0,27298,1
11,1,1,0,16400,1
10,1,1,0,10244,1
9,1,1,0,10243,1


In [40]:
df_trade.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,TradeDateKey,BuySell,CustomerInterest,CustomerIdx
CustomerIdx,IsinIdx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,5215,2,2,2,1
2,16812,2,2,2,1
2,17091,1,1,1,1
2,17275,2,2,2,1
14,22209,2,2,2,1


In [11]:
df_trade.groupby("TradeDateKey").count()["CustomerInterest"].values

array([     1,   3524,   5317,   4808,   5031,      2,     19,   5549,
         4622,   5779,   5827,   5731,      5,   1951,   4172,   5096,
         6523,   4711,      7,   4919,   5332,   4708,   6833,   4561,
           15,   5881,   7591, 197510,   6383,   5156,      5,   6088,
         6513,   6159,   7461,   6590,      2,   6710,   6266,   5346,
         5850,   4414,     10,   1618,   5162,   5078,   7130,   4732,
            2,      6,   7591,   5091, 120770,   5307,   4602,   4970,
         5240,   4535,   4108,   3815,   5219,   4615,   4993,   5917,
         4180,   6051,  10461,   6197,   5355,   1878,      1,      1,
         6989,   5462,   6153,   9472,     97, 117437,   1793,   5577,
         4742,  10166,   5414,      1,     14,   3949,   4809,   6407,
         5500,   5401,     49,   4722,   5455,   5987,   5140,   4768,
            3])

In [8]:
dictionary = df_trade.groupby(['CustomerIdx', 'IsinIdx']).count()['TradeDateKey'].to_dict()
dictionary[(0, 24944)]

1

In [41]:
df_trade.describe()
print(np.sort(df_trade["TradeDateKey"].unique()))

TEST_DATE = 20180412
VAL_DATE = 20180405
TRAIN_DATE = 20180328

[20180101 20180102 20180103 20180104 20180105 20180106 20180107 20180108
 20180109 20180110 20180111 20180112 20180114 20180115 20180116 20180117
 20180118 20180119 20180121 20180122 20180123 20180124 20180125 20180126
 20180128 20180129 20180130 20180131 20180201 20180202 20180204 20180205
 20180206 20180207 20180208 20180209 20180211 20180212 20180213 20180214
 20180215 20180216 20180218 20180219 20180220 20180221 20180222 20180223
 20180224 20180225 20180226 20180227 20180228 20180301 20180302 20180305
 20180306 20180307 20180308 20180309 20180312 20180313 20180314 20180315
 20180316 20180319 20180320 20180321 20180322 20180323 20180324 20180325
 20180326 20180327 20180328 20180329 20180330 20180331 20180402 20180403
 20180404 20180405 20180406 20180407 20180408 20180409 20180410 20180411
 20180412 20180413 20180415 20180416 20180417 20180418 20180419 20180420
 20180422]


In [59]:
# Create a Double KEY dictionary
"""train = df_trade.values

from collections import defaultdict
from itertools import product

customer_ids = np.unique(train[:,1])
bonds_ids = np.unique(train[:,2])

cbidx_freq = defaultdict(dict)
for c, b in product(customer_ids, bonds_ids):
    for sample in train:
        i = 0
        if sample[1] == c and sample[2] == b:
            i = i + 1
        cbidx_freq[c][b] = i"""
"""max_value = df_trade["TradeDateKey"].max()
df_trade[df_trade["TradeDateKey"] == max_value].groupby("CustomerIdx").count()["CustomerInterest"]"""


# Generate the label
df_trade.groupby(['CustomerIdx','IsinIdx']).count()['TradeDateKey']

# Generate the train set
df_trade[(df_trade["TradeDateKey"] >= TRAIN_DATE) & (df_trade["TradeDateKey"] < VAL_DATE)].groupby(["CustomerIdx", "IsinIdx"]).count().describe()
df_trade[(df_trade["TradeDateKey"] >= VAL_DATE) & (df_trade["TradeDateKey"] < TEST_DATE)].groupby(["CustomerIdx", "IsinIdx"]).count().describe()
df = df_trade[df_trade["TradeDateKey"] >= TEST_DATE].groupby(["CustomerIdx", "IsinIdx"]).count()["CustomerInterest"]

print(df.head())
print(df.values[0:5])
df.reset_index(level=['CustomerIdx', 'IsinIdx'])

CustomerIdx  IsinIdx
0            27045      1
             27298      1
5            10842      1
8            1812       1
             3364       1
Name: CustomerInterest, dtype: int64
[1 1 1 1 1]


Unnamed: 0,CustomerIdx,IsinIdx,CustomerInterest
0,0,27045,1
1,0,27298,1
2,5,10842,1
3,8,1812,1
4,8,3364,1
5,8,21277,1
6,8,24116,1
7,8,25460,1
8,8,26341,1
9,9,7504,1


In [6]:
from itertools import islice
dictionary = df_trade.groupby('IsinIdx').count()['TradeDateKey'].to_dict()

def take(n, iterable):
    return list(islice(iterable, n))
def print_dictionary(dictionary=None):
    n_items = take(10, dictionary.items())
    for key, val in n_items:
        print(key, val)
    return
print_dictionary(dictionary)

1 7
2 10
3 1
4 4
5 1
6 40
7 76
14 2
15 8
17 31
