## Importing the modules and the constants

In [1]:
# All input CSV files sit in the same directory, relative to this notebook.
_DATA_DIR = "../data/"

# One constant per input file; the values are plain string paths.
CHALLENGE_DATA = _DATA_DIR + "Challenge_20180423.csv"
CUSTOMER_DATA = _DATA_DIR + "Customer.csv"
ISIN_DATA = _DATA_DIR + "Isin.csv"
MACRO_MARKET_DATA = _DATA_DIR + "MarketData_Macro.csv"
MARKET_DATA = _DATA_DIR + "Market.csv"
SAMPLE_SUBMISSION = _DATA_DIR + "sample_submission.csv"
TRADE_DATA = _DATA_DIR + "Trade.csv"

import pandas as pd
import numpy as np

## Importing the different datasets for exploration purposes

In [2]:
# Read every input CSV into its own DataFrame. The reads are independent of
# each other; they are grouped here by what the table describes.

# Prediction target rows.
df_challenge = pd.read_csv(CHALLENGE_DATA)

# Entity tables: customers and bonds.
df_customer = pd.read_csv(CUSTOMER_DATA)
df_isin = pd.read_csv(ISIN_DATA)

# Time-indexed tables: trades, per-bond market history, macro indicators.
df_trade = pd.read_csv(TRADE_DATA)
df_market = pd.read_csv(MARKET_DATA)
df_macro = pd.read_csv(MACRO_MARKET_DATA)

## Customer Table 
It represents the data regarding the customers. There are 3471 customers in total. Both Subsector and Country have high cardinality; if this information turns out to be related to the label, a feature engineering step should be applied to them.

In [3]:
# First rows of the customer table followed by its summary statistics.
print("Overview", df_customer.head(5), df_customer.describe(), sep="\n")

# Count of missing values in every column.
print()
print("Nan Values")
print(df_customer.isna().sum())

# Number of distinct values for each descriptive column.
for column_name in ("Sector", "Subsector", "Region", "Country"):
    print()
    print(f"Unique {column_name} Values")
    print(df_customer[column_name].nunique())

# Show the full record of one customer, selected by its index.
print()
print("Example of customer info")
print(df_customer.loc[df_customer["CustomerIdx"] == 2789])

Overview
   CustomerIdx                        Sector                      Subsector  \
0         2975      Banks and Intermediaries                           Bank   
1         1594  Asset Managers & Hedge Funds                            NaN   
2          399                   Corporation  Corp - Comm. & Prof. Services   
3          836                  Asset Owners                      Insurance   
4          816                  Asset Owners                      Insurance   

     Region   Country  
0  Americas  BARBADOS  
1  Americas   BERMUDA  
2  Americas   BERMUDA  
3  Americas   BERMUDA  
4  Americas   BERMUDA  
       CustomerIdx
count  3471.000000
mean   1735.000000
std    1002.135719
min       0.000000
25%     867.500000
50%    1735.000000
75%    2602.500000
max    3470.000000

Nan Values
CustomerIdx      0
Sector           0
Subsector      352
Region           0
Country          0
dtype: int64

Unique Sector Values
5

Unique Subsector Values
41

Unique Region Values
3

Uniq

## General Market Data

No feature is directly correlated to the problem. If the temporal information can be exploited, these features give a general idea about the market, which could affect the prediction. The number of NaN values is not significant, so a simple median/mean imputation could be used.

In [4]:
# First rows of the macro market table followed by its summary statistics.
print("Overview", df_macro.head(5), df_macro.describe(), sep="\n")

# Count of missing values in every column.
print()
print("Nan Values")
print(df_macro.isna().sum())

Overview
    DateKey       SSE      DAX  EUROSTOXX   VSTOXX  FTSE100       HSI  \
0  20150101       NaN      NaN        NaN      NaN      NaN       NaN   
1  20150102       NaN  9764.73    3139.44  26.2531  6547.80  23857.82   
2  20150105  3350.519  9473.16    3023.14  29.6236  6417.16  23721.32   
3  20150106  3351.446  9469.66    3007.91  28.8317  6366.51  23485.41   
4  20150107  3373.954  9518.18    3026.79  28.1715  6419.83  23681.26   

     NIKKEI  DOWJONES_INDU    SP500     ...      Swap_TRY2Y  Swap_TRY5Y  \
0       NaN            NaN      NaN     ...       10.069276    9.915120   
1       NaN       17832.99  2058.20     ...       10.151881    9.976101   
2  17408.71       17501.65  2020.58     ...        9.720274    9.575104   
3  16883.19       17371.64  2002.61     ...        9.446627    9.301924   
4  16885.33       17584.52  2025.90     ...        9.315702    9.221503   

   Swap_USD10Y  Swap_USD2Y  Swap_USD30Y  Swap_USD5Y  Swap_ZAR10Y  Swap_ZAR2Y  \
0     2.295037       

## Bonds Data

It represents the data regarding the different bonds. The number of missing values is not significant, but one-hot encoding (OHE) the categorical columns will be expensive. The date columns could be treated as either categorical or numerical; in this case the temporal information could be relevant to the final prediction.

In [5]:
# First rows of the bond table followed by its summary statistics.
print("Overview", df_isin.head(5), df_isin.describe(), sep="\n")

# Count of missing values in every column.
print()
print("Nan Values")
print(df_isin.isna().sum())

# (printed heading, column) pairs whose number of distinct values is reported,
# in the order they are printed.
unique_count_sections = (
    ("Unique Maturity Date", "ActualMaturityDateKey"),
    ("Unique Issue Date", "IssueDateKey"),
    ("Unique Seniority", "Seniority"),
    ("Unique Currency", "Currency"),
    ("Unique Activity Group", "ActivityGroup"),
    ("Unique Region", "Region"),
    ("Unique Activity", "Activity"),
    ("Unique Risk Captain", "RiskCaptain"),
    ("Unique Owner", "Owner"),
    ("Unique Composite Rating", "CompositeRating"),
    ("Unique Industry Sector", "IndustrySector"),
    ("Unique Industry Subgroup", "IndustrySubgroup"),
    ("Unique Market Issue", "MarketIssue"),
    ("Unique Coupon Type", "CouponType"),
)
for heading, column_name in unique_count_sections:
    print()
    print(heading)
    print(df_isin[column_name].nunique())

# Look up one specific bond index in the bond table.
print()
print("New Bonds")
print(df_isin.loc[df_isin["IsinIdx"] == 27340])

Overview
   IsinIdx  TickerIdx  ActualMaturityDateKey  IssueDateKey Seniority Currency  \
0        0        238               20381231      20051129       GOV      USD   
1        1        238               20331231      20051129       GOV      USD   
2        2        238               20331231      20051129       GOV      ARS   
3        3        236               20170417      20070417       GOV      USD   
4        4        234               20221004      20100222       GOV      ARS   

       ActivityGroup    Region   Activity RiskCaptain          Owner  \
0  FLOW LOCAL MARKET  AMERICAS  ARGENTINA   ARGENTINA  EMK ARGENTINA   
1  FLOW LOCAL MARKET  AMERICAS  ARGENTINA   ARGENTINA  EMK ARGENTINA   
2  FLOW LOCAL MARKET  AMERICAS  ARGENTINA   ARGENTINA  EMK ARGENTINA   
3  FLOW LOCAL MARKET  AMERICAS  ARGENTINA   ARGENTINA  EMK ARGENTINA   
4  FLOW LOCAL MARKET  AMERICAS  ARGENTINA   ARGENTINA  EMK ARGENTINA   

  CompositeRating IndustrySector IndustrySubgroup MarketIssue  IssuedAm

## Trade Table

It represents all the trades that were made over almost 2 years. The Price feature has too many missing values, so the column will be dropped later.

In [6]:
# First rows of the trade table followed by its summary statistics.
print("Overview", df_trade.head(5), df_trade.describe(), sep="\n")

# Count of missing values in every column.
print()
print("Nan Values")
print(df_trade.isna().sum())

# Number of distinct trade dates, customers and bonds present in the trades.
for heading, column_name in (
    ("Unique Dates", "TradeDateKey"),
    ("Unique Customers", "CustomerIdx"),
    ("Unique IsinIdx", "IsinIdx"),
):
    print()
    print(heading)
    print(df_trade[column_name].nunique())

# Every trade recorded for one specific bond index.
print()
print("Trades with New Bonds")
print(df_trade.loc[df_trade["IsinIdx"] == 27340])

Overview
   TradeDateKey  CustomerIdx  IsinIdx BuySell  NotionalEUR  Price TradeStatus  \
0      20161207         2789     8478    Sell     653168.0    0.0     Unknown   
1      20170329         2574    14562     Buy    1656487.0    0.0     Unknown   
2      20170418         2574     4747     Buy     939673.0    0.0     Unknown   
3      20170310         2574     9885    Sell     708082.0    0.0     Unknown   
4      20161116         2574     8885     Buy    1147709.0    0.0     Unknown   

   CustomerInterest  
0               1.0  
1               1.0  
2               1.0  
3               1.0  
4               1.0  
       TradeDateKey   CustomerIdx       IsinIdx   NotionalEUR         Price  \
count  6.762021e+06  6.762021e+06  6.762021e+06  6.762021e+06  2.144088e+06   
mean   2.016750e+07  1.922354e+03  1.460369e+04  6.313228e+06  1.779008e+05   
std    6.768082e+03  8.579293e+02  7.963809e+03  2.712408e+08  1.318503e+06   
min    2.016010e+07  0.000000e+00  0.000000e+00 -1.48554

## Market Data

It represents the historical information about each bond. No missing values are present. INTUITION: since it is a sequence of features over the last 2 years, a Sequence-to-Sequence Autoencoder could represent the data with lower dimensionality.

In [7]:
# Both ends of the market history table, then its summary statistics.
print(
    "Overview",
    df_market.head(5),
    df_market.tail(5),
    df_market.describe(),
    sep="\n",
)

# Count of missing values in every column.
print()
print("Nan Values")
print(df_market.isna().sum())

# Number of non-null entries per column for each bond index.
print()
print("Group By Bond Index")
rows_per_bond = df_market.groupby(["IsinIdx"]).count()
print(rows_per_bond)

# Market history rows for one specific bond index (no blank line is printed
# before this heading).
print("New Bonds")
print(df_market.loc[df_market["IsinIdx"] == 27340])

Overview
   IsinIdx   DateKey    Price  Yield  ZSpread
0        1  20160101  104.250  7.835    5.505
1        7  20160101  107.500  7.520    5.541
2      102  20160101  100.746  4.048    2.085
3      331  20160101  112.790 -0.752   -0.215
4      345  20160101  113.383 -0.667   -0.272
         IsinIdx   DateKey    Price  Yield  ZSpread
9867742    25748  20180416  100.943  2.873    2.645
9867743    25748  20180417  100.948  2.874    2.654
9867744    25748  20180418  100.952  2.874    2.645
9867745    25748  20180419  100.957  2.874    2.623
9867746    25748  20180420  100.961  2.874    2.623
            IsinIdx       DateKey         Price         Yield       ZSpread
count  9.867747e+06  9.867747e+06  9.867747e+06  9.867747e+06  9.867747e+06
mean   1.393869e+04  2.016819e+07  1.054736e+02  1.882719e+01  2.120735e+03
std    7.479061e+03  6.875624e+03  1.206965e+01  4.072348e+03  3.197921e+05
min    1.000000e+00  2.016010e+07  1.500000e-01 -1.170163e+04 -2.484130e+02
25%    7.536000e+03  2.

In [8]:
# Display the first five rows of the challenge table as the cell output.
df_challenge.head(n=5)

Unnamed: 0,PredictionIdx,DateKey,CustomerIdx,IsinIdx,BuySell,CustomerInterest
0,a1e0d80784,20180423,1856,13323,Buy,
1,c2cc6cc2a8,20180423,1856,9230,Buy,
2,a8e94f6344,20180423,1780,9157,Buy,
3,758bae1e35,20180423,2129,9131,Buy,
4,02ab378ee8,20180423,1758,7151,Buy,


## Merging data for better exploration

In [9]:
# Each entry is a heading plus the previews printed under it. Sections are
# separated by one blank line; nothing is printed before the first heading.
preview_sections = [
    ("Customer Information", [df_customer.head(5)]),
    (
        "Market information - historical information for each bond",
        [df_market.head(5)],
    ),
    (
        "Trade Table - all the transactions performed by the users",
        [df_trade.head(5), df_trade.tail(5)],
    ),
    ("Bonds Table - current information about the bonds", [df_isin.head(5)]),
]

for section_number, (heading, previews) in enumerate(preview_sections):
    if section_number > 0:
        print()
    print(heading)
    for preview in previews:
        print(preview)

# The macro market preview is intentionally left disabled:
# print()
# print("General Market information")
# print(df_macro.head(5))

Customer Information
   CustomerIdx                        Sector                      Subsector  \
0         2975      Banks and Intermediaries                           Bank   
1         1594  Asset Managers & Hedge Funds                            NaN   
2          399                   Corporation  Corp - Comm. & Prof. Services   
3          836                  Asset Owners                      Insurance   
4          816                  Asset Owners                      Insurance   

     Region   Country  
0  Americas  BARBADOS  
1  Americas   BERMUDA  
2  Americas   BERMUDA  
3  Americas   BERMUDA  
4  Americas   BERMUDA  

Market information - historical information for each bond
   IsinIdx   DateKey    Price  Yield  ZSpread
0        1  20160101  104.250  7.835    5.505
1        7  20160101  107.500  7.520    5.541
2      102  20160101  100.746  4.048    2.085
3      331  20160101  112.790 -0.752   -0.215
4      345  20160101  113.383 -0.667   -0.272

Trade Table - all the tr

## Understanding Temporal Correlation
The temporal correlation is low, but adding features that describe the history of each client and bond could improve the performance a lot.

In [10]:
# Preview both ends of the trade table before looking at correlations.
print()
print("Trade Table - all the transactions performed by the users")
print(df_trade.head(5))
print(df_trade.tail(5))

print(df_trade.columns)
columns = ["TradeDateKey", "CustomerIdx", "IsinIdx", "NotionalEUR", "CustomerInterest"]

# Correlation of each listed column with the label, using the default method
# of Series.corr. Every value is printed next to its column name so the output
# can be read on its own. The label is skipped because correlating it with
# itself carries no information.
target = "CustomerInterest"
for feat in columns:
    if feat == target:
        continue
    print(f"{feat}: {df_trade[feat].corr(df_trade[target])}")


Trade Table - all the transactions performed by the users
   TradeDateKey  CustomerIdx  IsinIdx BuySell  NotionalEUR  Price TradeStatus  \
0      20161207         2789     8478    Sell     653168.0    0.0     Unknown   
1      20170329         2574    14562     Buy    1656487.0    0.0     Unknown   
2      20170418         2574     4747     Buy     939673.0    0.0     Unknown   
3      20170310         2574     9885    Sell     708082.0    0.0     Unknown   
4      20161116         2574     8885     Buy    1147709.0    0.0     Unknown   

   CustomerInterest  
0               1.0  
1               1.0  
2               1.0  
3               1.0  
4               1.0  
         TradeDateKey  CustomerIdx  IsinIdx BuySell  NotionalEUR  Price  \
6762016      20160101         3470     5957     Buy          0.0    NaN   
6762017      20160101         3470     5958     Buy          0.0    NaN   
6762018      20160101         3470     9491     Buy          0.0    NaN   
6762019      20160101 

## Merging the data - concatenate Trade, Bond, Historical Bond, General Market Info and Customers

In [11]:
# Attach each customer's attributes to the trades through the shared
# CustomerIdx key (default join type of merge).
df_customers_trade = df_trade.merge(df_customer, on="CustomerIdx")

# First rows of the merged table followed by its summary statistics.
print(df_customers_trade.head(5), df_customers_trade.describe(), sep="\n")

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,Sector,Subsector,Region,Country
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA
1,20160804,2789,7474,Buy,1041012.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA
2,20170807,2789,14622,Sell,637322.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA
3,20171120,2789,9131,Sell,2059758.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA
4,20161207,2789,5983,Sell,513203.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,Americas,USA


In [12]:
# The trade and customer frames have already been merged into
# df_customers_trade, so drop the originals before the next merge.
del df_trade
# The customer frame is named `df_customer`; deleting `df_customers` (which no
# cell above defines) raises NameError on a fresh kernel.
del df_customer
# Merging with current bond info
df_customers_trade_bonds = pd.merge(df_customers_trade, df_isin, on=["IsinIdx"])
print(df_customers_trade_bonds.head(5))
print(df_customers_trade_bonds.describe())

Unnamed: 0,TradeDateKey,CustomerIdx,IsinIdx,BuySell,NotionalEUR,Price,TradeStatus,CustomerInterest,Sector,Subsector,...,Region_y,Activity,RiskCaptain,Owner,CompositeRating,IndustrySector,IndustrySubgroup,MarketIssue,IssuedAmount,CouponType
0,20161207,2789,8478,Sell,653168.0,0.0,Unknown,1.0,Asset Managers & Hedge Funds,Hedge Fund,...,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN
1,20160331,2789,8478,Buy,154310.0,,Holding,0.0,Asset Managers & Hedge Funds,Hedge Fund,...,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN
2,20160630,2789,8478,Sell,1993999.0,,Holding,0.0,Asset Managers & Hedge Funds,Hedge Fund,...,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN
3,20160930,2789,8478,Sell,9693562.0,,Holding,0.0,Asset Managers & Hedge Funds,Hedge Fund,...,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN
4,20160101,2789,8478,Sell,11533251.0,,Holding,0.0,Asset Managers & Hedge Funds,Hedge Fund,...,AMERICAS,CDS AND HY,CDS AND HY,US HY PHARMA,CCC+,"Consumer, Non-cyclic",Medical-Drugs,Priv placement,1200000000.0,STEP CPN


In [None]:
# Disabled step: the whole cell body is wrapped in a string literal, so none of
# the statements inside it are executed.
# NOTE(review): the merge inside joins on "TradeDateKey" only, while both
# frames also carry an IsinIdx column. Joining on the date alone would pair
# each trade with every bond quoted on that date - confirm whether "IsinIdx"
# should be part of the key before re-enabling this cell.
"""del df_isin
del df_customers_trade
df_market_rename = df_market.rename(columns={"DateKey":"TradeDateKey"})
del df_market
# Merging with historical bond information
df_customers_trade_bonds_histbond = pd.merge(df_customers_trade_bonds, df_market_rename, on=["TradeDateKey"])
print(df_customers_trade_bonds_histbond.head(5))
print(df_customers_trade_bonds_histbond.describe())"""