In [22]:
%matplotlib inline
import pandas as pd
import matplotlib
import time
import numpy as np
import matplotlib.pyplot as plt
TRADE_DATA = "../data/Trade.csv"
CHALLENGE_DATA = "../data/Challenge_20180423.csv"

In [23]:
df_challenge = pd.read_csv(CHALLENGE_DATA)
df_trade = pd.read_csv(TRADE_DATA)

In [24]:
print("Overview")
print(df_challenge.head(5))
print(df_challenge.describe())

print("Overview")
print(df_trade.head(5))
print(df_trade.describe())

Overview
  PredictionIdx   DateKey  CustomerIdx  IsinIdx BuySell  CustomerInterest
0    a1e0d80784  20180423         1856    13323     Buy               NaN
1    c2cc6cc2a8  20180423         1856     9230     Buy               NaN
2    a8e94f6344  20180423         1780     9157     Buy               NaN
3    758bae1e35  20180423         2129     9131     Buy               NaN
4    02ab378ee8  20180423         1758     7151     Buy               NaN
          DateKey    CustomerIdx        IsinIdx  CustomerInterest
count    484758.0  484758.000000  484758.000000               0.0
mean   20180423.0    1936.835749   15572.977102               NaN
std           0.0     869.365212    8381.691252               NaN
min    20180423.0       0.000000       1.000000               NaN
25%    20180423.0    1288.000000    7950.000000               NaN
50%    20180423.0    2106.000000   15592.000000               NaN
75%    20180423.0    2574.000000   23841.000000               NaN
max    20180423.0  

In [25]:
def intersect(df1, df2, columns):
    df1 = df1.drop_duplicates(columns)
    df2 = df2.drop_duplicates(columns)
    merged = df1.append(df2)
    return merged[merged.duplicated(columns)]

df = pd.merge(df_challenge, df_trade, on=["CustomerIdx", "IsinIdx"], how="left")
print(df.describe())
print(df.head())

df = intersect(df_trade, df_challenge, ["CustomerIdx", "IsinIdx"])
print("{} out of {} interactions in challange already happened".format(df.shape[0], df_challenge.shape[0]))

          DateKey   CustomerIdx       IsinIdx  CustomerInterest_x  \
count   3193638.0  3.193638e+06  3.193638e+06                 0.0   
mean   20180423.0  2.024466e+03  1.493910e+04                 NaN   
std           0.0  8.547243e+02  7.976784e+03                 NaN   
min    20180423.0  0.000000e+00  1.000000e+00                 NaN   
25%    20180423.0  1.746000e+03  7.833000e+03                 NaN   
50%    20180423.0  2.223000e+03  1.484900e+04                 NaN   
75%    20180423.0  2.585000e+03  2.288900e+04                 NaN   
max    20180423.0  3.470000e+03  2.734400e+04                 NaN   

       TradeDateKey   NotionalEUR         Price  CustomerInterest_y  
count  3.193638e+06  3.193638e+06  2.033936e+06        3.193638e+06  
mean   2.017052e+07  1.375766e+07  1.202644e+05        6.490310e-01  
std    7.151711e+03  3.810539e+08  1.085705e+06        4.772733e-01  
min    2.016010e+07 -7.899616e+07 -9.999990e+05        0.000000e+00  
25%    2.016122e+07  5.82484

In [26]:
train = df_trade[df_trade["TradeDateKey"] < 20180415]
test = df_trade[df_trade["TradeDateKey"] >= 20180415]
positive_samples = df_trade[df_trade["TradeDateKey"] >= 20180415]
negative_samples = df_trade[(df_trade["TradeDateKey"] >= 20171122) & (df_trade["TradeDateKey"] < 20180415)]

In [27]:
df = intersect(train, positive_samples, ["CustomerIdx", "IsinIdx"])
print("{} out of {} interactions in challange already happened".format(df.shape[0], positive_samples.shape[0]))

15466 out of 26124 interactions in challange already happened


In [28]:
def generate_test_set_new(df, from_date=20180415, to_date=None, from_date_label=20171122):
		"""
			The method creates a dataframe for testing purposes similar to the one 
			of the competition. It uses the last 6 months interactions as negative labels.

			@args
				df : DataFrame -> entire Trade Table.
				from_date : int -> corresponding tot the date in which 
					we start the week of test.
				from_date_label : int -> representing the starting date from
					which we start to collect the negative samples.
				to_date : int -> date representing the end of the week.
			@return
				test_set : DataFrame -> with 1 week of positive and negative 
					samples from the week considered and the previous 6 months.
		"""
		# Delete Holding Values
		df = df[df["TradeStatus"] != "Holding"]

		# Drop Useless Columns
		df = df.drop(["TradeStatus", "NotionalEUR", "Price"], axis=1)

		# One Hot Encoding for Sell and Buy
		df = pd.get_dummies(df, columns=['BuySell'])
		
		if to_date is None:
			positive_samples = df[df["TradeDateKey"] >= from_date]
			positive_samples_neg = df[df["TradeDateKey"] >= from_date]
		else:
			positive_samples = df[(df["TradeDateKey"] >= from_date) & (df["TradeDateKey"] < to_date)]
			positive_samples_neg = df[(df["TradeDateKey"] >= from_date) & (df["TradeDateKey"] < to_date)]

		print(positive_samples.describe())

		positive_samples_neg["BuySell_Buy"] = positive_samples["BuySell_Sell"]
		positive_samples_neg["BuySell_Sell"] = positive_samples["BuySell_Buy"]
		positive_samples_neg["CustomerInterest"] = positive_samples_neg["CustomerInterest"]\
            .apply(lambda x: 0 if x > 0 else x)
		
		# Negative Samples
		negative_samples = df[(df["TradeDateKey"] >= from_date_label) & (df["TradeDateKey"] < from_date)]
		negative_samples_neg = df[(df["TradeDateKey"] >= from_date_label) & (df["TradeDateKey"] < from_date)]

		# Opposite Positive
		positive_samples_neg["BuySell_Buy"] = negative_samples["BuySell_Sell"]
		positive_samples_neg["BuySell_Sell"] = negative_samples["BuySell_Buy"]

		# Double Negative Samples
		negative_samples_neg = df[(df["TradeDateKey"] >= from_date_label) & (df["TradeDateKey"] < from_date)]
		negative_samples_neg["BuySell_Buy"] = negative_samples["BuySell_Sell"]
		negative_samples_neg["BuySell_Sell"] = negative_samples["BuySell_Buy"]

		# Put to zero all the negative
		negative_samples["CustomerInterest"] = negative_samples["CustomerInterest"]\
			.apply(lambda x: 0 if x > 0 else x)
		negative_samples_neg["CustomerInterest"] = negative_samples_neg["CustomerInterest"]\
			.apply(lambda x: 0 if x > 0 else x)

		# Concatanate Negative and Positive Samples
		test_set = pd.concat([positive_samples, negative_samples, positive_samples_neg, negative_samples_neg])
		test_set = test_set.drop(["TradeDateKey"], axis=1)

		# Unique Values
		test_set = test_set.groupby(['CustomerIdx', 'IsinIdx', "BuySell_Sell", "BuySell_Buy"]).sum()
		test_set = test_set.reset_index(level=['CustomerIdx', 'IsinIdx', "BuySell_Sell", "BuySell_Buy"])
		test_set["CustomerInterest"] = test_set["CustomerInterest"].apply(lambda x: 1 if x > 1 else x)

		# Reorder the columns
		test_set = test_set[['CustomerIdx', 'IsinIdx', "BuySell_Buy", "BuySell_Sell", 'CustomerInterest']]

		return test_set

In [30]:
test_set_gen = generate_test_set_new(df_trade)
train = pd.get_dummies(train, columns=['BuySell'])
df = intersect(train, test_set_gen, ["CustomerIdx", "IsinIdx", "BuySell_Buy", "BuySell_Sell"])
print("{} out of {} interactions in challange already happened".format(df.shape[0], test_set_gen.shape[0]))

       TradeDateKey   CustomerIdx       IsinIdx  CustomerInterest  \
count  2.180200e+04  21802.000000  21802.000000           21802.0   
mean   2.018042e+07   1890.160306  15974.516237               1.0   
std    1.363429e+00    872.990067   8497.143710               0.0   
min    2.018042e+07      0.000000      7.000000               1.0   
25%    2.018042e+07   1155.000000   8408.000000               1.0   
50%    2.018042e+07   2044.000000  15793.000000               1.0   
75%    2.018042e+07   2573.000000  24454.750000               1.0   
max    2.018042e+07   3470.000000  27357.000000               1.0   

        BuySell_Buy  BuySell_Sell  
count  21802.000000  21802.000000  
mean       0.486148      0.513852  
std        0.499820      0.499820  
min        0.000000      0.000000  
25%        0.000000      0.000000  
50%        0.000000      1.000000  
75%        1.000000      1.000000  
max        1.000000      1.000000  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

326503 out of 440737 interactions in challange already happened
