# Snowtornado

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics.pairwise import cosine_similarity
from stock_exchange import create_interval_tree
from validation import train_test_split, evaluate_model, output_evaluation

## Preprocessing

- Read products
- Read matches
- Create interval tree from stock exchange

In [None]:
# Product master data, indexed by product id; every field is kept as a string.
# NOTE(review): the match ids below are left to pandas' type inference while
# products are read with dtype=str — confirm the id types line up for the
# label-based lookups done on these ids.
products = pd.read_csv("data/products.csv", index_col="id", dtype=str)

# Only the first three columns of the matches file are used; their names are
# supplied explicitly.
matches = pd.read_csv(
    "data/matches.csv",
    names=['original_id', 'replacement_id', 'timestamp'],
    usecols=[0, 1, 2],
    index_col=False,
)
matches['timestamp'] = pd.to_datetime(matches['timestamp'])

# Built by the project's stock_exchange helper; it is later queried by timestamp.
stock_exchange = create_interval_tree("data/stock_exchange.csv")

products.shape

Remove rarely used columns (those with missing values in more than 15% of the products)

In [None]:
# Per-column missing-value statistics: absolute count and fraction of rows.
null_mask = products.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Drop every column in which more than 15% of the values are missing.
# `axis` is passed by keyword: the positional form was deprecated in
# pandas 1.3 and no longer works in pandas 2.0.
sparse_columns = missing_data[missing_data['Percent'] > 0.15].index
products = products.drop(sparse_columns, axis=1)

Clean the data by lowercasing every field (except `name`) and stripping surrounding whitespace, then convert each field to a category type and one-hot encode it

In [None]:
# Columns that are not turned into categorical features.
excluded_columns = ("name",)

binary = pd.DataFrame()
for column in products.columns:
    if column in excluded_columns:
        continue
    # Normalise case and surrounding whitespace before categorising so that
    # differently formatted spellings of a value share one category.
    normalized = products[column].str.lower().str.strip()
    binary[column] = normalized.astype('category')

# One-hot encode every category; dummy_na=True also adds a NaN indicator
# column per feature.
binary = pd.get_dummies(binary, dummy_na=True)

# Keep only the indicator columns that are set for at least one product
# (this removes the superfluous NaN columns).
has_nonzero = (binary != 0).any(axis=0)
binary = binary.loc[:, has_nonzero]
binary.shape

In [None]:
# Split the matches into a training and a test set with the project's
# validation.train_test_split helper.
# NOTE(review): 0.9 is presumably the training fraction; how stock_exchange
# is used by the split is not visible here — confirm against validation.py.
matches_train, matches_test = train_test_split(matches, 0.9, stock_exchange)

Fill the training matrices with the binary feature vectors of each match's original and replacement product

In [None]:
# One row per training match, one column per binary feature.
train_shape = (matches_train.shape[0], binary.shape[1])
X_train = np.zeros(train_shape, dtype=bool)  # features of the original product
Y_train = np.zeros(train_shape, dtype=bool)  # features of the replacement product

In [None]:
# Row i of X_train / Y_train receives the binary feature vector of the i-th
# training match's original / replacement product.
# Iterating the two id columns directly avoids building a Series per row
# (as iterrows does) and enumerate replaces the hand-maintained counter.
id_pairs = zip(matches_train['original_id'], matches_train['replacement_id'])
for i, (original_id, replacement_id) in enumerate(id_pairs):
    # .loc[[label]] always yields a frame; [0] takes its first matching row.
    X_train[i] = binary.loc[[original_id]].values[0]
    Y_train[i] = binary.loc[[replacement_id]].values[0]

In [None]:
def products_on_the_market(timestamp):
    """Return the rows of ``binary`` for the products on the market at *timestamp*.

    ``stock_exchange[timestamp]`` is queried for its entries and the ``data``
    attribute of each entry is used as a row label of ``binary``.
    Duplicate labels are collapsed so every product appears once.
    """
    # A set removes duplicates, but pandas no longer accepts a set as a
    # .loc indexer (deprecated in 1.5, TypeError in 2.0), so pass a list.
    product_ids = list({entry.data for entry in stock_exchange[timestamp]})
    return binary.loc[product_ids]

def find_all_matches(model, original_id, timestamp):
    """Rank the products on the market at *timestamp* as replacements.

    Args:
        model: object with a ``predict(values)`` method; it is given the
            binary feature row of *original_id* as a 2-D array.
        original_id: row label in ``binary`` of the product to replace.
        timestamp: point in time passed to ``products_on_the_market``.

    Returns:
        A list of row labels of the candidate products, ordered by
        descending cosine similarity to the model's prediction.
    """
    prediction = model.predict(binary.loc[[original_id]].values)
    # Look the candidates up once and reuse them, so the similarity scores
    # and the returned labels are guaranteed to come from the same frame.
    candidates = products_on_the_market(timestamp)
    similar = cosine_similarity(prediction, candidates)[0]
    # Negate so argsort yields the most similar candidate first.
    indices = np.argsort(-similar)
    return candidates.index.values[indices].tolist()

In [None]:
class OneToOneModel:
    """Identity model: its prediction is the input itself.

    Offers the same ``predict`` entry point as a trained model.
    """

    def predict(self, values):
        """Return *values* unchanged (the very same object)."""
        return values

# Baseline: evaluate the identity model on the test matches.
# find_all_matches is handed over as the candidate-ranking callback
# (how output_evaluation uses it is defined in the validation module).
output_evaluation(OneToOneModel(), matches_test, find_all_matches)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# The network maps a binary feature vector onto a vector of the same width.
neurons = X_train.shape[1]

# A single fully connected sigmoid layer.
model = Sequential()
model.add(Dense(input_dim=neurons, units=neurons, activation='sigmoid'))
# NOTE(review): 'cosine_proximity' was renamed to 'cosine_similarity' in
# later Keras releases — confirm against the installed Keras version.
model.compile(loss='cosine_proximity', optimizer='adagrad')

output_epoch = 100   # evaluate after every this many epochs
total_epochs = 400   # overall training length

for epoch in range(0, total_epochs, output_epoch):
    # Continue training from `epoch` up to the next evaluation point.
    next_checkpoint = epoch + output_epoch
    model.fit(
        X_train,
        Y_train,
        initial_epoch=epoch,
        epochs=next_checkpoint,
        batch_size=32,
        verbose=0,
    )
    print("After epoch: {}".format(next_checkpoint))
    output_evaluation(model, matches_test, find_all_matches)