In [103]:
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from tensorflow.keras import layers
from keras.utils.vis_utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [115]:
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

In [116]:
train_df = pd.read_csv("train.csv", delimiter=',')
train_df

Unnamed: 0.1,Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,0,40.108910,-83.092860,8336,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0
1,1,39.865420,-84.062800,18403,1,0,-1766.845055,14985.640180,477.494992,168836.215743,1
2,2,39.102660,-84.524680,14022,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0
3,3,39.101480,-84.523410,11051,0,0,209049.997460,0.000000,95.340075,103267.727546,1
4,4,41.062130,-81.537840,3243,0,3,8669.269507,0.000000,399.421926,177532.206618,1
...,...,...,...,...,...,...,...,...,...,...,...
159996,159996,41.567510,-72.727300,11307,0,3,16243.910806,31628.125379,425.822833,171803.513497,1
159997,159997,41.857517,-88.143778,6324,0,3,28018.854840,48674.176247,401.849141,68368.711459,1
159998,159998,35.261260,-80.837000,15370,1,1,101012.486516,0.000000,199.255523,190942.481245,0
159999,159999,35.261260,-80.836997,9384,0,1,6461.647033,0.000000,507.408851,18122.078134,1


In [117]:
train_df.loc[train_df["company"].value_counts()[train_df["company"]].values <= 10, "company"] = -1
le = preprocessing.LabelEncoder()
le.fit(train_df["company"])
train_df["enc_company"] = le.transform(train_df["company"])


geo_cluster = KMeans(n_clusters=300, random_state=0)
geo_cluster.fit(train_df[['longitude', 'latitude']])

train_df["cluster"] = geo_cluster.predict(train_df[['longitude', 'latitude']])




In [118]:
# big_companies = set(train_df["company"].where(train_df["company"] > -1))

In [119]:
train_df['company'].nunique()

1385

In [120]:
test_df = pd.read_csv("test.csv", delimiter=',')
test_df

Unnamed: 0.1,Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,160000,39.284310,-76.735350,18187,1,1,113033.389907,0.000000,270.906219,31222.780176,0
1,160001,36.758509,-76.344861,11208,0,0,-87239.590275,73759.387510,759.194862,237587.544996,1
2,160002,43.402802,-75.217100,7437,1,4,-122084.498620,15528.109943,1039.655934,29612.346982,0
3,160003,41.871160,-87.848570,17362,0,1,87355.127256,0.000000,214.594205,163526.475818,1
4,160004,42.161296,-88.129184,11515,0,3,33014.437946,0.000000,379.819724,67499.397999,1
...,...,...,...,...,...,...,...,...,...,...,...
39995,199995,35.241600,-80.983740,11640,0,3,194648.093228,0.000000,95.701485,37584.380280,1
39996,199996,35.241730,-80.983750,11849,0,3,-53202.007008,9641.082811,653.516413,189604.463353,0
39997,199997,35.290596,-80.756953,3987,0,3,-102398.568309,47105.929570,863.135873,26231.880089,0
39998,199998,35.204460,-80.720190,12287,0,0,32066.188506,6659.705140,363.861756,116288.412218,1


In [121]:
test_df.loc[test_df["company"].value_counts()[test_df["company"]].values <= 10, "company"] = -1
# test_df.loc[~test_df["company"].isin(big_companies), "company"] = -1
test_df["enc_company"] = le.transform(test_df["company"])

test_df["cluster"] = geo_cluster.predict(test_df[['longitude', 'latitude']])


In [122]:
x_train, x_validation, y_train, y_validation = train_test_split(train_df.drop('target', axis=1), 
                                                      train_df.target, test_size=0.1, random_state=1)

In [123]:
input_col = keras.Input(shape=(5,), name="input_col")  


x1 = layers.Dense(32)(input_col)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Dense(32)(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.Reshape((1, 32))(x1)


company_input = keras.Input(shape=(None,), name="company_input") 
num_companies=le.classes_.shape[0]
x2 = layers.Embedding(num_companies, 128)(company_input)
x2 = layers.Dense(32)(x2)


type_input = keras.Input(shape=(None,), name="type_input") 
num_type = 5
x3 = layers.Embedding(num_type, 16)(type_input)
x3 = layers.Dense(16)(x3)


geo_input = keras.Input(shape=(None,), name="geo_input") 
num_clusters = 300
x4 = layers.Embedding(num_clusters, 128)(geo_input)
x4 = layers.Dense(32)(x4)


x = layers.Concatenate()([x1, x2, x3, x4])
x = layers.Dense(64)(x)
x = layers.BatchNormalization()(x)
x = layers.Dense(64)(x)
x = layers.BatchNormalization()(x)
outputs = layers.Dense(1)(x)


model = keras.Model(
    inputs=[input_col, company_input, type_input,geo_input ],
    outputs=outputs,
)

# plot_model(model, "model.png", show_shapes=True)

In [124]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath="./checkpoint1",
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)


history = model.fit({"input_col": x_train[["fin_1", "fin_2", "fin_3", "fin_4", "is_local"]],
                     "company_input": x_train["enc_company"], "type_input": x_train["type"], "geo_input": x_train["cluster"]},
                    y_train, batch_size=128, epochs=10, 
                    validation_data=({"input_col": x_validation[["fin_1", "fin_2", "fin_3", "fin_4", "is_local"]],
                     "company_input": x_validation["enc_company"], "type_input": x_validation["type"],
                    "geo_input": x_validation["cluster"]},
                    y_validation), callbacks=[model_checkpoint_callback])

# test_scores = model.evaluate(x_test, np.array(test_df["target"]), verbose=2)
# print("Test loss:", test_scores[0])
# print("Test accuracy:", test_scores[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [125]:
test_scores = model.evaluate({"input_col": test_df[["fin_1", "fin_2", "fin_3", "fin_4", "is_local"]],
                     "company_input": test_df["enc_company"], "type_input": test_df["type"], 
                              "geo_input": test_df["cluster"]}, test_df["target"], verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

1250/1250 - 2s - loss: 0.5618 - accuracy: 0.7510 - 2s/epoch - 2ms/step
Test loss: 0.5618336200714111
Test accuracy: 0.7510250210762024


In [9]:
test_df['company'].nunique()

2970

In [13]:
pip install pydot

Collecting pydot
  Downloading pydot-1.4.2-py2.py3-none-any.whl (21 kB)
Installing collected packages: pydot
Successfully installed pydot-1.4.2
Note: you may need to restart the kernel to use updated packages.
