In [17]:
cfg = {
  "file_paths": {
    "preprocessed": "../data/preprocessed",
    "embedded": "../data/embedded",
    "predicted": "../data/predicted"
  },
  "dataload": {
    "load": {
      "dataset_name": "dmg777k",
      "use_final_distribution": True,
      "use_torch": True,
      "prune_dist": 25
    }
  },
  "preprocess": {
    "bin_numbers2": {
      "num_bins": 10
    },
    "append_weekday_and_month": {
      "neighborhood": 1
    },
    "delete_number_literals": {
      "neighborhood": 1
    },
    "do_nothing": {
      "nothingness": True
    }
  },
  "embed": {
    "RDF2Vec": {
      "embedding_name": "Word2Vec",
      "embedding_args": {
        "workers": 4,
        "epochs": 40
      },
      "walker_name": "RandomWalker",
      "walker_args": {
        "max_depth": 3,
        "max_walks": 500
      }
    },
    "TransE": {
      "optimizer": "Adam",
      "train_loop_type": "SLCWA",
      "train_loop_args": {
        "num_epochs": 30,
        "batch_size": 1024
      }
    },
    "complex": {
      "optimizer": "Adam",
      "train_loop_type": "SLCWA",
      "train_loop_args": {
        "num_epochs": 30,
        "batch_size": 1024
      }
    }
  },
  "evaluate": {
    "RandomForest": {
      "n_estimators": [
        10,
        20,
        40
      ],
      "max_depth": [
        3,
        5,
        10
      ],
      "cv": 10
    },
    "SVM": {
      "n_estimators": [
        10,
        20,
        40
      ],
      "max_depth": [
        3,
        5,
        10
      ],
      "cv": 10
    },
    "KNN": {
      "n_neighbors": [
        2,
        4,
        7,
        9,
        15
      ],
      "leaf_size": [
        10,
        20,
        30,
        50
      ]
    },
    "NB": {
      "alpha": [
        1,
        2,
        3
      ]
    }
  },
  "pipeline": {
    "dataload": "load",
    "preprocess": [
      "delete_number_literals"
    ],
    "embed": "TransE",
    "evaluate": [
      "SVM",
      "KNN"
    ]
  }
}

In [20]:
cfg['file_paths']

{'preprocessed': '../data/preprocessed',
 'embedded': '../data/embedded',
 'predicted': '../data/predicted'}

In [6]:
cfg['preprocess']

{'bin_numbers2': {'num_bins': 10},
 'append_weekday_and_month': {'neighborhood': 1},
 'delete_number_literals': {'neighborhood': 1},
 'do_nothing': {'nothingness': True}}

In [39]:
from omegaconf import DictConfig, OmegaConf
import numpy as np 

import dataload
import embed
import evaluate
from utils.data_utils import data_to_kg, extract_ents, update_dataset_name
import preprocess
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
print("Data loading...")
# TODO only load if preprocessed file not available
print(cfg["preprocess"][cfg["pipeline"]["preprocess"][0]])
data = getattr(dataload, "load")(**cfg["dataload"]["load"])
data.name = cfg["dataload"]["load"]["dataset_name"]
data = update_dataset_name(
    data, cfg["preprocess"], cfg["pipeline"]["preprocess"])

print("Preprocess started...")
for step in cfg["pipeline"]["preprocess"]:
    print(f"Processing step {step}...")
    data = getattr(preprocess, step)(data, **cfg["preprocess"][step])

# TODO save preprocessed file 

print("Embedding started...")
# TODO load embedder if allready there ? (not best idea since i want to do 5 seperate embeddings for each embedding method)
embedder = getattr(embed, cfg["pipeline"]["embed"])(data,
        **cfg["embed"][cfg["pipeline"]["embed"]])

train_entities, test_entities, train_target, test_taget = extract_ents(
    data)  # extract necessary fields from data

print("fit_transform")
embeddings, train_embeddings, test_embeddings = embedder.fit_transform()
np.savetxt(f'{cfg["file_paths"]["embedded"]}/{data.name}$train.csv',train_embeddings,delimiter=',',fmt="%s")
np.savetxt(f'{cfg["file_paths"]["embedded"]}/{data.name}$test.csv',test_embeddings,delimiter=',',fmt="%s")

# TODO pickle embedder
# TODO save embeddings and be able to save multiple of same embedding method (e.g. _0, _1 ...)
# prio 1
print("Classifier fitting started...")
# TODO pack into 1 step or create map instead of list to be able to know model name.
models = {}
for m in cfg["pipeline"]["evaluate"]:
    print('fitting {m}...')
    model = getattr(evaluate, m)(
        **cfg["evaluate"][m])
    model.fit(train_embeddings, train_target)
    models[m] = model

print("Evaluation started...")
# TODO save into file to analyze in subsequent stages (also with _0,_1 and so on)
# prio 1.1
for m, model in models.items():
    print(f"evaluating model {model}")
    predictions = model.predict(test_embeddings)
    np.savetxt(f'{cfg["file_paths"]["predicted"]}/{data.name}${m}.csv',[predictions, test_taget],delimiter=',',fmt="%s")
    print(
        f"Predicted {len(test_entities)} entities with an accuracy of "
        + f"{accuracy_score(test_taget, predictions) * 100 :.4f}%"
    )
    print(f'resulted in following f scores: micro {f1_score(test_taget, predictions, average="micro")} macro {f1_score(test_taget, predictions, average="macro")}')
    print("Confusion Matrix :")
    print(confusion_matrix(test_taget, predictions))


print("Save Data...")


Data loading...
{'neighborhood': 1}
loaded data dmg777k (57.36s).
pruned (36.13s).
Preprocess started...
Processing step delete_number_literals...
Embedding started...
pykeen file does not exist. Writing pykeen file...


No random seed is specified. This may lead to non-reproducible results.


fit_transform


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]
Training batches on cuda:0:   0%|          | 0/749 [00:00<?, ?batch/s]
Training batches on cuda:0:   1%|          | 6/749 [00:00<00:13, 56.34batch/s]
Training batches on cuda:0:   2%|▏         | 16/749 [00:00<00:09, 77.58batch/s]
Training batches on cuda:0:   3%|▎         | 25/749 [00:00<00:08, 82.48batch/s]
Training batches on cuda:0:   5%|▍         | 35/749 [00:00<00:08, 85.75batch/s]
Training batches on cuda:0:   6%|▌         | 45/749 [00:00<00:08, 87.20batch/s]
Training batches on cuda:0:   7%|▋         | 54/749 [00:00<00:08, 86.57batch/s]
Training batches on cuda:0:   9%|▊         | 64/749 [00:00<00:07, 88.01batch/s]
Training batches on cuda:0:  10%|▉         | 73/749 [00:00<00:07, 88.34batch/s]
Training batches on cuda:0:  11%|█         | 82/749 [00:00<00:07, 85.38batch/s]
Training batches on cuda:0:  12%|█▏        | 92/749 [00:01<00:07, 86.74batch/s]
Training batches on cuda:0:  14%|█▎        | 102/749 [00:01<0

Classifier fitting started...
fitting {m}...
fitting {m}...
Evaluation started...
evaluating model <evaluate.svm.SVM object at 0x000002615DA2B6A0>
Predicted 2001 entities with an accuracy of 56.5217%
resulted in following f scores: micro 0.5652173913043478 macro 0.31117559008059376
Confusion Matrix :
[[  0  70   1   4   1]
 [  0 857   7  89   3]
 [  0  96  34  14   0]
 [  0 311   5 238   1]
 [  0 236   3  29   2]]
evaluating model <evaluate.knn.KNN object at 0x000002610A739760>
Predicted 2001 entities with an accuracy of 54.3728%
resulted in following f scores: micro 0.543728135932034 macro 0.3085809778247186
Confusion Matrix :
[[  0  68   2   2   4]
 [  1 841   7  77  30]
 [  0  91  32  18   3]
 [  0 343   2 201   9]
 [  1 222   4  29  14]]
Save Data...


In [16]:
np.array([predictions, test_taget])

array([[1, 4, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 4, 1, 1]])

In [9]:
np.array(train_embeddings)

NameError: name 'np' is not defined

In [14]:
import numpy as np 
np.array(train_embeddings).tofile('../data/processed/test.csv',sep=',')

In [26]:
np.fromfile("../data/predicted/dmg777k+delete_number_literals-neighborhood@1$SVM",sep=',')

array([1., 1., 1., ..., 4., 1., 1.])

In [38]:
np.savetxt("test.csv",[predictions, test_taget],delimiter=',',fmt="%s")