In [13]:
#encoding: utf-8
import sys
sys.path.append("../")
import json
import pandas
from model_trainer.evalution import get_prediction, Evalution
from model_trainer.data_loader import load_train_data
from model_trainer.data_loader import load_test_data
from model_trainer.make_feature_file import Make_feature_file
from feature_functions import *
from classifier import *


class Trainer(object):
    def __init__(self,
                classifier,
                model_path,
                feature_function_list,
                train_feature_path,
                test_feature_path,
                test_result_path):

        self.classifier = classifier
        self.model_path = model_path
        self.feature_function_list = feature_function_list
        self.train_feature_path = train_feature_path
        self.test_feature_path = test_feature_path
        self.test_result_path = test_result_path


    def make_feature_file(self, train_AuthorIdPaperIds, test_AuthorIdPaperIds, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author):

        print(("-"*120))
        print(("\n".join([f.__name__ for f in feature_function_list])))
        print(("-" * 120))

        print("make train feature file ...")
        Make_feature_file(train_AuthorIdPaperIds, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author, self.feature_function_list, self.train_feature_path)
        print("make test feature file ...")
        Make_feature_file(test_AuthorIdPaperIds, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author, self.feature_function_list, self.test_feature_path)


    def train_mode(self):
        self.classifier.train_model(self.train_feature_path, self.model_path)

    def test_model(self):
        self.classifier.test_model(self.test_feature_path, self.model_path, self.test_result_path)





if __name__ == "__main__":
    # Imported locally so this cell does not depend on `os` leaking in
    # through one of the star imports above.
    import subprocess

    # Feature functions used to build the feature files.
    feature_function_list = [
        coauthor_1,
        coauthor_2,
        # stringDistance_1,
        # stringDistance_2,
    ]

    # Classifier: decision tree, naive Bayes, etc. Enable exactly one.
    classifier = Classifier(skLearn_DecisionTree())
    # classifier = Classifier(skLearn_NaiveBayes())
    # classifier = Classifier(skLearn_svm())
    # classifier = Classifier(skLearn_lr())
    # classifier = Classifier(skLearn_KNN())
    # classifier = Classifier(sklearn_RandomForestClassifier())
    # classifier = Classifier(skLearn_AdaBoostClassifier())
    # classifier = Classifier(sklearn_VotingClassifier())

    # Model path.
    model_path = config.MODEL_PATH

    # Train feature file, test feature file and test result path.
    train_feature_path = config.TRAIN_FEATURE_PATH
    test_feature_path = config.TEST_FEATURE_PATH
    test_result_path = config.TEST_RESULT_PATH

    # Trainer.
    trainer = Trainer(classifier, model_path, feature_function_list, train_feature_path, test_feature_path, test_result_path)

    # Load data.
    print("loading data...")
    train_AuthorIdPaperIds = load_train_data(config.TRAIN_FILE)  # training data
    test_AuthorIdPaperIds = load_test_data(config.TEST_FILE)  # test data
    # Co-author data. The encoding is given to open(): json.load() no longer
    # accepts an `encoding` keyword (removed in Python 3.9), and the `with`
    # block closes the handle.
    with open(config.COAUTHOR_FILE, encoding="utf-8") as coauthor_fh:
        dict_coauthor = json.load(coauthor_fh)
    # (paperId, AuthorId) --> {"name": "name1##name2", "affiliation": "aff1##aff2"}
    with open(config.PAPERIDAUTHORID_TO_NAME_AND_AFFILIATION_FILE, encoding="utf-8") as name_aff_fh:
        dict_paperIdAuthorId_to_name_aff = json.load(name_aff_fh)
    # Load the csv data with pandas.
    PaperAuthor = pandas.read_csv(config.PAPERAUTHOR_FILE)  # PaperAuthor.csv
    Author = pandas.read_csv(config.AUTHOR_FILE)  # Author.csv
    print("data is loaded...")

    # Extract features for the train and test data and write one feature file each.
    trainer.make_feature_file(train_AuthorIdPaperIds, test_AuthorIdPaperIds, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author)
    # Train the model from the training feature file.
    trainer.train_mode()
    # Predict on the test set with the trained model.
    trainer.test_model()
    # Rearrange the raw predictions into the expected prediction file format.
    get_prediction(config.TEST_FEATURE_PATH, config.TEST_RESULT_PATH, config.TEST_PREDICT_PATH)

    # Evaluation (prediction vs. gold answers).
    gold_file = config.GOLD_FILE
    pred_file = config.TEST_PREDICT_PATH
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the exit status is ignored, as before.
    subprocess.call(["python", "evalution.py", gold_file, pred_file])









Using skLearn decisionTree Classifier
loading data...
data is loaded...
------------------------------------------------------------------------------------------------------------------------
coauthor_1
coauthor_2
------------------------------------------------------------------------------------------------------------------------
make train feature file ...
make test feature file ...
==> Train the model ...
==> Test the model ...


In [16]:
#!/usr/bin/env python
#encoding: utf-8
import os
import sys
import importlib
importlib.reload(sys)
# sys.setdefaultencoding('utf-8')
sys.path.append("../")
import util
import config
from confusion_matrix import Alphabet, ConfusionMatrix


# 对模型的预测结果，重新进行整理，得到想要的格式的预测结果
def get_prediction(test_feature_path, test_result_path, to_file):
    """Regroup per-sample predictions by author and write them as CSV.

    Line i of ``test_feature_path`` is paired with line i of
    ``test_result_path``. The text after the last " # " on a feature line is
    parsed as "<paperId> <authorId>". A prediction of "1" puts the paper in
    the author's ConfirmedPaperIds, "0" in DeletedPaperIds; any other value is
    ignored. Rows are written sorted by AuthorId via ``util.write_dict_to_csv``.

    Args:
        test_feature_path: path of the test feature file.
        test_result_path: path of the classifier output, one label per line.
        to_file: destination path handed to ``util.write_dict_to_csv``.
    """
    # Context managers close both handles (the original leaked them).
    with open(test_feature_path) as feature_fh:
        feature_list = [line.strip() for line in feature_fh]
    with open(test_result_path) as predict_fh:
        predict_list = [line.strip() for line in predict_fh]

    dict_authorId_to_predict = {}
    for feature, predict in zip(feature_list, predict_list):
        paperId, authorId = feature.split(" # ")[-1].split(" ")
        paperId = int(paperId)
        authorId = int(authorId)

        # Create the author's entry on first sight, even if the prediction
        # below is neither "1" nor "0".
        buckets = dict_authorId_to_predict.setdefault(
            authorId, {"ConfirmedPaperIds": [], "DeletedPaperIds": []})

        if predict == "1":
            buckets["ConfirmedPaperIds"].append(paperId)
        elif predict == "0":
            buckets["DeletedPaperIds"].append(paperId)

    # to csv
    data = []
    for authorId in sorted(dict_authorId_to_predict):
        buckets = dict_authorId_to_predict[authorId]
        data.append({
            "AuthorId": authorId,
            "ConfirmedPaperIds": " ".join(map(str, buckets["ConfirmedPaperIds"])),
            "DeletedPaperIds": " ".join(map(str, buckets["DeletedPaperIds"])),
        })

    util.write_dict_to_csv(["AuthorId", "ConfirmedPaperIds", "DeletedPaperIds"], data, to_file)


# 评估。（预测 vs 标准答案）
def _read_pair_labels(csv_path):
    """Map each (AuthorId, paperId) pair found in ``csv_path`` to "1" or "0"."""
    pair_to_label = {}
    for item in util.read_dict_from_csv(csv_path):
        AuthorId = item["AuthorId"]
        # Confirmed (positive) first, then deleted (negative): a paper listed
        # in both columns ends up labelled "0", as in the original code.
        for column, label in (("ConfirmedPaperIds", "1"), ("DeletedPaperIds", "0")):
            # split() without an argument returns [] for an empty column,
            # whereas split(" ") returned [""] and created a bogus
            # (AuthorId, "") pair.
            for paperId in item[column].split():
                pair_to_label[(AuthorId, paperId)] = label
    return pair_to_label


# Evaluation (prediction vs. gold answers).
def Evalution(gold_file_path, pred_file_path):
    """Compare a prediction file against the gold file.

    Both files are read with ``util.read_dict_from_csv`` and must provide the
    columns AuthorId, ConfirmedPaperIds and DeletedPaperIds, the latter two
    being whitespace-separated paper ids.

    Args:
        gold_file_path: path of the gold-standard CSV.
        pred_file_path: path of the prediction CSV.

    Returns:
        A ``ConfusionMatrix`` over the labels "0" and "1" that received one
        ``add(pred, gold)`` call per (AuthorId, paperId) pair of the gold file.

    Raises:
        KeyError: if a gold pair has no entry in the prediction file.
    """
    gold_authorIdPaperId_to_label = _read_pair_labels(gold_file_path)
    pred_authorIdPaperId_to_label = _read_pair_labels(pred_file_path)

    # evaluation
    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")

    cm = ConfusionMatrix(alphabet)
    for pair, gold in gold_authorIdPaperId_to_label.items():
        pred = pred_authorIdPaperId_to_label[pair]
        cm.add(pred, gold)

    return cm



if __name__ == '__main__':
    # The training script runs `python evalution.py <gold> <pred>`, so honour
    # those two arguments. They are accepted only when both name existing
    # files, because inside a Jupyter kernel sys.argv holds the kernel's own
    # launcher arguments. Otherwise fall back to the project configuration
    # instead of machine-specific absolute paths.
    cli_paths = sys.argv[1:]
    if len(cli_paths) == 2 and all(os.path.isfile(p) for p in cli_paths):
        gold_file_path, pred_file_path = cli_paths
    else:
        gold_file_path = config.GOLD_FILE
        pred_file_path = config.TEST_PREDICT_PATH

    cm = Evalution(gold_file_path, pred_file_path)
    # accuracy
    acc = cm.get_accuracy()
    # Print the evaluation result.
    print("")
    print("##" * 20)
    print("    评估结果, 以Accuracy为准")
    print("##" * 20)
    print("")
    print("准确率: {:.2%}".format(acc))
    cm.print_out()



########################################
    评估结果, 以Accuracy为准
########################################

准确率: 74.65%
row = predicted, column = truth
  0     1      
0 426.0 266.0  
1 319.0 1297.0 

0 	precision 0.615607 	recall 0.571812	 F1 0.592902
1 	precision 0.802599 	recall 0.829814	 F1 0.815980
* Overall accuracy rate = 0.746534
* Average precision 0.709103 	 recall 0.700813	 F1 0.704441
