# Knowledge Graph for Movie

### Read the dumped KG data and try to process

In [1]:
# Import packages

from typing import Union, List
from enum import Enum

import re
import pickle

import pandas as pd
import numpy as np
import seaborn as sns

import rdflib
from pyparsing import ParseException

In [2]:
with open('k-graph.bin', 'rb') as f:
    g = pickle.load(f)

http://ckb.org/ontology/#镜�^ does not look like a valid URI, trying to serialize this will break.
http://ckb.org/ontology/#侯马董�^坚�呀鹉棺┑� does not look like a valid URI, trying to serialize this will break.


### Part 1 - Try to match movie.txt with knowledge graph

In [3]:
from pyparsing import ParseException
import pprint
import pandas as pd

QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#_%s_> ?b ?c}'

c_list = list()
e_list = list()

data = pd.read_csv("C:/Users/Thesharing/Downloads/YunPan/DeeCamp_2018/Movie.txt", sep='\t', encoding='utf-8')

for i, name in enumerate(data['name']):
    if name is not None and type(name) is str:
        n = name.split(' ')[0]
        if n not in c_list:
            try:
                res = g.query(QUERY % n)
                if len(res) > 0:
                    c_list.append(n)
            except ParseException:
                print('Error at %d %s %s' % (i, name, n))
                e_list.append(n)

Error at 21226 巨蟒剧团：前所未有的表演|And Now for Something Completely Different 巨蟒剧团：前所未有的表演|And
Error at 30824 夜|戏 夜|戏
Error at 42068 唯我独尊\巴黎单身派对 La fabrique des sentiments 唯我独尊\巴黎单身派对
Error at 45758 核弹总动员|核弹快车 Death Train 核弹总动员|核弹快车
Error at 58505 芭比之拇指姑娘\芭比呈献花仙子 Barbie Presents Thumbelina 芭比之拇指姑娘\芭比呈献花仙子


In [4]:
import re

pattern = re.compile('([A-Za-z])+')

good_list = list()
for item in c_list:
    m = pattern.match(item)
    if m is None or len(m.group()) != len(item):
        good_list.append(item)

In [5]:
len(good_list)

12644

In [8]:
with open('good_list.txt', 'w', encoding='utf-8') as f:
    for i in good_list:
        print('http://ckb.org/ontology/#_%s_' % i, file=f)

### Part 2 - Match movie.txt with knowledge graph

In [78]:
# Read movie.txt

import pprint

data = pd.read_csv("C:/Users/Thesharing/Downloads/YunPan/DeeCamp_2018/Movie.txt", sep='\t', encoding='utf-8')

eng_pattern = re.compile(u'[\u0021-\u0126\s]+')
chn_pattern = re.compile(u"[\u4e00-\u9fa5\u3000-\u303f\ufb00-\ufffd][\u4e00-\u9fa5\u3000-\u303f\ufb00-\ufffd\s]+")

class NameType(Enum):
    NONE = 0
    ENG = 1
    CHN = 2
    MERGE = 3

def process_name(name):
    if type(name) is not str:
        return None, NameType.NONE
    elif eng_pattern.fullmatch(name):
        return name, NameType.ENG
    else:  
        l = re.split('\s+', name.strip())
        if eng_pattern.fullmatch(l[0]):
            c_l = chn_pattern.findall(name)
            return ' '.join(list(item.strip() for item in c_l)), NameType.MERGE
        else:
            return l[0], NameType.CHN

QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#_%s_> ?b ?c}'
TYPE_QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#_%s_> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> ?c}'
ENG_QUERY = r'SELECT * WHERE {?a <http://ckb.org/ontology/#英文名> "%s"}'

res = g.query(TYPE_QUERY)
for r in res:
    print(r)

prefix_len = len('http://ckb.org/ontology/#')

t_list = []
f_list = []
d_list = []
e_list = []

idx = 0

with open('movie_names.txt', 'w', encoding='utf-8') as f:
    for item in data['name']:
        idx += 1
        name, name_type = process_name(item)
        if name is not None:
            try:
                if name in t_list:
                    d_list.append(item)
                    print(item, name, 2, file=f)
                else:
                    res = g.query(QUERY % name.replace(' ', '_'))
                    if len(res) > 0:
                        l = []
                        for i in g.query(TYPE_QUERY % name.replace(' ', '_')):
                            l.append(i['c'][prefix_len:])
                        print(item, name, 1, l, file=f)
                        t_list.append(name)
                    else:
                        print(item, name, 0, file=f)
                        f_list.append(item)
            except ParseException:
                print('Error at %s %s' % (item, name))
                e_list.append(item)

Error at 巨蟒剧团：前所未有的表演|And Now for Something Completely Different 巨蟒剧团：前所未有的表演|And
Error at 夜|戏 夜|戏
Error at 唯我独尊\巴黎单身派对 La fabrique des sentiments 唯我独尊\巴黎单身派对
Error at 核弹总动员|核弹快车 Death Train 核弹总动员|核弹快车
Error at 芭比之拇指姑娘\芭比呈献花仙子 Barbie Presents Thumbelina 芭比之拇指姑娘\芭比呈献花仙子


In [80]:
print(len(t_list), len(f_list), len(e_list), len(d_list))

12671 63620 5 4799


In [97]:
# Save to file

with open('t_list.txt', 'w', encoding='utf-8') as f:
    for t in t_list:
        print(t, file=f)

### Part 3 - Fix problem in '主演'

In [103]:
QUERY = r'SELECT * WHERE {?a <http://ckb.org/ontology/#地区> ?c}'

with open('district.txt', 'w', encoding='utf-8') as f:
    for i in g.query(QUERY):
        print(i['c'][prefix_len:], file=f)

In [85]:
g.remove((URIRef(u'http://ckb.org/ontology/#_宝贝计划_'), URIRef(u'http://ckb.org/ontology/#主演') ,URIRef(u'http://ckb.org/ontology/#成龙')))

In [83]:
from rdflib import URIRef, Literal
g.add((URIRef(u'http://ckb.org/ontology/#_宝贝计划_'), URIRef(u'http://ckb.org/ontology/#主演') ,URIRef(u'http://ckb.org/ontology/#成龙')))

In [1]:
import re


chn_pattern = re.compile('[\u4e00-\u9fa5][\u4e00-\u9fa5·\-]+')
bracelet_pattern = re.compile('\(.+?\)')
dot_pattern = re.compile('\.{2,}')
punc_pattern = re.compile('[,;；]')
eng_pattern = re.compile('[A-Za-z\.]+_?[A-Za-z\.\-]+_?[A-Za-z\.\-]+_?[A-Za-z\.\-]+')


def process_name(name):
    name = name.replace('等', ' ').replace('・', '·').replace('•', '·')
    name = re.sub(bracelet_pattern, ' ', name)
    name = re.sub(dot_pattern, ' ', name)
    return list(chn_pattern.findall(name) + eng_pattern.findall(name))

In [3]:
from rdflib import URIRef, Literal
from pyparsing import ParseException
import pickle

with open('k-graph.bin', 'rb') as f:
    g = pickle.load(f)
    
print('Loaded.')

prefix = 'http://ckb.org/ontology/#'
prefix_len = len(prefix)
QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#%s> <http://ckb.org/ontology/#%s> ?c}'
category_list = ['导演', '主演', '制片人', '编剧']

with open('t_list.txt', 'r', encoding='utf-8') as f: 
    for item in f.readlines():
        movie_name = '_' + item.strip('\n') + '_'
        for category in category_list:
            try:
                res = g.query(QUERY % (movie_name, category))
                for r in res:
                    g.remove((URIRef(prefix + movie_name), URIRef(prefix + category), URIRef(r['c'])))
                    for n in process_name(r['c'][prefix_len:]):
                        g.add((URIRef(prefix + movie_name), URIRef(prefix + category), URIRef(prefix + n)))
            except ParseException:
                print(QUERY % (movie_name, category))

http://ckb.org/ontology/#镜�^ does not look like a valid URI, trying to serialize this will break.
http://ckb.org/ontology/#侯马董�^坚�呀鹉棺┑� does not look like a valid URI, trying to serialize this will break.


Loaded.
SELECT * WHERE {<http://ckb.org/ontology/#_Wii Sports_> <http://ckb.org/ontology/#导演> ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_Wii Sports_> <http://ckb.org/ontology/#主演> ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_Wii Sports_> <http://ckb.org/ontology/#制片人> ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_Wii Sports_> <http://ckb.org/ontology/#编剧> ?c}


In [14]:
QUERY = r'SELECT * WHERE {?a <http://ckb.org/ontology/#片长> ?c}'
res = g.query(QUERY)
with open('length.txt', 'w', encoding='utf-8') as f:
    for i in res:
        print(i, file=f)

In [7]:
QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#_宝贝计划_> ?b ?c}'
res = g.query(QUERY)
for i in res:
    print(i)

(rdflib.term.URIRef('http://ckb.org/ontology/#中文名'), rdflib.term.Literal('宝贝计划'))
(rdflib.term.URIRef('http://ckb.org/ontology/#电影公司'), rdflib.term.URIRef('http://ckb.org/ontology/#华谊兄弟影业投资有限公司,成龙英皇电影有限公司'))
(rdflib.term.URIRef('http://ckb.org/ontology/#英文名'), rdflib.term.Literal('ROB-B-HOOD'))
(rdflib.term.URIRef('http://ckb.org/ontology/#导演'), rdflib.term.URIRef('http://ckb.org/ontology/#陈木胜'))
(rdflib.term.URIRef('http://ckb.org/ontology/#发行时间'), rdflib.term.Literal('2006年9月28日'))
(rdflib.term.URIRef('http://ckb.org/ontology/#别名'), rdflib.term.Literal('BB计划'))
(rdflib.term.URIRef('http://ckb.org/ontology/#主演'), rdflib.term.URIRef('http://ckb.org/ontology/#成龙,古天乐,高圆圆,许冠文,陈宝国,应采儿,蔡卓妍,杜丽莎'))
(rdflib.term.URIRef('http://ckb.org/ontology/#ABSTRACT'), rdflib.term.Literal('宝贝计划，成龙06年最新作品，动作喜剧片。剧中成龙扮演的人字拖和古天乐扮演的百达通为了三百万而卷入了针对一个小宝宝的绑架案，结果弄出了一出啼笑皆非，但又富含人生哲理的故事。'))
(rdflib.term.URIRef('http://ckb.org/ontology/#片长'), rdflib.term.Literal('135分钟'))
(rdflib.term.URIRef('http://ckb.org/ontology/#地区

In [6]:
prefix = 'http://ckb.org/ontology/#'
prefix_len = len(prefix)
QUERY = r'SELECT * WHERE {<http://ckb.org/ontology/#%s> "类型" ?c}'

with open('t_list.txt', 'r', encoding='utf-8') as f: 
    for item in f.readlines():
        movie_name = '_' + item.strip('\n') + '_'
        try:
            res = g.query(QUERY % (movie_name))
            for r in res:
                print(r)
        except ParseException:
            print(QUERY % (movie_name))

SELECT * WHERE {<http://ckb.org/ontology/#_喷火女郎_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_皇家国教骑士团_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_周六夜现场_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_智取威虎山_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_X档案_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_伦敦_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_跳跃大搜查线_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_图书馆战争_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_我的初恋_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_买房夫妻_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_盗火线_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_爱的奇迹_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_美国偶像_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_飞天大盗_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_不可思议的教室_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_公众之敌_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_逃脱_> "

SELECT * WHERE {<http://ckb.org/ontology/#_纪念日_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_天空的颜色_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_宇宙_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_邪恶力量_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_复仇_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_单身毒妈_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_死亡之舞_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_废柴联盟_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_超感神探_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_两两相望_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_曾经爱过_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_好汉两个半_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_舞林争霸_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_誓言_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_崩溃_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_幸福摩天轮_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_爱无尽头_> "类型" ?c}
SEL

SELECT * WHERE {<http://ckb.org/ontology/#_蝙蝠侠大战幻影人_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_大团圆_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_碟中谍_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_委托人_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_超级明星_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_王牌大贱谍_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_龙威小子_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_鬼玩人_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_今生今世_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_柏林苍穹下_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_天袭_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_珍妮的画像_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_用心棒_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_碧血金沙_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_挑战星期天_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_色情酒店_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_侏罗纪公园_> "类型"

SELECT * WHERE {<http://ckb.org/ontology/#_骑劫地下铁_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_斯万的爱情_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_冬天的故事_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_孩子王_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_西部往事_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_野战排_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_怒海潜将_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_风儿踢踏踩_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_小活佛_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_巧妇怨_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_阮玲玉_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_寂寞芳心_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_监狱风云_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_紫雨_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_森林王子_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_呐喊_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_王牌对王牌_> "类型" ?c}
S

SELECT * WHERE {<http://ckb.org/ontology/#_意外的春天_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_紫色_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_蝙蝠侠与罗宾_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_辣妹抢银行_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_牺牲_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_龙种_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_距离_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_你那边几点_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_神父同志_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_太极张三丰_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_爱情短片_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_军官与绅士_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_最后一颗子弹_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_御法度_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_红色小提琴_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_爱的亡灵_> "类型" ?c}
SELECT * WHERE {<http://ckb.org/ontology/#_战地军魂_> "类型" 

KeyboardInterrupt: 

### Save for later

In [None]:
# Query one item

res = g.query(r"""
    PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX owl:<http://www.w3.org/2002/07/owl#>
    PREFIX ont:<http://ckb.org/ontology/#>
    SELECT * WHERE {?a rdf:type ont:电影}
    """)
print(len(res))

In [None]:
QUERY = """
        PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl:<http://www.w3.org/2002/07/owl#>
        PREFIX ont:<http://ckb.org/ontology/#>
        SELECT * WHERE {ont:_%s_ rdf:type ?c}
        """