In [1]:
import findspark
findspark.init()
import pyspark
import re
import json
import pysolr
import datetime
from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql import SQLContext
import sys

In [2]:
# Create (or reuse) the Spark session for this notebook, running against the
# 'local[*]' master under the application name "FreebasePeople".
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("FreebasePeople")
    .getOrCreate()
)

In [3]:
# Input dump to process. The two commented-out alternatives are other
# "freebase-head-*" files that can be swapped in instead.
# input_file = "freebase-head-1000000"
# input_file = "freebase-head-10000000"
input_file = "freebase-head-100000000"
# Load the dump as text through the session's SparkContext.
freebase = spark.sparkContext.textFile(input_file)

In [4]:
# Regular expressions that select the triples of interest from the dump. Each one
# looks for a "/g." or "/m." subject id followed, after a tab, by the predicate URI:
#   re_name  - type.object.name with an English ("@en") literal
#   re_alias - common.topic.alias with an English ("@en") literal
#   re_birth - people.person.date_of_birth
#   re_death - people.deceased_person.date_of_death
# Raw strings are used so that regex escapes such as \/ and \. reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes,
# which trigger a warning. The patterns match exactly what they matched before.
re_name = r'(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/type\.object\.name>\t\".*\"@en)'
re_alias = r'(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/common\.topic\.alias>\t\".*\"@en)'
re_birth = r'(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/people\.person\.date_of_birth>\t)'
re_death = r'(\/[gm]\..+\t<http:\/\/rdf\.freebase\.com\/ns\/people\.deceased_person\.date_of_death>\t)'

In [5]:
# Build the RDD of cleaned [subject, predicate, object, ...] lists:
#   1. keep only lines matching one of the name / alias / birth / death patterns,
#   2. drop exact duplicate lines,
#   3. strip the Freebase namespace prefix, the "^^...." and "@...." tails, angle
#      brackets and double quotes,
#   4. split what is left on tabs.
# The clean-up pattern is a raw string so its regex escapes are not parsed as
# (invalid) Python string escapes; it matches exactly what it matched before.
# NOTE(review): "(\@.*\.)" is greedy from the first "@" on the line, so a literal
# that itself contains "@" would presumably be truncated - confirm against the data.
people = (
    freebase
    .filter(lambda x: re.search(re_name, x) or re.search(re_alias, x)
            or re.search(re_birth, x) or re.search(re_death, x))
    .distinct()
    .map(lambda x: re.sub(r'(http\:\/\/rdf.freebase.com\/ns\/)|(\^\^.*\.)|(\@.*\.)|\<|\>|\"', "", x))
    .map(lambda x: x.split('\t'))
)

In [6]:
# people.take(20)

In [7]:
# Schema shared by all four triple DataFrames: three nullable string columns,
# with a "maxlength" metadata entry of 2048 on the object column.
triple_fields = [
    StructField('subject', StringType(), True),
    StructField('predicate', StringType(), True),
    StructField('object', StringType(), True, metadata={"maxlength": 2048}),
]
schema = StructType(triple_fields)

In [8]:
def _triples_with_predicate(predicate_name):
    """Return a DataFrame of the `people` rows whose predicate field contains `predicate_name`."""
    matching = people.filter(lambda x: predicate_name in x[1])
    return spark.createDataFrame(matching, schema)


# One DataFrame per predicate of interest, all sharing `schema`.
names = _triples_with_predicate("type.object.name")
aliases = _triples_with_predicate("common.topic.alias")
births = _triples_with_predicate("people.person.date_of_birth")
deaths = _triples_with_predicate("people.deceased_person.date_of_death")

In [None]:
# names.show()

In [9]:
# pomenovanie tabuliek
names.registerTempTable("names")
aliases.registerTempTable("aliases")
births.registerTempTable("births")
deaths.registerTempTable("deaths")

In [10]:
# SQL entry point built on the session's SparkContext; the query is run through sql_context.sql(...).
# NOTE(review): SQLContext looks like a legacy entry point - confirm whether spark.sql(...) should be used instead.
sql_context = SQLContext(spark.sparkContext)

In [11]:
# Join every name with the births, deaths and aliases of the same subject (left joins),
# keeping only subjects that have a birth or a death value.
#   - alias: '-' when the subject has no alias row
#   - birth: the birth cast to a date, otherwise the death date minus 100*365 days
#   - death: the death cast to a date, otherwise the birth date plus 100*365 days
# NOTE(review): 100*365 days ignores leap days, so the estimated date is a few
# weeks short of a full 100 years - confirm this approximation is intended.
sql = sql_context.sql("""
select names.object as name, 
ifnull(aliases.object, '-') as alias,
ifnull(cast(births.object as date), (cast(deaths.object as date) - 100*365) ) as birth,
ifnull(cast(deaths.object as date), (cast(births.object as date) + 100*365) ) as death
from names
left join births on names.subject = births.subject
left join deaths on names.subject = deaths.subject
left join aliases on names.subject = aliases.subject
where births.object is not null or deaths.object is not null
""")

In [None]:
# sql.show(20)

In [12]:
# Export settings.
# result_files: output directory for the export
#   (other values tried: "filteredPeople_1", "filteredPeople_10")
result_files = "filteredPeople_100"
# n: number of partitions the result is repartitioned into before writing
#   (other values tried: 10, 100)
n = 1000

In [None]:
# Repartition the query result into `n` partitions and save it under `result_files`
# in CSV format with a header row; the "marksuccessfuljobs" committer option is set to "false".
csv_writer = (
    sql.repartition(n)
    .write
    .format('com.databricks.spark.csv')
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
)
csv_writer.save(result_files, header='true')

In [None]:
# sql.coalesce(3).toJSON().saveAsTextFile('jsonfiles')

In [None]:
# sql.write.format("org.apache.spark.sql.json").mode("overwrite").options(header="true",sep="\t").json('outpath', lineSep=",")

In [None]:
from os import listdir
import os
from os.path import isfile, join

# Move the ".csv" files found in `result_files` into a sibling "<result_files>_csv"
# directory, shortening each name to its leading "<word>-<word>" part
# (for example "part-00000-<suffix>.csv" becomes "part-00000.csv").
readPath = result_files
writePath = result_files + '_csv'
# exist_ok=True lets the cell be re-run after the directory has been created.
os.makedirs(writePath, exist_ok=True)

# Leading "<word>-<word>" of a file name, optionally preceded by a dot.
reg = r'(\.[\w]+-[\w]+)|([\w]+-[\w]+)'

file_list = listdir(readPath)

for i in file_list:
    print(i)
    filename, file_extension = os.path.splitext(i)
    if file_extension != '.csv':
        # Only CSV files are moved; everything else stays where it is.
        continue
    match = re.match(reg, filename)
    if match is None:
        # The name does not follow the "<word>-<word>" pattern. The old code called
        # .group() on None here (and did so for non-CSV entries too), which raised
        # AttributeError and aborted the whole loop.
        continue
    result = match.group()
    os.rename(join(readPath, i), join(writePath, result + file_extension))

In [13]:
# Solr client pointed at the "freebase_people_100" core on localhost:8983; used for the searches below.
mysolr = pysolr.Solr('http://localhost:8983/solr/freebase_people_100/')

In [None]:
# mysolr.delete(q='*', commit=True)

In [None]:
# mysolr.add(file)

In [45]:
person_name_1 = 'Takahama'
person_name_2 = 'Louise Hoffsten'


def _name_or_alias_query(person_name):
    """Build a Solr query that looks for `person_name`, wrapped in `*` wildcards,
    in the `name` field or in the `alias` field."""
    return 'name:*' + person_name + '* OR alias:*' + person_name + '*'


query_1 = _name_or_alias_query(person_name_1)
# Fix: the alias clause of the second query used person_name_1 (copy-paste slip),
# so aliases were searched for the wrong person.
# NOTE(review): names containing whitespace are passed unquoted and unescaped;
# the Solr query parser may split them into separate clauses - confirm the intent.
query_2 = _name_or_alias_query(person_name_2)
result_1 = mysolr.search(query_1)
result_2 = mysolr.search(query_2)
list(result_1), list(result_2)

([{'name': ['Yūto Takahama'],
   'alias': ['-'],
   'birth': ['1996-08-08'],
   'death': ['2096-07-14'],
   'id': '967a33a1-3c35-45e2-9d3b-20dd7861b689',
   '_version_': 1684171459796140045}],
 [{'name': ['Louise Bénédicte de Bourbon, Duchess of Maine'],
   'alias': ['Louise Bénédicte de Bourbon'],
   'birth': ['1676-11-08'],
   'death': ['1753-01-23'],
   'id': 'b00b7df6-99b8-4211-b0cc-70d11e343ff7',
   '_version_': 1684171215111979009},
  {'name': ['Louise Hoffsten'],
   'alias': ['Hoffsten, Louise'],
   'birth': ['1965-09-06'],
   'death': ['2065-08-12'],
   'id': '77a8c9f9-7723-49fb-96b4-3720c4000021',
   '_version_': 1684171354258014209},
  {'name': ['Louise Ellman'],
   'alias': ['Louise Ellman MP'],
   'birth': ['1945-11-14'],
   'death': ['2045-10-20'],
   'id': '0e987a44-ad67-4edf-8b42-a39b27779d4a',
   '_version_': 1684171358099996679},
  {'name': ['Louise Wright'],
   'alias': ['-'],
   'birth': ['1846-01-01'],
   'death': ['1915-01-01'],
   'id': '2164df66-51af-4f93-b237-4f

In [49]:
def _first_values(doc):
    """Return (name, alias, birth, death) taken from the first entry of each field of `doc`.

    Name and alias are converted with str(); birth and death are returned as stored.
    """
    return str(doc['name'][0]), str(doc['alias'][0]), doc['birth'][0], doc['death'][0]


# Print every hit of both searches as "name, alias, birth, death".
# After each loop the *_1 / *_2 variables hold the values of the LAST hit of that
# result set; the comparison cell reads them from there.
for line in result_1:
    person_1, alias_1, dob_1, dod_1 = _first_values(line)
    print(", ".join((person_1, alias_1, dob_1, dod_1)))

for line in result_2:
    person_2, alias_2, dob_2, dod_2 = _first_values(line)
    print(", ".join((person_2, alias_2, dob_2, dod_2)))


Yūto Takahama, -, 1996-08-08, 2096-07-14
Louise Bénédicte de Bourbon, Duchess of Maine, Louise Bénédicte de Bourbon, 1676-11-08, 1753-01-23
Louise Hoffsten, Hoffsten, Louise, 1965-09-06, 2065-08-12
Louise Ellman, Louise Ellman MP, 1945-11-14, 2045-10-20
Louise Wright, -, 1846-01-01, 1915-01-01
Louise Troy, Louise Tory, 1933-11-09, 1994-05-05
Louise Henry, Jessouise Heiman, 1911-06-14, 2011-12-12
Louise Orth, The Biograph Blonde, 1893-03-22, 1993-02-26
Louise de Bourbon, -, 1603-02-02, 1637-01-01
Louise Henry, Jessie Louise Heiman, 1911-06-14, 2011-12-12
Louise Ellman, Louise Joyce Ellman, 1945-11-14, 2045-10-20


In [47]:
def getDifference(dob_1, dod_1, dob_2, dod_2):
    """Tell whether two lifetimes overlap.

    Args:
        dob_1, dod_1: birth and death of the first person.
        dob_2, dod_2: birth and death of the second person.

    Returns:
        True when the first person died on or after the second person's birth
        and was born on or before the second person's death; False otherwise.
        The values only need to be mutually comparable (the notebook passes
        date strings).
    """
    return bool(dob_2 <= dod_1 and dob_1 <= dod_2)

In [50]:
# Check whether the lifetimes of the two people held in person_1 / person_2 overlap
# and print the verdict (the messages are in Slovak).
meet = getDifference(dob_1, dod_1, dob_2, dod_2)
verdict = "sa mohli stretnut." if meet else "sa nemohli stretnut."
# Fix: "Osoby" lacked a trailing space, so the output read "OsobyYūto Takahama ...".
print("Osoby " + person_1 + " a " + person_2 + " " + verdict)

OsobyYūto Takahama a Louise Ellman sa mohli stretnut.
