In [None]:
import datetime
import os

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

# Start spark session
# Builds (or reuses) the SparkSession used by every later cell and prints how
# long the start-up took.
startTime = datetime.datetime.now()

# Neo4j credentials are taken from the environment when set so they do not have
# to live in the notebook; the fallbacks are the previously hard-coded values.
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")

spark = (
    SparkSession.builder
    .appName("Anime importer")
    .master("spark://spark-master:7077")
    .config("spark.driver.memory", "15g")
    # Neo4j connector jar, made available to the session via spark.jars
    .config("spark.jars", "/extra_jars/neo4j-connector-apache-spark_2.12-4.0.1_for_spark_3.jar")
    # Session-level neo4j.* settings: connection URL and basic authentication
    .config("neo4j.url", "bolt://neo4j:7687")
    .config("neo4j.authentication.type", "basic")
    .config("neo4j.authentication.basic.username", neo4j_username)
    .config("neo4j.authentication.basic.password", neo4j_password)
    .getOrCreate()
)

print(datetime.datetime.now()-startTime)

In [None]:
# Load the cleaned users CSV into a cached DataFrame, then report the row
# count, the inferred schema and one sample row, followed by the elapsed time.
startTime = datetime.datetime.now()

data_file = '/import/anime/users_cleaned.csv'

users = (
    spark.read
    .csv(
        data_file,
        header=True,
        sep=",",
        inferSchema=True,
    )
    .cache()
)
user_total = users.count()
print('Csv users = {}'.format(user_total))
users.printSchema()
users.show(1, vertical=True)

elapsed = datetime.datetime.now() - startTime
print(elapsed)

# Save the users DataFrame to Neo4j as :User nodes keyed on `username`,
# then print how long the write took.
startTime = datetime.datetime.now()

(
    users.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("labels", ":User")
    .option("node.keys", "username")
    .option("schema.optimization.type", "INDEX")
    .save()
)

elapsed = datetime.datetime.now() - startTime
print(elapsed)

In [None]:
# Load the cleaned anime CSV into a cached DataFrame, then report the row
# count, the inferred schema and one sample row, followed by the elapsed time.
startTime = datetime.datetime.now()

data_file = '/import/anime/anime_cleaned.csv'

animes = (
    spark.read
    .csv(
        data_file,
        header=True,
        sep=",",
        inferSchema=True,
    )
    .cache()
)
anime_total = animes.count()
print('Total Records = {}'.format(anime_total))
animes.printSchema()
animes.show(1, vertical=True)

elapsed = datetime.datetime.now() - startTime
print(elapsed)

# Save the animes DataFrame to Neo4j as :Anime nodes keyed on `anime_id`,
# then print how long the write took.
startTime = datetime.datetime.now()

(
    animes.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("labels", ":Anime")
    .option("node.keys", "anime_id")
    .option("schema.optimization.type", "INDEX")
    .save()
)

elapsed = datetime.datetime.now() - startTime
print(elapsed)

In [None]:
# get schema
# Infer the column types once from the sample file; the resulting schema is
# passed explicitly when the full animelists file is read below.
sample_file = '/import/anime/animelists_cleaned_sample.csv' 

# Read the sample file itself (not whatever `data_file` an earlier cell left behind).
sample = spark.read.csv(sample_file, header=True, sep=",", inferSchema=True, timestampFormat="yyyy-MM-dd HH:mm:ss").cache() 
sample.printSchema()
# Take the schema from the DataFrame that was just read.
sampleSchema = sample.schema

# Load the full animelists CSV with the schema taken from the sample,
# dropping malformed rows; show two rows, then print the row count and the
# elapsed time.
startTime = datetime.datetime.now()

data_file = '/import/anime/animelists_cleaned.csv'

relationships = (
    spark.read
    .csv(
        data_file,
        header=True,
        sep=",",
        mode="DROPMALFORMED",
        schema=sampleSchema,
        timestampFormat="yyyy-MM-dd HH:mm:ss",
    )
    .cache()
)
relationships.show(2)

records_read = relationships.count()
print('Records read = {}'.format(records_read))
elapsed = datetime.datetime.now() - startTime
print(elapsed)

def count_watched_relationships():
    """Return the number of WATCHED relationships currently stored in Neo4j.

    The count is computed by Neo4j itself, so only a single row is shipped
    back to Spark.
    """
    rows = (
        spark.read.format("org.neo4j.spark.DataSource")
        .option("url", "bolt://neo4j:7687")
        .option("query", "MATCH ()-[r:WATCHED]->() RETURN count(r) AS rel_count")
        .load()
        .collect()
    )
    return rows[0]["rel_count"]


# The write below only matches existing :User / :Anime nodes and creates
# WATCHED relationships between them, so relationships (not nodes) are what
# must be counted before and after to measure the insert.
previousDbSize = count_watched_relationships()

# write relationships to neo4j
startTime = datetime.datetime.now()

(
    relationships.write.format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("batch.size", 256)
    # Columns stored as properties on each WATCHED relationship
    .option("relationship.properties", "my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags")
    .option("relationship", "WATCHED")
    .option("relationship.save.strategy", "keys")
    # Source side: :User identified by `username`
    .option("relationship.source.save.mode", "Match")
    .option("relationship.source.labels", ":User")
    .option("relationship.source.node.keys", "username")
    # Target side: :Anime identified by `anime_id`
    .option("relationship.target.save.mode", "Match")
    .option("relationship.target.labels", ":Anime")
    .option("relationship.target.node.keys", "anime_id")
    .save()
)

currentDbSize = count_watched_relationships()

# Newly created relationships = size after the write minus size before it.
insertCount = currentDbSize - previousDbSize
print('Inserted Records = {}'.format(insertCount))
print(datetime.datetime.now()-startTime)