# RulER: Scaling Up Record-level Matching Rules

# Preparing data for RulER 

In [1]:
import RulER.Commons.implicits
import RulER.Commons.implicits._
import RulER.DataStructure.Rule
import RulER.DataStructure.ThresholdTypes.ED
import RulER.DataStructure.ThresholdTypes.JS
import RulER.Commons.CommonFunctions.loadProfilesAsDF
import RulER.SimJoins.EDJoin.EDJoin
import RulER.SimJoins.PPJoin.PPJoin
import java.util.Calendar

In [2]:
// Load the movie profiles stored in "imdb.csv" as a DataFrame.
// The schema echoed below contains a `_rowId` column; later cells join on it
// to attach full records to the matched id pairs.
val imdb = loadProfilesAsDF("imdb.csv")

imdb = [_rowId: bigint, _c0: string ... 12 more fields]


[_rowId: bigint, _c0: string ... 12 more fields]

In [3]:
%%dataframe --limit 1
imdb

_rowId,_c0,imdbid,title,year,genres,director,writer,cast,runtime,country,language,rating,plot
0,0,2399574,# (2012),2012,"Short, Comedy",Zak Longo,Eddie Klein,"Bianca Siavoshy (Allison), Rhoda Pell (Sarah), Eugene Kim (Adam), Maronzio Vance (Buster), Eddie Klein (Dave)",15,USA,English,,"After too many beers, Adam leaves the worst voice-mail in the history of voice-mails for the girl of his dreams"


# Defining a rule
First, we define a complex rule, built from several predicates combined with AND/OR, to find the matching records.

In [4]:
// Atomic predicates, written as Rule(attribute, threshold type, threshold).
// JS and ED are the threshold types imported from ThresholdTypes
// (presumably Jaccard similarity and edit distance -- confirm in the RulER docs).
val r1 = Rule("title", JS, 0.8)
val r2 = Rule("title", ED, 3)
val r3 = Rule("director", JS, 0.7)
val r4 = Rule("cast", JS, 0.7)
val r5 = Rule("country", ED, 2)
val r6 = Rule("plot", JS, 0.8)

// Matching rule as an OR of three conjunctions, one conjunction per line.
val rule =
  (r1 and r3) or
  (r2 and r4) or
  (r5 and r6)

rule = (title,JS,0.8 AND director,JS,0.7) OR (title,ED,3.0 AND cast,JS,0.7) OR (country,ED,2.0 AND plot,JS,0.8)


r1: RulER.DataStructure.Rule = title,JS,0.8
r2: RulER.DataStructure.Rule = title,ED,3.0
r3: RulER.DataStructure.Rule = director,JS,0.7
r4: RulER.DataStructure.Rule = cast,JS,0.7
r5: RulER.DataStructure.Rule = country,ED,2.0
r6: RulER.DataStructure.Rule = plot,JS,0.8


(title,JS,0.8 AND director,JS,0.7) OR (title,ED,3.0 AND cast,JS,0.7) OR (country,ED,2.0 AND plot,JS,0.8)

# Running the rule by using existing algorithms
Using existing algorithms (e.g., PPJoin, EDJoin), the rule can be executed by running one similarity join per predicate and then combining the results: an intersection for each AND and a union for each OR.

In [5]:
// Wall-clock start of the baseline run, in epoch milliseconds.
val tStart = System.currentTimeMillis()

// Baseline: run one similarity join per predicate (PPJoin for the JS
// predicates, EDJoin for the ED ones). Each AND of the rule becomes the
// intersection of the two sets of (id1, id2) pairs.
val and1 = PPJoin(imdb, r1) intersect PPJoin(imdb, r3)
val and2 = EDJoin(imdb, r2) intersect PPJoin(imdb, r4)
val and3 = EDJoin(imdb, r5) intersect PPJoin(imdb, r6)

// Each OR becomes a union; distinct() keeps a pair once even when several
// conjunctions produced it.
val res = (and1 union and2 union and3).distinct()

// Attach the full records to every pair: first the record whose `_rowId`
// equals `id1`, then the record whose `_rowId` equals `id2`.
// NOTE(review): `imdb("_rowId")` in the second join also exists inside `tmp`
// (self-join); some Spark versions may report this reference as ambiguous --
// confirm, and alias the two sides if needed.
val tmp = imdb.join(res, imdb("_rowId") === res("id1"))
val results = imdb.join(tmp, tmp("id2") === imdb("_rowId"))

// Mark the result for caching and run count() so the joins are actually
// executed before the end time is taken.
results.cache()
results.count()

val tEnd = System.currentTimeMillis()
println(s"Execution time (s) ${(tEnd - tStart) / 1000.0}")



tEnd = 1585158892902


tStart: Long = 1585158812065
and1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id1: bigint, id2: bigint]
and2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id1: bigint, id2: bigint]
and3: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id1: bigint, id2: bigint]
res: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id1: bigint, id2: bigint]
tmp: org.apache.spark.sql.DataFrame = [_rowId: bigint, _c0: string ... 14 more fields]
results: org.apache.spark.sql.DataFrame = [_rowId: bigint, _c0: string ... 28 more fields]


1585158892902

In [6]:
%%dataframe --limit 1
results

_rowId,_c0,imdbid,title,year,genres,director,writer,cast,runtime,country,language,rating,plot,_rowId.1,_c0.1,imdbid.1,title.1,year.1,genres.1,director.1,writer.1,cast.1,runtime.1,country.1,language.1,rating.1,plot.1,id1,id2
270,270,3520062,"""3 Minute Update, The"" February 7, 2014 (2014)",2014,News,,,Kalena Liane (Celebrity Host),,,,,"SPORTS Jay-Z: From Hip Hop Icon to Sports Agent! First, he was a Hip Hop Icon, then he took over the fashion world, now the Icon known as Jay-Z is looking to take over the sports world. Here's how Jay-Z went from Hip Hop Icon to Sports Agent. TECHNOLOGY The Amazing Amazon: Amazon keeps growing to the point where they dictate market growth, technology and even the US Postal Service's schedule",144,144,3486534,"""3 Minute Update, The"" January 23, 2014 (2014)",2014,News,,,Kalena Liane (Celebrity Host),,,,,"Justin Bieber's Bad Boy Recap. Catch up on the roller-coaster ride that led to Justin Bieber's arrest. 3 Minute Update recaps Justin's Bad Boy moments. Strippers, Drugs, and the Biebs...Oh...My... SPORTS The Road to Super Bowl XLVIII.The Road to Super Bowl XLVIII has been riddled with injuries and upsets. Lets take a look back at the NFL stories that built the road to Super Bowl XLVIII. This video includes: Peyton Manning's on going ankle injuries, The Dallas Cowboy's epic collapse, Seattle Seahawks wide receiver woes, Adrian Foster's failed season, and Aaron Rodger's season ending injury. Top Fashion Icons never sleep, so while you were nodding off, here's an update on the movers and shakers in the fashion world. This update Includes: Chanel's Karl Lagerfeld's legal issues, John Galliano's comeback, New Jackie Kennedy letters, Victoria and David Beckham's new fashion lines, and Giorgio Armani's new real estate venture",144,270


# Running the rule by using RulER

In [7]:
// Evaluate the same rule with RulER in a single call and time it.
// `joinWithRules` is presumably added to DataFrame by the
// `RulER.Commons.implicits._` import -- confirm.
val tStart = System.currentTimeMillis()
val results = imdb.joinWithRules(imdb, rule)

// count() runs the join so the measured interval covers the computation.
results.count()

val tEnd = System.currentTimeMillis()
println(s"Execution time (s) ${(tEnd - tStart) / 1000.0}")

Execution time (s) 13.28                                                        


tEnd = 1585158961803


tStart: Long = 1585158948523
results: org.apache.spark.sql.DataFrame = [_rowId: bigint, _c0: string ... 28 more fields]


1585158961803

In [8]:
%%dataframe --limit 1
results

_rowId,_c0,imdbid,title,year,genres,director,writer,cast,runtime,country,language,rating,plot,_rowId.1,_c0.1,imdbid.1,title.1,year.1,genres.1,director.1,writer.1,cast.1,runtime.1,country.1,language.1,rating.1,plot.1,id1,id2
844,844,4184766,#seguiilconiglio (I) (2014),2014,"Short, Horror",Luigi Pietrobono,Luigi Pietrobono,Roberto Luigi Mauri (Kid),9::(original release),Italy,"Italian, English",,"A journey into the psyche in search of happiness, where fact and fiction are at the same level as hidden as Chinese boxes",843,843,4184842,#seguiilconiglio (II) (2014),2014,"Short, Horror",Luigi Pietrobono,Luigi Pietrobono,Roberto Luigi Mauri (Kid),9::(original release),Italy,"Italian, English",,"A journey into the psyche in search of happiness, where fact and fiction are at the same level as hidden as Chinese boxes",843,844


# Example: joining multiple datasets

In [9]:
// Load the first of the two movie tables to be matched ("roger_ebert.csv") as a DataFrame.
val roger_ebert = loadProfilesAsDF("roger_ebert.csv")

roger_ebert = [_rowId: bigint, id: string ... 7 more fields]


[_rowId: bigint, id: string ... 7 more fields]

In [10]:
// Load the second movie table ("rotten_tomatoes.csv") as a DataFrame.
// Its column names differ from the first table's (e.g. `Name` vs `movie_name`).
val rotten_tomatoes = loadProfilesAsDF("rotten_tomatoes.csv")

rotten_tomatoes = [_rowId: bigint, Id: string ... 16 more fields]


[_rowId: bigint, Id: string ... 16 more fields]

In [11]:
%%dataframe --limit 1
roger_ebert

_rowId,id,movie_name,year,directors,actors,movie_rating,genre,duration
0,0,High-Rise,2015,Ben Wheatley,"Tom Hiddleston, Jeremy Irons, Sienna Miller",6.8,"Action, Drama, Sci-Fi",112 min


In [12]:
%%dataframe --limit 1
rotten_tomatoes

_rowId,Id,Name,Year,Release Date,Director,Creator,Actors,Cast,Language,Country,Duration,RatingValue,RatingCount,ReviewCount,Genre,Filming Locations,Description
0,tt0054215,Psycho,1960,8 September 1960 (USA),Alfred Hitchcock,"Joseph Stefano,Robert Bloch","Anthony Perkins,Janet Leigh,Vera Miles","Anthony Perkins,Vera Miles,John Gavin,Janet Leigh,Martin Balsam,John McIntire,Simon Oakland,Frank Albertson,Patricia Hitchcock,Vaughn Taylor,Lurene Tuttle,John Anderson,Mort Mills",English,USA,109 min,8.6,379998,"976 user,290 critic","Horror,Mystery,Thriller","Title and Trust Building, 114 West Adams Street, downtown Phoenix, Arizona, USA","A Phoenix secretary steals $40,000 from her employer's client, goes on the run and checks into a remote motel run by a young man under the domination of his mother."


# Defining the rule

In [13]:
// Predicates across two datasets with different column names:
// the first argument is a roger_ebert column and the fourth argument is the
// rotten_tomatoes column it is compared against (per the schemas shown above).
// These vals redefine r1..r3 and `rule` from the single-dataset example.
val r1 = Rule("movie_name", JS, 0.8, "Name")
val r2 = Rule("actors", JS, 0.5, "Actors")
val r3 = Rule("directors", ED, 2, "Director")

// Rule as an OR of two conjunctions that share the movie-name predicate.
val rule =
  (r1 and r2) or
  (r1 and r3)

rule = (movie_name,JS,0.8 AND actors,JS,0.5) OR (movie_name,JS,0.8 AND directors,ED,2.0)


r1: RulER.DataStructure.Rule = movie_name,JS,0.8
r2: RulER.DataStructure.Rule = actors,JS,0.5
r3: RulER.DataStructure.Rule = directors,ED,2.0


(movie_name,JS,0.8 AND actors,JS,0.5) OR (movie_name,JS,0.8 AND directors,ED,2.0)

# Join the datasets by using the rule

In [15]:
// Match roger_ebert against rotten_tomatoes using the rule defined above.
// In the sample row displayed below, `id1` equals the roger_ebert `_rowId`
// and `id2` equals the rotten_tomatoes `_rowId`.
val matches = roger_ebert.joinWithRules(rotten_tomatoes, rule)



matches = [_rowId: bigint, Id: string ... 27 more fields]


[_rowId: bigint, Id: string ... 27 more fields]

In [16]:
%%dataframe --limit 1
matches

_rowId,Id,Name,Year,Release Date,Director,Creator,Actors,Cast,Language,Country,Duration,RatingValue,RatingCount,ReviewCount,Genre,Filming Locations,Description,_rowId.1,id,movie_name,year,directors,actors,movie_rating,genre,duration,id1,id2
26,tt0120611,Blade,1998,21 August 1998 (USA),Stephen Norrington,David S. Goyer,"Wesley Snipes,Stephen Dorff,Kris Kristofferson","Wesley Snipes,Stephen Dorff,Kris Kristofferson,N'Bushe Wright,Donal Logue,Udo Kier,Arly Jover,Traci Lords,Kevin Patrick Walls,Tim Guinee,Sanaa Lathan,Eric Edwards,Donna Wong,Carmen Thomas,Shannon Lee","English,Russian",USA,120 min,7.1,176103,"572 user,179 critic","Action,Horror","Long Beach, California, USA","A half-vampire, half-mortal man becomes a protector of the mortal race, while slaying evil vampires.",3415,3415,Blade,1998,Stephen Norrington,"Wesley Snipes, Stephen Dorff, Kris Kristofferson",7.1,"Action, Horror",120 min,3415,26
