In [None]:
import os
import sys
# Spark launch configuration, exported before pyspark is imported further down.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--driver-memory 4g --executor-memory 4g --num-executors 5 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend Spark's bundled Python sources to the import path; the py4j archive
# ends up first, followed by the pyspark package directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]


In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via getOrCreate(), under a recognisable
# application name.
spark = (
    SparkSession.builder
    .appName("Sergey Grishaev Link Prediction app")
    .getOrCreate()
)

In [None]:
# Bare expression as the last line of the cell: the notebook renders the
# SparkSession object as the cell output.
spark

In [None]:
!hadoop fs -mkdir /tmp/trainGraph/

In [None]:
!hadoop fs -put prediction.csv /tmp/

In [None]:
# HDFS directory the graph CSV is loaded from (created with `hadoop fs -mkdir` above).
graphPath = "/tmp/trainGraph/"
# HDFS location of prediction.csv (uploaded with `hadoop fs -put` above).
usersToPredictPath = "/tmp/prediction.csv"

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Two columns per input line: the integer user id and the raw friends string,
# which is parsed in a later cell.
schema = StructType([
    StructField("user", IntegerType()),
    StructField("friendsString", StringType()),
])

# Tab-delimited CSV read from graphPath, then spread over 15 partitions.
data = (
    spark.read.format("csv")
    .schema(schema)
    .option("delimiter", "\t")
    .load(graphPath)
    .repartition(15)
)

In [None]:
# Print the first five rows to sanity-check the parsed columns.
data.show(5)

In [None]:
# Partition count of the DataFrame's underlying RDD (the load cell repartitions to 15).
data.rdd.getNumPartitions()

In [None]:
from pyspark.sql.functions import col, explode, collect_list, sort_array, size, split, lit


In [None]:
from pyspark.sql.functions import pandas_udf

def cutStartEndBrackets(series):
    """Drop the first two and the last two characters of every string.

    Args:
        series: pandas Series of strings (presumably wrapped in a two-character
            opening and closing bracket pair — verify against the input data).

    Returns:
        pandas Series of the same length holding the trimmed strings; strings
        of four characters or fewer become empty.
    """
    return series.str.slice(2, -2)

# Vectorised (pandas) UDF wrapper around the bracket-trimming helper.
cutStartEndBracketsUDF = pandas_udf(cutStartEndBrackets, StringType())

# Turn each (user, friendsString) row into one (user, friend) row per friend:
#   1. trim the outer two characters on each side and split on "),(",
#   2. explode the resulting array into one row per element,
#   3. keep the text before the first comma of each element as the friend id.
userFriend = (
    data
    .select(
        col("user"),
        # Raw string: `\)` and `\(` are regex escapes for Spark's split(), not
        # Python string escapes (which would be invalid escape sequences).
        split(cutStartEndBracketsUDF(col("friendsString")), r"\),\(").alias("friendsMasks"),
    )
    .withColumn("friendMask", explode("friendsMasks"))
    .withColumn("friend", split(col("friendMask"), ",")[0])
    .select(col("user").cast("integer"), col("friend").cast("integer"))
    .cache()
)

<img src="pics/step1.jpg" width="700"/>

In [None]:
# Number of (user, friend) rows. userFriend was marked with .cache(), so this
# first action is presumably what materialises it — confirm in the Spark UI.
userFriend.count()

In [None]:
# Print a sample of (user, friend) rows, using show()'s default row limit.
userFriend.show()

In [None]:
# For every friend, gather the users that list that friend, keep only the
# groups with at least two users, and sort each group so that pairs built from
# it later always come out in a consistent (smaller, larger) order.
usersWithCommonFriend = (
    userFriend
    .groupBy("friend")
    .agg(collect_list("user").alias("usersWithCommonFriend"))
    .where(size(col("usersWithCommonFriend")) >= 2)
    # This select is the only projection needed: it leaves just the sorted
    # array, so no separate select/drop of the unsorted column is required.
    .select(sort_array("usersWithCommonFriend").alias("sortedUsersWithCommonFriend"))
    .cache()
)

<img src="pics/step2.jpg" width="700"/>


In [None]:
# Number of friends that are shared by at least two users (one row per such friend).
usersWithCommonFriend.count()

In [None]:
# Print ten of the sorted user groups without truncating the array column.
usersWithCommonFriend.show(10, truncate=False)

In [None]:
# prediction.csv is read as a single integer column named "user".
schema_users_to_pred = StructType([StructField("user", IntegerType())])

# Pull the user ids to the driver as a plain Python list.
usersToPredict = (
    spark.read.format("csv")
    .schema(schema_users_to_pred)
    .load(usersToPredictPath)
    .select(col("user").cast("integer"))
    .rdd.map(lambda row: row.user)
    .collect()
)

# Broadcast the ids as a set so the pair-building UDFs can do membership checks.
usersToPredictBC = spark.sparkContext.broadcast(set(usersToPredict))

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType

def pairsWithCommonFriend(usersWithCommonFriend):
    """Build the user pairs that share a friend and involve a user to predict.

    Args:
        usersWithCommonFriend: sequence of user ids that all list the same friend.

    Returns:
        List of ``(user1, user2)`` tuples, one for every pair of positions
        ``i < j`` in the input where at least one of the two users is in
        ``usersToPredictBC.value``. Pairs keep the input's positional order.
    """
    # Local import keeps the function self-contained when it is shipped as a UDF.
    from itertools import combinations

    # Resolve the broadcast set once instead of once per candidate pair.
    targets = usersToPredictBC.value
    return [
        (first, second)
        for first, second in combinations(usersWithCommonFriend, 2)
        if first in targets or second in targets
    ]

# Return type for the pair UDFs: an array of integer arrays.
# NOTE: this rebinds `schema`, which earlier held the StructType used to load `data`.
schema = ArrayType(ArrayType(IntegerType()))

# Row-at-a-time UDF variant.
# NOTE(review): the next cell rebinds pairsWithCommonFriendUdf to a pandas_udf
# before this one is applied to any DataFrame — confirm it is intentionally unused.
pairsWithCommonFriendUdf = udf(pairsWithCommonFriend, schema)

<img src="pics/step4_2.jpg" width="700"/>

In [None]:
from pyspark.sql.functions import pandas_udf
import pandas as pd

def pairsWithCommonFriend(series):
    """Batch version: build the user pairs to predict for every group in ``series``.

    Args:
        series: pandas Series whose elements are sequences of user ids that
            share one friend.

    Returns:
        pandas Series of the same length. Each element is a list of
        ``(user1, user2)`` tuples covering every pair of positions ``i < j``
        in the corresponding group where at least one of the two users is in
        ``usersToPredictBC.value``.
    """
    # Local import keeps the function self-contained when it is shipped as a UDF.
    from itertools import combinations

    # Resolve the broadcast set once for the whole batch.
    targets = usersToPredictBC.value
    pairs_lists = [
        [
            (first, second)
            for first, second in combinations(usersWithCommonFriend, 2)
            if first in targets or second in targets
        ]
        for usersWithCommonFriend in series
    ]
    return pd.Series(pairs_lists)
        
# Register the batch (pandas) implementation with the nested-array return type.
pairsWithCommonFriendUdf = pandas_udf(pairsWithCommonFriend, schema)

# One row per shared friend holding that friend's list of user pairs; rows
# whose list is empty are dropped before repartitioning and caching.
commonFriendsCounts = (
    usersWithCommonFriend
    .select(pairsWithCommonFriendUdf("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))
    .where(size(col("pairsWithCommonFriend")) > 0)
    .repartition(24)
    .cache()
)

# Flatten the lists to one row per pair and count how often each pair occurs.
(
    commonFriendsCounts
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))
    .drop(col("pairsWithCommonFriend"))
    .groupBy(col("pairWithCommonFriend"))
    .count()
    .show(20)
)

In [None]:
def pairsWithCommonFriend(series):
    """Batch version without filtering: build every user pair for each group.

    Args:
        series: pandas Series whose elements are sequences of user ids that
            share one friend.

    Returns:
        pandas Series of the same length. Each element is the list of all
        ``(user1, user2)`` tuples for positions ``i < j`` of the corresponding
        group, in positional order.
    """
    # Local import keeps the function self-contained when it is shipped as a UDF.
    from itertools import combinations

    return pd.Series([list(combinations(users, 2)) for users in series])
         
# Return type of the pandas UDF: an array of integer arrays. Defined here
# because no earlier cell binds `schema_pandas`, so using it undefined would
# raise NameError on a fresh run.
schema_pandas = ArrayType(ArrayType(IntegerType()))

pairsWithCommonFriendUdf = pandas_udf(pairsWithCommonFriend, schema_pandas)

# One row per shared friend holding all user pairs for that friend; rows with
# an empty list are dropped.
commonFriendsCounts = (
    usersWithCommonFriend
    .select(pairsWithCommonFriendUdf("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))
    .where(size(col("pairsWithCommonFriend")) > 0)
)

# Flatten the lists to one row per pair and count how often each pair occurs.
(
    commonFriendsCounts
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))
    .drop(col("pairsWithCommonFriend"))
    .groupBy(col("pairWithCommonFriend"))
    .count()
    .show(20)
)

In [None]:
from functools import partial

def pairsWithCommonFriendUpgraded(series, modulo, numBuckets=13):
    """Build one bucket's share of the user pairs for every group in ``series``.

    The pairs of a group are split into ``numBuckets`` buckets by the position
    of the pair's first user: a pair ``(i, j)`` with ``i < j`` belongs to the
    bucket ``i % numBuckets``. Calling this once per bucket index therefore
    yields every pair exactly once across all calls.

    Args:
        series: pandas Series whose elements are sequences of user ids that
            share one friend.
        modulo: index of the bucket to emit; only first positions with
            ``position % numBuckets == modulo`` contribute pairs.
        numBuckets: total number of buckets. Defaults to 13, the value that
            was previously hard-coded.

    Returns:
        pandas Series of the same length; each element is the list of
        ``(user1, user2)`` tuples belonging to the requested bucket.
    """
    pairs_lists = []

    for usersWithCommonFriend in series:
        userCount = len(usersWithCommonFriend)
        pairs = []
        for user1Index in range(userCount):
            # The bucket test depends only on the first index, so decide it
            # here and skip the whole inner loop for other buckets.
            if user1Index % numBuckets != modulo:
                continue
            firstUser = usersWithCommonFriend[user1Index]
            for user2Index in range(user1Index + 1, userCount):
                pairs.append((firstUser, usersWithCommonFriend[user2Index]))
        pairs_lists.append(pairs)
    return pd.Series(pairs_lists)


# Generate the pairs in 13 passes, one per bucket, and persist each bucket to
# its own parquet directory. The bucket count must match the modulus used by
# pairsWithCommonFriendUpgraded (13).
for i in range(13):
    pairsWithCommonFriendUdfUpgraded = pandas_udf(partial(pairsWithCommonFriendUpgraded, modulo=i), schema)

    # DataFrameWriter.parquet() returns None, so the result is deliberately not
    # assigned; binding it to `commonFriendsCounts` would replace that
    # DataFrame with None.
    (
        usersWithCommonFriend
        .select(pairsWithCommonFriendUdfUpgraded("sortedUsersWithCommonFriend").alias("pairsWithCommonFriend"))
        .where(size(col("pairsWithCommonFriend")) > 0)
        .write.parquet("pairs/" + str(i), mode="overwrite")
    )

In [None]:
# Total number of rows across all bucket directories under pairs/.
spark.read.parquet("pairs/*").count()

In [None]:
# Row count for bucket 0 alone, for comparison with the total above.
spark.read.parquet("pairs/0").count()

In [None]:
# Re-load every bucket, flatten the stored lists to one row per pair, and
# count how many times each pair occurs across all buckets.
(
    spark.read.parquet("pairs/*")
    .withColumn("pairWithCommonFriend", explode("pairsWithCommonFriend"))
    .drop(col("pairsWithCommonFriend"))
    .groupBy(col("pairWithCommonFriend"))
    .count()
    .show(5)
)

In [None]:
# Shut down the SparkSession created near the top of the notebook.
spark.stop()