In [1]:
import os
import datetime
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, length, when, col, desc
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField
import pyspark.sql.functions as F
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader

In [2]:
# setting constants
APP_NAME = "VVORONIN-SPARK-APP"
NORMALIZED_APP_NAME = APP_NAME.replace('/', '_').replace(':', '_')

APPS_TMP_DIR = os.path.join(os.getcwd(), "tmp")
APPS_CONF_DIR = os.path.join(os.getcwd(), "conf")
APPS_LOGS_DIR = os.path.join(os.getcwd(), "logs")
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, "pyspark-log4j-{}.properties".format(NORMALIZED_APP_NAME))
LOG_FILE = os.path.join(APPS_LOGS_DIR, 'pyspark-{}.log'.format(NORMALIZED_APP_NAME))
EXTRA_JAVA_OPTIONS = "-Dlog4j.configuration=file://{} -Dspark.hadoop.dfs.replication=1 -Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3"\
    .format(LOG4J_PROP_FILE)

LOCAL_IP = socket.gethostbyname(socket.gethostname())

In [3]:
# preparing configuration files from templates
for directory in [APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR]:
    if not os.path.exists(directory):
        os.makedirs(directory)

env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template\
    .stream(logfile=LOG_FILE)\
    .dump(LOG4J_PROP_FILE)

# Run spark

In [4]:
# spark = SparkSession\
#     .builder\
#     .appName(APP_NAME)\
#     .master("k8s://https://10.32.7.103:6443")\
#     .config("spark.driver.host", LOCAL_IP)\
#     .config("spark.driver.bindAddress", "0.0.0.0")\
#     .config("spark.executor.instances", "3")\
#     .config("spark.executor.cores", '2')\
#     .config("spark.memory.fraction", "0.8")\
#     .config("spark.memory.storageFraction", "0.6")\
#     .config("spark.executor.memory", "4g")\
#     .config("spark.driver.extraJavaOptions", EXTRA_JAVA_OPTIONS)\
#     .config("spark.kubernetes.namespace", "{{user}}")\
#     .config("spark.kubernetes.driver.label.appname", APP_NAME)\
#     .config("spark.kubernetes.executor.label.appname", APP_NAME)\
#     .config("spark.kubernetes.container.image.pullPolicy", "Always")\
#     .config("spark.kubernetes.container.image", "node03.st:5000/spark-executor:{{user}}")\
#     .config("spark.local.dir", "/tmp/spark")\
#     .getOrCreate()

SPARK_ADDRESS = "local[4]"

spark = SparkSession\
    .builder\
    .appName("VVORONIN-SPARK-APP")\
    .master(SPARK_ADDRESS)\
    .config("spark.ui.port", "4040")\
    .config("spark.memory.fraction", "0.8")\
    .config("spark.memory.storageFraction", "0.6")\
    .config("spark.driver.memory", "4g")\
    .config("spark.driver.extraJavaOptions", EXTRA_JAVA_OPTIONS)\
    .config("spark.executor.memory", "6g")\
    .getOrCreate()

In [5]:
# printing important urls and pathes
print("Web UI: {}".format(spark.sparkContext.uiWebUrl))
print("\nlog4j file: {}".format(LOG4J_PROP_FILE))
print("\ndriver log file: {}".format(LOG_FILE))

Web UI: http://jupyter-spark-75c86b84fb-grxm2:4040

log4j file: /home/jovyan/work/conf/pyspark-log4j-VVORONIN-SPARK-APP.properties

driver log file: /home/jovyan/work/logs/pyspark-VVORONIN-SPARK-APP.log


# Read data

In [6]:
#there were created five tasks in apache spark webUI
posts_df = spark.read.json("file:///home/jovyan/shared-data/bigdata20/posts_api.json")
posts_likes_df = spark.read.parquet("file:///home/jovyan/shared-data/bigdata20/posts_likes.parquet")
followers_df = spark.read.parquet("file:///home/jovyan/shared-data/bigdata20/followers.parquet")
followers_posts_df = spark.read.json("file:///home/jovyan/shared-data/bigdata20/followers_posts_api_final.json")
followers_posts_likes_df = spark.read.parquet("file:///home/jovyan/shared-data/bigdata20/followers_posts_likes.parquet")

In [7]:
followers_df.head(5)

[Row(profile=-94, follower=34),
 Row(profile=-94, follower=87),
 Row(profile=-94, follower=102),
 Row(profile=-94, follower=175),
 Row(profile=-94, follower=533)]

In [8]:
followers_posts_likes_df.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [9]:
posts_likes_df.printSchema()
#itemId = PostID - уникальный номер поста
#ownerId - id группы, где пост размещен
#likerId - ID того, кто создал пост

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



# Task1
#### Find the top 20 posts in the group: 
* by likes; 
* by comments; 
* by reposts

### топ-20 постов по числу лайков 

In [10]:
likes = posts_df.select('id', col('likes.count').name('likes'))\
    .orderBy("likes", ascending = False)\
    .limit(20)

In [11]:
likes.collect()

[Row(id=32022, likes=1637),
 Row(id=35068, likes=1629),
 Row(id=17492, likes=1516),
 Row(id=18526, likes=1026),
 Row(id=19552, likes=955),
 Row(id=41468, likes=952),
 Row(id=19419, likes=868),
 Row(id=29046, likes=824),
 Row(id=32546, likes=786),
 Row(id=24085, likes=765),
 Row(id=40180, likes=759),
 Row(id=33658, likes=708),
 Row(id=13532, likes=633),
 Row(id=40842, likes=631),
 Row(id=35117, likes=588),
 Row(id=17014, likes=581),
 Row(id=19583, likes=553),
 Row(id=19809, likes=552),
 Row(id=27455, likes=550),
 Row(id=11999, likes=549)]

альтернативный вариант через группировку данных

In [12]:
pl_df = posts_likes_df.groupby("itemId")\
    .agg(F.count('likerId').name("likes_count"))\
    .withColumnRenamed("itemId", "post_id")\
    .orderBy("likes_count", ascending = False)\
    .limit(20)

In [13]:
#поскольку реализована схема "ленивых" вычислений для отображения результатов требуется отдельной командой вычислить функцию
pl_df.collect()

[Row(post_id=32022, likes_count=1654),
 Row(post_id=35068, likes_count=1630),
 Row(post_id=17492, likes_count=1538),
 Row(post_id=18526, likes_count=1028),
 Row(post_id=19552, likes_count=955),
 Row(post_id=41468, likes_count=952),
 Row(post_id=19419, likes_count=868),
 Row(post_id=29046, likes_count=824),
 Row(post_id=32546, likes_count=786),
 Row(post_id=24085, likes_count=765),
 Row(post_id=40180, likes_count=759),
 Row(post_id=33658, likes_count=708),
 Row(post_id=13532, likes_count=633),
 Row(post_id=40842, likes_count=631),
 Row(post_id=35117, likes_count=588),
 Row(post_id=17014, likes_count=581),
 Row(post_id=19583, likes_count=553),
 Row(post_id=19809, likes_count=552),
 Row(post_id=27455, likes_count=550),
 Row(post_id=11999, likes_count=549)]

### топ-20 постов по числу комментов

In [14]:
pс_df = posts_df.groupby("id")\
    .agg(F.count('comments').name("comments_count"))\
    .withColumnRenamed("id", "post_id")\
    .orderBy("comments_count", ascending = False)\
    .limit(20)

In [15]:
pс_df.collect()

[Row(post_id=13509, comments_count=1),
 Row(post_id=1642, comments_count=1),
 Row(post_id=17201, comments_count=1),
 Row(post_id=18957, comments_count=1),
 Row(post_id=34453, comments_count=1),
 Row(post_id=15894, comments_count=1),
 Row(post_id=34304, comments_count=1),
 Row(post_id=1551, comments_count=1),
 Row(post_id=17558, comments_count=1),
 Row(post_id=5148, comments_count=1),
 Row(post_id=17647, comments_count=1),
 Row(post_id=3327, comments_count=1),
 Row(post_id=3937, comments_count=1),
 Row(post_id=27693, comments_count=1),
 Row(post_id=1409, comments_count=1),
 Row(post_id=3155, comments_count=1),
 Row(post_id=16100, comments_count=1),
 Row(post_id=28283, comments_count=1),
 Row(post_id=2989, comments_count=1),
 Row(post_id=2994, comments_count=1)]

### топ-20 постов по числу репостов

In [16]:
pr_df = posts_df.select('id', col('reposts.count').name('reposts'))\
    .orderBy('reposts', ascending = False)\
    .limit(20)

In [17]:
pr_df.collect()

[Row(id=17492, reposts=334),
 Row(id=19552, reposts=246),
 Row(id=32022, reposts=210),
 Row(id=11842, reposts=129),
 Row(id=19419, reposts=126),
 Row(id=13532, reposts=110),
 Row(id=17014, reposts=105),
 Row(id=35068, reposts=101),
 Row(id=41266, reposts=92),
 Row(id=12593, reposts=90),
 Row(id=29046, reposts=87),
 Row(id=11999, reposts=85),
 Row(id=41468, reposts=85),
 Row(id=19809, reposts=84),
 Row(id=17167, reposts=81),
 Row(id=10833, reposts=78),
 Row(id=18543, reposts=77),
 Row(id=16596, reposts=76),
 Row(id=18156, reposts=74),
 Row(id=37262, reposts=71)]

# Task 2
#### Find the top 20 users by 
* likes
* reposts they have made (to trace reposts use "copy_history" field)

### топ-20 пользователей по числу лайков

In [18]:
posts_likes_df.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [19]:
posts_likes_df.groupby('likerId')\
    .agg(F.count('itemId').name('likes'))\
    .withColumnRenamed("likerId", "user_id")\
    .orderBy('likes', ascending = False)\
    .limit(20)\
    .collect()

[Row(user_id=2070090, likes=4801),
 Row(user_id=2397858, likes=2055),
 Row(user_id=1475301, likes=1829),
 Row(user_id=18239, likes=1569),
 Row(user_id=546612, likes=1245),
 Row(user_id=6371, likes=907),
 Row(user_id=1841959, likes=746),
 Row(user_id=78440957, likes=709),
 Row(user_id=120248, likes=699),
 Row(user_id=40981497, likes=611),
 Row(user_id=22158, likes=553),
 Row(user_id=207628162, likes=548),
 Row(user_id=329377723, likes=504),
 Row(user_id=76071304, likes=474),
 Row(user_id=14805173, likes=440),
 Row(user_id=317799, likes=385),
 Row(user_id=56355640, likes=375),
 Row(user_id=52042971, likes=338),
 Row(user_id=7437271, likes=336),
 Row(user_id=136506644, likes=335)]

### топ-20 пользователей по числу репостов

In [20]:
posts_df.groupby(col('copy_history.owner_id').alias('user_id'))\
    .agg(F.count(col('copy_history')).alias('reposts_count'))\
    .orderBy('reposts_count', ascending = False)\
    .limit(20)\
    .collect()
#не получается отфильтровать отрицательные id, потому что used_id имеет тип "массив" и не сравнимо с константой
#.where(col('copy_history.id') > 0)\

[Row(user_id=[-33773], reposts_count=186),
 Row(user_id=[-76139618], reposts_count=144),
 Row(user_id=[-45636106], reposts_count=95),
 Row(user_id=[-53958282], reposts_count=70),
 Row(user_id=[-45660640], reposts_count=52),
 Row(user_id=[-97819925], reposts_count=46),
 Row(user_id=[-2499902], reposts_count=44),
 Row(user_id=[-103229636], reposts_count=28),
 Row(user_id=[-94359346], reposts_count=26),
 Row(user_id=[-51664920], reposts_count=24),
 Row(user_id=[-46907025], reposts_count=23),
 Row(user_id=[-122783310], reposts_count=22),
 Row(user_id=[-78459300], reposts_count=21),
 Row(user_id=[18239], reposts_count=20),
 Row(user_id=[-57339370], reposts_count=18),
 Row(user_id=[-45375087], reposts_count=17),
 Row(user_id=[-644236], reposts_count=16),
 Row(user_id=[-23303030], reposts_count=16),
 Row(user_id=[-3900734], reposts_count=14),
 Row(user_id=[-39268951], reposts_count=14)]

# Task 3
get reposts of the original posts of the itmo group (posts.json) from user posts (the result should be similar to (group_post_id, Array (user_post_ids)))

In [21]:
followers_posts_df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- lat: double (nullable = true)
 |    |    |    |    |-- long: double (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    

In [22]:
followers_df.printSchema()

root
 |-- profile: integer (nullable = true)
 |-- follower: integer (nullable = true)



структура followers_posts_df такая же как и posts_df, но рассматривает множество пользователей, а не одну лишь группу ИТМО как во втором случае. Атрибут copy_history из followers_posts_df имеется лишь у записей на стене пользователя, являющихся репостами. Выберем из followers_posts_df лишь те посты, которые являются репостами (на стене пользователя могут быть и собственные записи) и для которых id владельца стены, где размещен пост copy_history[owner_id] = -94, т.е. репосты из группы ИТМО. 
Сгруппируем полученные таким образом посты по их id в группе ИТМО (атрибут copy_history[id]), поставив второй колонкой число постов с данным copy_history[id], что эквивалентно числу пользователей репостнувших запись к себе на стену. 

In [23]:
ur_df = followers_posts_df.select('owner_id', col('copy_history')['id'][0].name('orig_post_id'))\
    .where(col('copy_history')['owner_id'][0] == -94)\
    .withColumnRenamed('owner_id', 'user_id')

In [24]:
ur_df.groupby('orig_post_id')\
    .agg(F.collect_list('user_id').name('users'))\
    .collect()

[Row(orig_post_id=41424, users=[1475301, 282843035, 172808182]),
 Row(orig_post_id=42388, users=[180907432]),
 Row(orig_post_id=39407, users=[8082648, 2031644]),
 Row(orig_post_id=39719, users=[89417157]),
 Row(orig_post_id=38963, users=[6591522, 15641504, 172808182, 377805819, 477893414]),
 Row(orig_post_id=40011, users=[1546152, 1098272, 317799]),
 Row(orig_post_id=39259, users=[86937823, 159562593, 135556, 28405519, 253204918, 113773552, 15900015, 319622206]),
 Row(orig_post_id=40084, users=[268247082]),
 Row(orig_post_id=42009, users=[443525857, 281951154]),
 Row(orig_post_id=41506, users=[139081799, 29899117, 537403451, 527580876, 527580876]),
 Row(orig_post_id=38857, users=[484122052]),
 Row(orig_post_id=38854, users=[484122052]),
 Row(orig_post_id=40521, users=[418595183]),
 Row(orig_post_id=40996, users=[344349]),
 Row(orig_post_id=39082, users=[319622206]),
 Row(orig_post_id=42691, users=[161500412]),
 Row(orig_post_id=41039, users=[180907432]),
 Row(orig_post_id=38754, users=

# Step 4
find emoticons in posts and their comments (negative, positive, neutral)

In [None]:
import sys
!{sys.executable} -m pip install --user --trusted-host pypi-registry.supplementary-services.svc.cluster.local --index http://pypi-registry.supplementary-services.svc.cluster.local:8080 emoji

In [None]:
import emoji
from pyspark.sql.types import ArrayType, StringType

def emojies_in_text(text):
    emojies = [c for c in text if c in emoji.UNICODE_EMOJI['en'].keys()]
    return ''.join(emojies)

def emojies_filter(emojies):
    return len(emojies) > 0

emojies_in_text_udf = F.udf(emojies_in_text, StringType())
emojies_filter_udf = F.udf(emojies_filter, BooleanType())
  
counts = {}
counts_by_posts = {}

def update_counts(emojies):
    for e in emojies:
        counts[e] = 1 if e not in counts.keys else counts[e]+1
        
    for e in set(emojies):
        counts_by_posts[e] = 1 if e not in counts_by_posts.keys else counts_by_posts[e]+1

In [None]:
df_selected_emojies = posts_df\
    .select('text', emojies_in_text_udf('text').name("em"))\
    .select('text', 'em', F.length(F.col('em')).name('length'))\
    .when("em !='' ")

df_selected_emojies.printSchema()
df_selected_emojies.show()

# Task 5
Probable “fans”. Find for each user the top 10 other users whose posts this user likes.

In [25]:
followers_posts_likes_df.printSchema()
#itemId = PostID - уникальный номер поста
#ownerId - id стены, где пост размещен
#likerId - ID того, кто ставит лайк

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [26]:
def fans_of(user):
    return followers_posts_likes_df.groupBy("likerId", "ownerId")\
        .agg(F.count("ownerId").name("likes"))\
        .where('ownerId == {}'.format(user))\
        .withColumnRenamed("likerId", 'fan')\
        .orderBy('likes', ascending = False)\
        .limit(10)

In [27]:
df1 = fans_of(27419)
df1.show()

+---------+-------+-----+
|      fan|ownerId|likes|
+---------+-------+-----+
|  1925168|  27419|    6|
|     9383|  27419|    4|
|529276371|  27419|    4|
|  6866116|  27419|    3|
|    24147|  27419|    3|
| 95884146|  27419|    3|
|291831320|  27419|    3|
|100187585|  27419|    3|
|  1622246|  27419|    3|
|   422720|  27419|    3|
+---------+-------+-----+



# Task 6
Probable friends. If two users like each other posts they may be friends. Find pairs of users where both users are top likers of each other.

In [28]:
df2 = fans_of(24147)
df2.show()

+---------+-------+-----+
|      fan|ownerId|likes|
+---------+-------+-----+
|331853691|  24147|   49|
| 31609798|  24147|   44|
|529276371|  24147|   43|
|431349700|  24147|   33|
|    24147|  24147|   25|
| 34587765|  24147|   24|
|   188799|  24147|   21|
| 13898859|  24147|   19|
| 10260310|  24147|   15|
| 70922560|  24147|   15|
+---------+-------+-----+



In [29]:
def isFriends(userId1, userId2):
    df1 = fans_of(userId1)
    df2 = fans_of(userId2)
    df3 = df1.alias("a")\
        .join(df2.alias("b"), [(col('b.fan') == col('a.ownerId')) & (col('a.fan') == col('b.ownerId'))], 'inner')\
        .select(col('a.fan'), col('a.ownerId'))
    return df3.count() > 0

In [30]:
print(f'UserId 24147 and userId 27419 are friends: {isFriends(27419, 24147)}')
#false потому что 27419 не в топе у 24147, хотя у последнего первый в топе есть (см. ячейки с визуализацией df1, df2 выше)

UserId 24147 and userId 27419 are friends: False


In [31]:
'''

идея: считаем предварительно число лайков пользователей друг другу. 
Берем окно по конкретному ownerId с сортировкой по убыванию по числу лайков.
Из этого окна берем первые 10 строчек, т.е. топ-10 фанов данного ownerId.
Сортировка по likerId нужна, чтобы из пользователей с одинаковым count всегда на первых позициях были одни и те же.
Получаем датасет, состоящий из топ-10 каждого ownerId.
Выполняем inner join этого датасета с самим собой. Ключ: ownerId == likerId & likerId == ownerId, т.е. пользователи
лайкают друг друга, а так как у нас датасет это совокупность топ-10, то эти пользователи еще и в топ-10 друг у друга.
Дропаем строки, где пользователь лайкает сам себя.
Получаем искомый датасет.
Результат для конкретной пары можно проверить функцией isFriends()

'''
from pyspark.sql.window import *
from pyspark.sql.functions import row_number
w = Window.partitionBy("ownerId").orderBy(desc('count'), 'likerId')

In [32]:
dfTop = followers_posts_likes_df.groupBy('ownerId', 'likerId')\
    .agg(F.count('likerId').name('count'))\
    .withColumn("rn", row_number().over(w))\
    .where(col("rn") < 11)

In [33]:
prob_friends_df = dfTop.alias("a")\
        .join(dfTop.alias("b"), [(col('b.likerId') == col('a.ownerId')) & (col('a.likerId') == col('b.ownerId'))], 'inner')\
        .select(col('a.likerId'), col('a.ownerId'))\
        .where(col('a.likerId') != col('a.ownerId'))

In [34]:
prob_friends_df.collect()

[Row(likerId=168438070, ownerId=90898752),
 Row(likerId=209077977, ownerId=272076217),
 Row(likerId=49894967, ownerId=56706631),
 Row(likerId=40147706, ownerId=173546700),
 Row(likerId=180062188, ownerId=205353671),
 Row(likerId=371979170, ownerId=460296349),
 Row(likerId=222900543, ownerId=9836958),
 Row(likerId=94967714, ownerId=162535930),
 Row(likerId=106679661, ownerId=439992443),
 Row(likerId=136217422, ownerId=25302595),
 Row(likerId=508532888, ownerId=119179149),
 Row(likerId=2610724, ownerId=3860798),
 Row(likerId=100686926, ownerId=54101707),
 Row(likerId=1181958, ownerId=147035701),
 Row(likerId=560369353, ownerId=546205633),
 Row(likerId=2392313, ownerId=65913859),
 Row(likerId=132884992, ownerId=152266279),
 Row(likerId=20098738, ownerId=91826324),
 Row(likerId=28405519, ownerId=98102371),
 Row(likerId=460296349, ownerId=371979170),
 Row(likerId=590636, ownerId=50344793),
 Row(likerId=71427292, ownerId=70730078),
 Row(likerId=10025180, ownerId=366769452),
 Row(likerId=2350

In [35]:
isFriends(1181958, 147035701)

True

### Draft
Источник вдохновения. Удаляется без последствий для основной части

In [None]:
followers_posts_likes_df.groupBy('ownerId', 'likerId')\
    .agg(F.count('likerId').name('count'))\
    .where((col('ownerId') == 2212))\
    .orderBy('likerId')\
    .collect()

In [None]:
users_df = list(followers_posts_likes_df.select('likerId')\
    .distinct()\
    .toPandas()['likerId'])

In [None]:
prob_friends = {}
for userId in users_df:
    fans = fans_of(userId)
    if fans.count() != 0:
        prob_friends[userId] = []
        for fan in fans.select('fan').collect()[0]:
            if isFriends(fan, userId):
                prob_friends[userId].append(fan)
prob_friends

In [None]:
windowSpecAgg  = Window.partitionBy("likerId")

followers_posts_likes_df

In [None]:
prob_friends_df = followers_posts_likes_df.alias("a")\
        .join(df1.alias("b"), [(col('b.likerId') == col('a.ownerId')) & (col('a.likerId') == col('b.ownerId'))], 'inner')\
        .select(col('a.likerId'), col('a.ownerId'))\
        .groupBy('ownerId', 'likerId')\
        .agg(F.count('ownerId').name('count'))\
        .orderBy('likerId')

In [None]:
prob_friends_df.show()

In [None]:
#a = {}
#a['key'] = []
#a['rer'] = []
a['key'].append(5)
a['rer'].append(1)
a

In [None]:
followers_posts_likes_df.where('likerId = 2767')\
    .agg(F.count('ownerId').name('likes'))\
    .collect()