# CS483 - Colab 1
## Friend Recommendation in Spark

### Setup

In [88]:
# Install the Python packages this notebook imports (PySpark and PyDrive).
!pip install pyspark
!pip install -U -q PyDrive
# Install a headless OpenJDK 8 via apt.
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the OpenJDK 8 install location
# (presumably so PySpark can locate the JVM - confirm if the JDK version changes).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u422-b05-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [89]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client from the
# application-default Google credentials.
# NOTE(review): `drive` is not referenced by any later cell visible here (the
# input file is opened through a relative path) - confirm this step is still needed.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [90]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import pandas as pd

# Create the Spark session, or reuse the one that already exists, so that
# re-running this cell does not try to start a second session.
spark = SparkSession.builder.getOrCreate()

# Keep a handle on the session's SparkContext; the RDD cells below use `sc`.
sc = spark.sparkContext

In [106]:
# Users whose recommendations are reported; their order in this list is the
# order used when the RDD is sorted further down.
important_users = [
    11, 8997, 2791, 4985, 8961, 4049, 5060, 739, 1724, 9550, 3151,
]

# RDD with one element per text line of the adjacency file.
lines = sc.textFile("soc-LiveJournal1Adj.txt")

In [107]:
def show_rdd(db, n=25):
    """Print the first ``n`` elements of ``db``, one element per line.

    Args:
        db: Any object exposing ``take(k)`` that returns an iterable of at
            most ``k`` elements (for example a PySpark RDD).
        n: Number of elements to fetch and print. Defaults to 25, the sample
            size that was previously hard-coded, so existing one-argument
            calls behave exactly as before.

    Returns:
        None. The elements are written to standard output.
    """
    # Only the requested prefix is fetched; the full dataset is never collected.
    for item in db.take(n):
        print(item)

In [108]:
def _parse_adjacency_line(line):
    """Turn one tab-separated text line into a ``(user_id, friend_id_set)`` pair."""
    fields = line.split("\t")
    user_id = int(fields[0])
    # A friend list is only parsed when there are exactly two fields and the
    # second one is non-empty; every other shape yields an empty set.
    if len(fields) == 2 and fields[1]:
        friends = set(int(friend) for friend in fields[1].split(","))
    else:
        friends = set()
    return (user_id, friends)


def _important_first(record):
    """Sort key: position in ``important_users``, or its length for everyone else."""
    user_id = record[0]
    if user_id in important_users:
        return important_users.index(user_id)
    return len(important_users)


# RDD of (user, friends_set) pairs built from the raw text lines.
user_friends = lines.map(_parse_adjacency_line)

# Same pairs, with the important users moved to the front in list order.
user_friends_sorted = user_friends.sortBy(_important_first)
show_rdd(user_friends)

(0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94})
(1, {0, 31232, 29826, 35589, 5, 135, 4999, 34439, 8715, 6156, 13840, 19217, 27536, 34450, 20, 13845, 35605, 35606, 22939, 29724, 35613, 29726, 28193, 33060, 14248, 29481, 12846, 44846, 35633, 30257, 21556, 14005, 43447, 49592, 12347, 32317, 35648, 49985, 29767, 49226, 31435, 34250, 38737, 15190, 44887, 13912, 34392, 34394, 34651, 35678, 29791, 23520, 30691, 8932, 17636, 34406, 34280, 2409, 32489, 20075, 20074, 623, 34418, 34420, 629, 45054, 10623})
(2, {0, 2755, 1220, 12453, 13795, 135, 49927, 24714, 6893, 41456, 16659, 117, 45046, 41878, 24539, 32828})
(3, {0, 13185, 27552, 41, 12, 1532, 38737, 55, 12636})
(4, {0, 1907

In [109]:
def iterate_users(base_user_id, base_user_friends, candidates=None, top_n=10):
    """Recommend up to ``top_n`` non-friends ranked by number of mutual friends.

    Args:
        base_user_id: Id of the user to produce recommendations for.
        base_user_friends: Set of ids ``base_user_id`` is already friends with.
        candidates: Iterable of ``(user_id, friends_set)`` pairs to score.
            When omitted, the module-level ``collected_users_db`` is used,
            which is what the original two-argument form always did.
        top_n: Maximum number of ids to return. Defaults to 10.

    Returns:
        A list of user ids ordered by descending mutual-friend count, with
        ties broken by ascending user id. The base user, users who are
        already friends, and users with no mutual friend are never returned.
    """
    if candidates is None:
        # Resolved at call time so existing two-argument callers are unaffected.
        candidates = collected_users_db

    scored = []
    for candidate_id, candidate_friends in candidates:
        if candidate_id == base_user_id or candidate_id in base_user_friends:
            continue
        # Intersect once per candidate; the count serves both as the filter
        # and as the ranking score.
        mutual_count = len(base_user_friends.intersection(candidate_friends))
        if mutual_count > 0:
            scored.append((candidate_id, mutual_count))

    # Highest mutual-friend count first, smallest id first among equals.
    scored.sort(key=lambda pair: (-pair[1], pair[0]))
    return [candidate_id for candidate_id, _ in scored[:top_n]]

# Bring every (user, friends_set) pair back to the driver as a plain Python list.
# iterate_users reads this module-level name, so it has to be assigned before
# any Spark action that ends up calling iterate_users is triggered.
collected_users_db = user_friends.collect()

In [110]:
def _with_recommendations(record):
    """Map a ``(user_id, friends)`` record to ``(user_id, recommended_ids)``."""
    user_id = record[0]
    friends = record[1]
    return (user_id, iterate_users(user_id, friends))


# Recommendations in the same order as user_friends_sorted.
recommendations = user_friends_sorted.map(_with_recommendations)
show_rdd(recommendations)

(11, [27552, 7785, 27573, 27574, 27589, 27590, 27600, 27617, 27620, 27667])
(8997, [8998, 8987, 8992, 9001, 9003, 9009, 4849, 7174, 7279, 7364])
(2791, [21185, 8783, 13280, 18359, 18363, 23667, 35740, 2204, 2786, 5996])
(4985, [79, 577, 4839, 4984, 4986, 4987, 4988, 4989, 4990, 4991])
(8961, [12241, 8973, 8965, 8963, 8966, 8967, 7174, 8969, 12243, 7177])
(4049, [4871, 4875, 4889, 8492, 8685, 439, 660, 1100, 1137, 1156])
(5060, [5052, 5057, 5086, 14271, 98, 364, 575, 596, 611, 622])
(739, [732, 367, 381, 336, 21526, 28064, 677, 704, 728, 736])
(1724, [1711, 1663, 1712, 1718, 1662, 1697, 1700, 1715, 1716, 1658])
(9550, [9554, 9533, 9544, 9558, 153, 1220, 1421, 1436, 1951, 2413])
(3151, [3161, 43162, 3230, 3450, 8692, 161, 2036, 3136, 3137, 3162])
(0, [38737, 18591, 27383, 34211, 337, 352, 1532, 12143, 12561, 17880])
(1, [35621, 44891, 14150, 15356, 35630, 13801, 13889, 14078, 25228, 13805])
(2, [41087, 1, 5, 95, 112, 1085, 1404, 2411, 3233, 4875])
(3, [27679, 1, 10, 16, 29, 30, 38, 82, 8

In [111]:
# Fetch as many leading rows as there are important users; this reads
# `recommendations` as built from the sorted RDD, where those users come first.
recommendations_for_users = recommendations.take(
    len(important_users)
)
for user in recommendations_for_users:
    print(f"For user {user[0]}; I recommend the users {user[1]}")

For user 11; I recommend the users [27552, 7785, 27573, 27574, 27589, 27590, 27600, 27617, 27620, 27667]
For user 8997; I recommend the users [8998, 8987, 8992, 9001, 9003, 9009, 4849, 7174, 7279, 7364]
For user 2791; I recommend the users [21185, 8783, 13280, 18359, 18363, 23667, 35740, 2204, 2786, 5996]
For user 4985; I recommend the users [79, 577, 4839, 4984, 4986, 4987, 4988, 4989, 4990, 4991]
For user 8961; I recommend the users [12241, 8973, 8965, 8963, 8966, 8967, 7174, 8969, 12243, 7177]
For user 4049; I recommend the users [4871, 4875, 4889, 8492, 8685, 439, 660, 1100, 1137, 1156]
For user 5060; I recommend the users [5052, 5057, 5086, 14271, 98, 364, 575, 596, 611, 622]
For user 739; I recommend the users [732, 367, 381, 336, 21526, 28064, 677, 704, 728, 736]
For user 1724; I recommend the users [1711, 1663, 1712, 1718, 1662, 1697, 1700, 1715, 1716, 1658]
For user 9550; I recommend the users [9554, 9533, 9544, 9558, 153, 1220, 1421, 1436, 1951, 2413]
For user 3151; I recomme

In [112]:
# Recompute over the unsorted RDD; note this rebinds `recommendations`.
recommendations = user_friends.map(
    lambda record: (record[0], iterate_users(record[0], record[1]))
)

# Materialise the recommendations for every user on the driver.
full_recommendations = recommendations.collect()

In [113]:
# One line per user: the id, two spaces, then the list of recommended ids.
for user in full_recommendations:
    print(f"{user[0]}  {user[1]}")

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
35341  [35335, 35342, 35398, 35338, 35346, 35349, 35328, 35403, 35333, 35380]
35342  [35336, 35326, 35341, 35338, 35327, 35349, 35328, 35330, 35403, 35350]
35343  [35377, 6789, 7411, 7511, 10664, 13179, 18857, 19125, 21127, 21135]
35344  [35355, 35330, 35349, 35372, 35382, 35329, 35347, 35368, 35369, 35403]
35346  [35326, 35373, 35371, 35331, 35341, 35327, 35338, 35328, 35330, 35352]
35347  [35344, 35355, 35382, 35332, 35335, 35336, 35361, 35373, 35389, 35409]
35351  [35326, 35369, 35383, 35384, 6789, 7411, 7511, 10664, 13179, 18857]
35352  [35339, 35333, 35382, 35338, 35344, 35346, 35353, 35355, 35363, 35373]
35355  [35344, 35361, 35373, 35371, 35398, 35382, 35330, 35333, 35347, 35329]
35356  [35326, 35335, 35344, 35349, 35368, 35371, 35389, 35398, 35409, 35334]
35357  [35326, 35327, 35333, 35334, 35335, 35338, 35339, 35341, 35344, 35346]
35358  [35326, 35335, 35338, 35339, 35341, 35361, 35363, 35371, 35389, 35398]
35359 

In [None]:
# Sample input: three 2-tuples
list_of_tuples = [(1, 2), (3, 4), (5, 6)]

# Concatenate the tuples end to end, then turn the result into a list
flattened_list = list(sum(list_of_tuples, ()))

print(flattened_list)