In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = (
    SparkSession.builder.appName("HW3")
    .master("local[*]")
    .config("spark.driver.maxResultSize", 0)
    .getOrCreate()
)

sc: SparkContext = spark.sparkContext

24/01/12 23:29:38 WARN Utils: Your hostname, aac resolves to a loopback address: 127.0.1.1; using 172.27.171.43 instead (on interface wlp2s0)
24/01/12 23:29:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/12 23:29:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
import json
tweet_rdd = sc.textFile("twitter_data.jsonl")
json_rdd = tweet_rdd.map(lambda x: json.loads(x))

In [5]:
def create_row_of_user_tweet_matrix(x):
    related_tweet_ids = []
    user_id = x["user"]["id"]
    # Set the score of each tweet to one
    if x["tweet_type"] == "generated":
        if "id" in x:
            related_tweet_ids.append((x["id"], 1))
    if x["tweet_type"] == "replied":
        if "in_reply_to_status_id_str" in x:
            related_tweet_ids.append((x["in_reply_to_status_id_str"], 1))
    if x["tweet_type"] == "quoted":
        if "id" in x:
            related_tweet_ids.append((x["id"], 1))
        if "quoted_status" in x:
            if "id" in x["quoted_status"]:
                related_tweet_ids.append((x["quoted_status"]["id"], 1))
    if x["tweet_type"] == "retweeted":
        if "id" in x:
            related_tweet_ids.append((x["id"], 1))
        if "retweeted_status" in x:
            if "id" in x["retweeted_status"]:
                related_tweet_ids.append((x["retweeted_status"]["id"], 1))
    return (user_id, (related_tweet_ids))

# Create the user-tweet matrix. Rows represent users, and columns represent tweets.
user_tweet_matrix_rdd = (
    json_rdd.map(create_row_of_user_tweet_matrix)
    .reduceByKey(lambda x, y: x + y)
    .map(lambda x: (x[0], list(x[1])))
)

In [6]:
from collections import defaultdict

def update_scores_user_tweet_matrix(x):
    sums_dict = defaultdict(int)
    for unique_id, number in x[1]:
        sums_dict[unique_id] += number

    result_list = list(sums_dict.items())
    return (x[0], result_list)

# Update the user-tweet matrix so that each row, representing a user, contains scores related to their tweets. 
# Tweets with zero scores are not stored in this matrix.

updated_scores_user_tweet_matrix_rdd = user_tweet_matrix_rdd.map(
    update_scores_user_tweet_matrix
).sortBy(lambda x: len(x[1]), ascending=False)
updated_scores_user_tweet_matrix_list = updated_scores_user_tweet_matrix_rdd.collect()

                                                                                

In [None]:
import random

# This parameter indicates the number of samples utilized for testing the model.
number_of_samples = 100
test_users_matrix = updated_scores_user_tweet_matrix_rdd.filter(lambda x: len(x[1])>1).takeSample(
    False, number_of_samples
)
test_users_ids = [item[0] for item in test_users_matrix]


# Apply a filter to the matrix for rows that are present in the test_users matrix.
def filter_test_users(x):
    updated_tweets = []  # This variable represents updated tweets whose scores have been set to 0.
    new_tweets_scores = []
    if x[0] in test_users_ids:
        number_of_random_samples = len(x[1]) // 2 # Select half of the tweets from the test users to estimate their scores 
        # based on the results from the recommender system.
        if number_of_random_samples > 0:
            sampled_tweet_from_one_user = random.sample(x[1], number_of_random_samples)
            for tweet_id, score in sampled_tweet_from_one_user:
                updated_tweets.append((tweet_id, 0))
            new_tweets_scores = [
                item for item in x[1] if item[0] not in test_users_ids
            ] + updated_tweets
            return (x[0], new_tweets_scores), (x[0], new_tweets_scores)
        else:
            return x, None
    else:
        return x, None


# Create a matrix that includes users not present in the test data, along with 
# users from the test data for whom the user-tweet matrix has been updated.
user_tweet_matrix_updated_for_test_users_rdd = updated_scores_user_tweet_matrix_rdd.map(
    lambda x: filter_test_users(x)[0]
)

# Create an RDD in which each element contains a tuple. The first element of the tuple
# is related to the user ID of a test user, and the second element contains a list of 
# tweet IDs whose scores have been changed to zero.
user_tweet_matrix_for_test_users = updated_scores_user_tweet_matrix_rdd.map(
    lambda x: filter_test_users(x)[1]
).filter(lambda x: True if x is not None else False).collect()

# Discover Similar Users and Tweets for a Single User
The `main_user_id` variable represents the user ID for whom we intend to find similar users and tweets.

In [None]:
main_user_id = "931938860963061760"
# Retrieve the tweets associated with the user with the ID 'main_user_id'
main_user_tweet_list = []
updated_scores_user_tweet_matrix_list = updated_scores_user_tweet_matrix_rdd.collect()
for item in updated_scores_user_tweet_matrix_list:
    if item[0] == main_user_id:
        main_user_tweet_list = main_user_tweet_list + item[1]
        break
# Determine the size of the vector associated with the 'main_user_id' in the user-tweet matrix.
abs_main_user_tweet_list = 0
for tweet_id, score in main_user_tweet_list:
    abs_main_user_tweet_list = abs_main_user_tweet_list + (int(score)) ** 2


In [19]:
import math

def calculate_cosine_similarity(x, main_user_tweet_list, abs_main_user_tweet_list):
    all_tweet_list = main_user_tweet_list + x[1]

    mult_dict = defaultdict(lambda: 1)
    count_dict = defaultdict(int)
    abs_another_user_tweet_list = 0

    for unique_id, number in all_tweet_list:
        mult_dict[unique_id] *= number
        count_dict[unique_id] += 1

    for unique_id, count in count_dict.items():
        if count == 1:
            mult_dict[unique_id] = 0

    for tweet_id, score in x[1]:
        abs_another_user_tweet_list = abs_another_user_tweet_list + (int(score)) ** 2

    vector_product = sum(mult_dict.values(), 0)
    cosine_similarity = vector_product / (
        math.sqrt(abs_another_user_tweet_list * abs_main_user_tweet_list)
    )
    return ((x[0], cosine_similarity), x[1])


def filter_similarity_result(x, lower_threshold, upper_threshold):
    if x[0][1] > lower_threshold and x[0][1] < upper_threshold:
        return True
    else:
        return False

# Compute the cosine similarity between the user with the ID 'main_user_id' and 
# all other users present in the tweet-user matrix.
# lower_threshold = 0.03 # Select the lower threshold for cosine similarity.
# upper_threshold = 1 # Choose the upper threshold for cosine similarity.
# cosine_similarity_calculation = (
#     updated_scores_user_tweet_matrix_rdd.map(lambda x: calculate_cosine_similarity(x, main_user_tweet_list, abs_main_user_tweet_list))
#     .sortBy(lambda x: x[0][1], ascending=False)
#     .filter(lambda x: filter_similarity_result(x, lower_threshold, upper_threshold))
# )

# Create an RDD in which each element contains a tuple. The first element of the tuple
# is itself another tuple with the structure: (user_id, cosine_similarity). The second
# element of the main tuple contains a list of tweets related to the user_id
# similar_users_and_their_tweets = cosine_similarity_calculation.collect()

# Evaluating the Model: Estimating Zeroed Scores from Similar Users

In [26]:
for main_user in user_tweet_matrix_for_test_users[0:2]: # Iterate through the test users.
    abs_main_user_tweet_list = 0

    lower_threshold = 0.03 # Select the lower threshold for cosine similarity.
    upper_threshold = 1 # Choose the upper threshold for cosine similarity.
    for tweet_id, score in main_user[1]:
        abs_main_user_tweet_list = abs_main_user_tweet_list + (int(score)) ** 2 # Calculate the size of the vector for
        # each iterated test user.
    cosine_similarity_calculation = (
        updated_scores_user_tweet_matrix_rdd.map(lambda x: calculate_cosine_similarity(x, main_user[1], abs_main_user_tweet_list))
        .sortBy(lambda x: x[0][1], ascending=False)
        .filter(lambda x: filter_similarity_result(x, lower_threshold, upper_threshold))
    ) # Calculate cosine similarity between the iterated test user and all other users.

    similar_users_and_their_tweets_list = cosine_similarity_calculation.collect()

    estimated_tweet_ids_dict = defaultdict(int)
    estimated_tweet_ids_weights_dict = defaultdict(int)
    main_user_tweets_dict = {key: value for key, value in main_user[1] if value == 0}

    # item 1 is in this format: (user_id, cosine similarity). 
    # item2 is in this format: [(tweet_id1, score1), (tweet_id2, score2), ..., (tweet_idn, scoren)]    
    for item1, item2 in similar_users_and_their_tweets_list: # Iterate through the similar users.
        another_user_tweet_ids = dict(item2)
        common_tweet_ids = list(set(main_user_tweets_dict.keys()) & set(another_user_tweet_ids.keys())) 
        for tweet in main_user_tweets_dict:
            if tweet in another_user_tweet_ids.keys():
                estimated_tweet_ids_dict[tweet] += item1[1] * another_user_tweet_ids[tweet]
            estimated_tweet_ids_weights_dict[tweet] += item1[1]
    # Calculate the recommended score for each tweet whose value has been set to zero.
    recommended_scores_dict = {key: estimated_tweet_ids_dict[key]/estimated_tweet_ids_weights_dict[key] for key in estimated_tweet_ids_dict}
    

    # Calculate root mean squeare error for recommended scores.
    for user_id, tweets in test_users_matrix:
        if user_id == main_user[0]:
            main_tweets_with_real_scores = dict(tweets)
            break
    real_zeroed_tweet_values = {}
    rmse = 0
    recommended_scores_dict_keys = recommended_scores_dict.keys()
    for key,value in main_user_tweets_dict.items():
        real_zeroed_tweet_values[key] = main_tweets_with_real_scores[key]
        rmse = rmse + (recommended_scores_dict[key]-main_tweets_with_real_scores[key])**2

        
    rmse = math.sqrt(rmse/len(main_user[1]))
    for key, value in real_zeroed_tweet_values.items():
        print(f"Real value of the tweet id {key} is {value}")
        print(f"Estimated scores for zeroed tweets with the id {key} is:{recommended_scores_dict[key]}")
    print(f"rmse is {rmse}")
    print("###########################################################")


                                                                                

Real value of the tweet id 1727002206598307897 is 1
Estimated scores for zeroed tweets with the id 1727002206598307897 is:0.6550157400843163
Real value of the tweet id 1721596771799904516 is 1
Estimated scores for zeroed tweets with the id 1721596771799904516 is:1.024833377861876
Real value of the tweet id 1721173157464130024 is 1
Estimated scores for zeroed tweets with the id 1721173157464130024 is:0.6550157400843163
Real value of the tweet id 1723624846855110927 is 1
Estimated scores for zeroed tweets with the id 1723624846855110927 is:0.6550157400843163
Real value of the tweet id 1726975646742773931 is 1
Estimated scores for zeroed tweets with the id 1726975646742773931 is:0.6550157400843163
Real value of the tweet id 1727007505564438982 is 1
Estimated scores for zeroed tweets with the id 1727007505564438982 is:0.6550157400843163
Real value of the tweet id 1729191204229956029 is 1
Estimated scores for zeroed tweets with the id 1729191204229956029 is:0.6550157400843163
Real value of 



Real value of the tweet id 1725210966445474003 is 1
Estimated scores for zeroed tweets with the id 1725210966445474003 is:0.779668076235837
Real value of the tweet id 1725872469871898734 is 1
Estimated scores for zeroed tweets with the id 1725872469871898734 is:0.779668076235837
Real value of the tweet id 1724134393189392624 is 1
Estimated scores for zeroed tweets with the id 1724134393189392624 is:0.779668076235837
Real value of the tweet id 1724514641169068139 is 1
Estimated scores for zeroed tweets with the id 1724514641169068139 is:0.779668076235837
Real value of the tweet id 1724879815348560089 is 1
Estimated scores for zeroed tweets with the id 1724879815348560089 is:0.779668076235837
Real value of the tweet id 1726679750733898102 is 1
Estimated scores for zeroed tweets with the id 1726679750733898102 is:0.779668076235837
Real value of the tweet id 1726274555407389087 is 1
Estimated scores for zeroed tweets with the id 1726274555407389087 is:0.779668076235837
Real value of the tw

                                                                                