In [2]:
!pip install google-cloud-bigquery
!pip install db-dtypes
!pip install tqdm
!pip install pyspark
!pip install pandas



In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import requests
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import os
from pyspark.sql import Row
from tqdm import tqdm


# Obtain the Spark session used by every cell below (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('recommendation_ai')
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/16 09:15:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Read the three input tables. Each file has a header row and Spark infers
# the column types from the data.
likes_df, posts_df, profiles_df = (
    spark.read.csv(csv_path, inferSchema=True, header=True)
    for csv_path in ('likes.csv', 'posts.csv', 'profiles.csv')
)

# Preview the posts table
posts_df.show()

+--------------------+-----------------------+----------+--------------------+--------------------+--------------------+------------------------+---------------------+--------------------+---------------------+---------------------+---------+---------------------+--------------------+---------+---------------+-------------+--------------------+--------------------+----------------+--------+------+---------------+------------------+-------------+-------------------------------+--------------------+--------+--------------------+------------------------+--------------------+-------------------------------------+
+--------------------+-----------------------+----------+--------------------+--------------------+--------------------+------------------------+---------------------+--------------------+---------------------+---------------------+---------+---------------------+--------------------+---------+---------------+-------------+--------------------+--------------------+----------------

23/10/16 09:15:23 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [6]:
import concurrent.futures

def get_content(uri1, uri2, timeout=10):
    """Download the text behind ``uri1``, falling back to ``uri2``.

    Args:
        uri1: URI tried first.
        uri2: URI tried when ``uri1`` is missing, unreachable, times out, or
            answers with an HTTP error status.
        timeout: Seconds to wait on each request before giving up on that URI.

    Returns:
        The response body as text, or "" when neither URI yields content.
    """
    for uri in (uri1, uri2):
        # Null/empty cells cannot be fetched; go straight to the next candidate.
        if not uri:
            continue
        try:
            # Without a timeout a stalled server would block its worker thread forever.
            response = requests.get(uri, timeout=timeout)
            # requests does not raise on 4xx/5xx by itself; without this check an
            # error page would be returned as content and the fallback never tried.
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            # Best-effort fetch: swallow network/HTTP failures only (not
            # KeyboardInterrupt or programming errors) and try the next URI.
            continue
    return ""
        
# Bring every distinct (s3_metadata_location, content_uri) combination to the
# driver as a list of plain tuples
uri_pairs = [
    tuple(uri_row)
    for uri_row in posts_df.select('s3_metadata_location', 'content_uri').distinct().collect()
]


def get_content_pair(uri_pair):
    """Fetch the content for one (s3_metadata_location, content_uri) pair.

    ``content_uri`` is handed to ``get_content`` as the first URI and
    ``s3_metadata_location`` as the second. The result is a Row holding the
    s3_metadata_location together with the fetched content.
    """
    metadata_uri, primary_uri = uri_pair
    return Row(
        s3_metadata_location=metadata_uri,
        content=get_content(primary_uri, metadata_uri),
    )

# Use a ThreadPoolExecutor to parallelize the requests; executor.map yields the
# results in the same order as uri_pairs, so tqdm tracks overall progress.
with concurrent.futures.ThreadPoolExecutor() as executor:
    rows = list(tqdm(executor.map(get_content_pair, uri_pairs), total=len(uri_pairs), desc="Loading content"))

# Create a DataFrame from the results. The schema is spelled out because Spark
# cannot infer one from an empty list of rows.
content_df = spark.createDataFrame(rows, schema="s3_metadata_location string, content string")

# Join the results back to the original DataFrame. A `content` column left over
# from an earlier run of this cell is dropped first, so re-running does not
# produce two ambiguous `content` columns (drop is a no-op if it is absent).
# NOTE(review): content_df has one row per distinct (s3_metadata_location,
# content_uri) pair; if one location maps to several content_uris this join
# fans out posts rows -- confirm against the data.
posts_df = posts_df.drop('content').join(content_df, on='s3_metadata_location', how='left')

Loading content: 100%|██████████| 7691/7691 [02:48<00:00, 45.77it/s]            


In [21]:
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.ml.feature import StringIndexer



# Rename the columns to match the names expected by ALS
ratings_df = likes_df.selectExpr(
    "actioned_by_profile_id as userId",
    "publication_id as postId",
    "reaction as rating",
)

# Remove rows without a user or post id BEFORE indexing: StringIndexer's
# default handleInvalid="error" raises when it meets a NULL at transform time.
ratings_df = ratings_df.filter(col("userId").isNotNull() & col("postId").isNotNull())

# Create a StringIndexer (ALS needs numeric ids, the raw ids are strings)
indexer = StringIndexer(inputCol="userId", outputCol="userIdIndex")

# Index the userId column; the fitted model is kept so that indices can be
# mapped back to the original ids later through `user_indexer_model.labels`.
user_indexer_model = indexer.fit(ratings_df)
ratings_df = user_indexer_model.transform(ratings_df)

# Do the same for the postId column
indexer.setInputCol("postId")
indexer.setOutputCol("postIdIndex")
post_indexer_model = indexer.fit(ratings_df)
ratings_df = post_indexer_model.transform(ratings_df)

# Map "UPVOTE" to 1; every other reaction value (including a missing one) becomes -1
ratings_df = ratings_df.withColumn("rating", when(col("rating") == "UPVOTE", 1).otherwise(-1))

ratings_df.show(5)


+--------+--------------------+------+-----------+-----------+
|  userId|              postId|rating|userIdIndex|postIdIndex|
+--------+--------------------+------+-----------+-----------+
|  0x86a3|0x7c66-0x014b-DA-...|     1|      906.0|      124.0|
|  0x9457|0x06ab-0x70-DA-51...|     1|      913.0|       24.0|
|  0xbee1|       0xbee1-0x0cfd|     1|       42.0|     1526.0|
|  0xd8d1|0x9ae8-0x030a-DA-...|     1|        8.0|      853.0|
|0x0175a9|0xebdc-0x45-DA-27...|     1|      146.0|       67.0|
+--------+--------------------+------+-----------+-----------+
only showing top 5 rows



In [24]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# `ratings_df` must provide the numeric 'userIdIndex' and 'postIdIndex' columns
# plus the 'rating' column.
# A fixed seed is passed to both the split and ALS so that, for the same input
# data, the reported RMSE can be reproduced.
SEED = 42

# Split the data into training and test sets
(training, test) = ratings_df.randomSplit([0.8, 0.2], seed=SEED)

# Create an ALS model. coldStartStrategy="drop" removes test rows whose user or
# post was not seen during training, so they do not turn the metric into NaN.
als = ALS(maxIter=5, regParam=0.01, userCol="userIdIndex", itemCol="postIdIndex", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)

# Train the ALS model
model = als.fit(training)

# Make predictions on the test data
predictions = model.transform(test)

# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 0.7922848517756703


In [28]:
# Generate top 10 post recommendations for each user.
# NOTE: userIdIndex and the item ids inside `recommendations` are the numeric
# indices the model was trained on, not the original profile/publication ids.
userRecs = model.recommendForAllUsers(10)

userRecs.show()

# Convert the recommendations array to a string (the CSV writer cannot store
# array/struct columns)
userRecs = userRecs.withColumn("recommendations", col("recommendations").cast("string"))

# Save the DataFrame as CSV. Spark writes a directory of part files at this
# path, and the default save mode raises if the path already exists, so
# overwrite explicitly to keep the cell re-runnable.
userRecs.write.mode("overwrite").csv("user_recommendations.csv")

                                                                                

+-----------+--------------------+
|userIdIndex|     recommendations|
+-----------+--------------------+
|          1|[{872, 1.1825883}...|
|          3|[{154, 2.0642385}...|
|          5|[{87, 1.6188003},...|
|          6|[{77, 1.631887}, ...|
|          9|[{320, 1.8445369}...|
|         12|[{131, 1.6727555}...|
|         13|[{42, 1.819513}, ...|
|         15|[{90, 1.4823849},...|
|         16|[{28, 2.120071}, ...|
|         17|[{31, 1.5751598},...|
|         19|[{65, 1.8274947},...|
|         20|[{88, 1.6329093},...|
|         22|[{39, 1.4830574},...|
|         26|[{153, 1.6383357}...|
|         27|[{491, 1.465279},...|
|         28|[{275, 1.5412489}...|
|         31|[{334, 1.9206804}...|
|         34|[{155, 2.1228735}...|
|         35|[{44, 1.7933944},...|
|         37|[{275, 1.6352398}...|
+-----------+--------------------+
only showing top 20 rows



                                                                                