In [1]:
from pinecone import Pinecone
from dags.lib.IncrementalLoader import IncrementalLoader
from dags.lib.PineconeManager import PineconeManager
import pyspark
from delta import *
from pyspark.sql.functions import col, lit, max as spark_max, length, row_number, explode, unix_timestamp, from_unixtime, date_format, to_timestamp, concat, expr

import os

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
def create_spark_session(gcs_keyfile=None, gcs_connector_jar="../docker/gcs-connector-hadoop.jar"):
    """Build (or reuse) the local, Delta-enabled Spark session for this notebook.

    The session runs on ``local[*]``, registers the Delta Lake catalog and SQL
    extension, and wires up the GCS Hadoop connector with service-account
    authentication so ``gs://`` paths can be read.

    Args:
        gcs_keyfile: Path to the service-account JSON key used by the GCS
            connector. When omitted, the ``GOOGLE_APPLICATION_CREDENTIALS``
            environment variable is used if set; otherwise the original
            developer-machine path is kept as a last-resort default.
        gcs_connector_jar: Path to the GCS connector jar passed to
            ``spark.jars``. Defaults to the location the notebook used so far.

    Returns:
        The ``SparkSession`` returned by ``getOrCreate()`` (an already running
        session is reused rather than recreated).
    """
    if gcs_keyfile is None:
        # Prefer an environment override so the notebook is not tied to a
        # single machine; the fallback keeps the previous behaviour.
        gcs_keyfile = os.environ.get(
            "GOOGLE_APPLICATION_CREDENTIALS",
            "/Users/alfio/projects/upc/BDMP2/docker/gcs.json",
        )

    conf = (
        pyspark.conf.SparkConf()
        .setAppName("LetsTalk")
        # Delta Lake needs both its catalog implementation and SQL extension.
        .set(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        # GCS connector: filesystem implementation + service-account auth.
        .set("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
        .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
        .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", gcs_keyfile)
        # A small shuffle partition count, set explicitly for this local run.
        .set("spark.sql.shuffle.partitions", "4")
        .set("spark.jars", gcs_connector_jar)
        .setMaster(
            "local[*]"
        )
    )

    builder = pyspark.sql.SparkSession.builder.appName("LetsTalk").config(conf=conf)
    # configure_spark_with_delta_pip adds the Delta Maven coordinates to the builder.
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    return spark

# Roots of the trusted zone: the GCS bucket and its local mirror.
absolute_path_to_cloud = "gs://letstalk_trusted_zone_bdma"
absolute_path_to_landing = "/Users/alfio/projects/upc/BDMP2/data/letstalk_trusted_zone_bdma"

# Start the session and turn on eager evaluation so a DataFrame left as the
# last expression of a cell is rendered as a table.
spark = create_spark_session()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

:: loading settings :: url = jar:file:/Users/alfio/python_venv/general/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/alfio/.ivy2/cache
The jars for the packages stored in: /Users/alfio/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b8a0ce5e-d677-48d2-80ff-cbc039cb3172;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.3.0 in central
	found io.delta#delta-storage;3.3.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 75ms :: artifacts dl 2ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.3.0 from central in [default]
	io.delta#delta-storage;3.3.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0  

In [4]:
# Preview what IncrementalLoader returns for one table under the landing root.
# NOTE(review): the other cells read "delta_tmdb/movie" (singular) and the
# captured output below shows football fixtures, so this subpath and the
# `matches` name may be stale — confirm before relying on this cell.
table_subpath = 'delta_tmdb/movies'
loader = IncrementalLoader(spark, absolute_path_to_landing, table_subpath)
# Presumably only the rows added since the previous load; the implementation
# is not visible here (the log line below reports a full-load fallback).
matches = loader.get_new_data()
# Peek at the first five rows.
matches.head(5)


25/05/29 19:07:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

INFO:dags.lib.pt_utils:CDF not available — doing full load


[Row(fixture_id=1196675, status_long='not started', league=368, fixture_date='2025-04-19T10:00:00+00:00', period_first=None, period_second=None, referee=None, status_elapsed=None, status_extra=None, timestamp='2025-04-19T12:00:00+02', venue_id='jalan_besar_stadium', team_away_id=4203, team_home_id=4208, goals_away=None, goals_home=None),
 Row(fixture_id=1196676, status_long='first half', league=368, fixture_date='2025-04-19T12:15:00+00:00', period_first='2025-04-19T14:15:00+02', period_second=None, referee=None, status_elapsed=1, status_extra=None, timestamp='2025-04-19T14:15:00+02', venue_id='stadium_sultan_hassanal_bolkiah', team_away_id=4200, team_home_id=4202, goals_away=0, goals_home=0),
 Row(fixture_id=1196676, status_long='not started', league=368, fixture_date='2025-04-19T12:15:00+00:00', period_first=None, period_second=None, referee=None, status_elapsed=None, status_extra=None, timestamp='2025-04-19T14:15:00+02', venue_id='stadium_sultan_hassanal_bolkiah', team_away_id=4200, 

In [29]:
# From here on the "landing" root points at the GCS bucket instead of the
# local mirror (this rebinds the variable set in the configuration cell).
absolute_path_to_landing = absolute_path_to_cloud
tmdb_path = os.path.join(absolute_path_to_landing, 'delta_tmdb')

# Read the three TMDB Delta tables in one pass, in the same order as before.
movie, genre, _movie_genre_raw = (
    spark.read.format("delta").load(os.path.join(tmdb_path, table_name))
    for table_name in ("movie", "genre", "movie_genre")
)

# Only the two key columns of the link table are needed downstream.
movie_genre = _movie_genre_raw.select("film_id", "genre_id")

In [32]:
from pyspark.sql.functions import concat_ws, collect_list

# Collapse the film -> genre link table into one row per film whose `genres`
# column is a comma-separated list of genre names. The key is renamed so the
# join below does not produce two `film_id` columns.
movie_genres_compacted = (
    movie_genre.join(genre, on=(movie_genre.genre_id == genre.genre_id))
    .groupby("film_id")
    .agg(concat_ws(", ", collect_list("genre")).alias("genres"))
    .withColumnRenamed("film_id", "join_film_id")
)

# Attach the genre list to each movie (inner join: films without any genre row
# are dropped) and build the text that will be embedded.
# concat_ws is used instead of concat because concat yields NULL as soon as one
# input is NULL, which would blank out text_to_embed for every movie with a
# missing overview; concat_ws simply skips NULL parts and produces the same
# string as before when all parts are present.
enriched_movie = (
    movie.join(movie_genres_compacted, on=(movie.film_id == movie_genres_compacted.join_film_id))
    .drop("popularity", "join_film_id")
    .withColumn("text_to_embed", concat_ws(" ", col("title"), col("overview"), col("genres")))
)


In [33]:
from dags.lib.PineconeManager import prepare_data
# Shape the enriched movies for Pinecone. The arguments are the DataFrame, the
# id column, the ingestion-time column and the list of text columns to embed.
# NOTE(review): prepare_data returns a pair; what `reg` holds is not visible
# in this notebook — confirm against dags/lib/PineconeManager.
data, reg = prepare_data(enriched_movie, "film_id", "ingestion_time", ["text_to_embed"])

In [38]:
# Sanity check: list the prepared rows whose ingestion_time is NULL.
data.filter(col("ingestion_time").isNull())

_id,title,original_title,original_language,overview,release_date,revenue,budget,runtime,adult,vote_average,vote_count,genres,text_to_embed


In [39]:
# Number of prepared rows (this is a Spark action, so it runs the whole plan).
data.count()

                                                                                

138529

In [3]:
# Target index and namespace inside Pinecone.
index_name = "letstalkvector"
namespace = "letstalk-ns"

# The API key is taken from the environment; it is None when the variable
# PINECONE_API is not defined.
pinecone_key = os.environ.get("PINECONE_API")

# Project wrapper around the Pinecone index used by the cells below.
pi = PineconeManager(index_name, namespace, pinecone_key)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Smoke-test the index with a free-text query.
# NOTE(review): the second argument is presumably the number of hits to
# return — confirm against PineconeManager.query.
pi.query("italy", 10)

{'result': {'hits': [{'_id': 'https://www.autoevolution.com/news/bmw-flexes-all-new-m2-cs-before-concorso-d-eleganza-villa-d-este-2025-ducktail-incoming-251877.html',
                      '_score': 0.2705961763858795,
                      'fields': {'author': 'Benny Kirk',
                                 'content': 'on the pristine and scenic shores '
                                            'of lake como in north italy  the '
                                            'concorso d eleganza villa d este '
                                            'is one of the country s premiere '
                                            'concours d elegance events  it '
                                            'was also the first of its kind to '
                                            't         chars',
                                 'description': 'on the pristine and scenic '
                                                'shores of lake como in north '
                       

In [8]:
# NOTE(review): presumably destructive — the stats captured in the next cell
# show an index with zero vectors and no namespaces after this call. Confirm
# against PineconeManager.reset_index before running.
pi.reset_index()

In [9]:
# Print the index statistics (dimension, metric, namespaces, vector count).
pi.print_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [9]:
# Push the prepared rows to Pinecone from the Spark workers.
# get_pinecone_loader() supplies the callable that foreachPartition invokes
# once per partition with an iterator over that partition's rows.
# NOTE(review): presumably the callable upserts each partition into the
# index/namespace configured on `pi` — confirm against PineconeManager.
func = pi.get_pinecone_loader()
data.rdd.foreachPartition(func)

                                                                                

In [8]:
# Talk to Pinecone directly (without the PineconeManager wrapper) to make sure
# the index exists.
pinecone_api_key = os.environ.get("PINECONE_API")
pc = Pinecone(api_key=pinecone_api_key)

index_name = "letstalkvector"

# Integrated-embedding settings: Pinecone embeds the record field
# "text_to_embed" with the hosted llama-text-embed-v2 model.
_embed_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "text_to_embed"},
}

# Create the index only when it is missing, so re-running the cell is harmless.
_index_exists = pc.has_index(index_name)
if not _index_exists:
    pc.create_index_for_model(
        name=index_name,
        cloud="gcp",
        region="europe-west4",
        embed=_embed_config,
    )