In [2]:
import os

import pyspark
from delta import *
from dotenv import load_dotenv
from pinecone import Pinecone
from pyspark.sql.functions import col, lit, max as spark_max, length, row_number, explode, unix_timestamp, from_unixtime, date_format, to_timestamp, concat, expr

from dags.lib.IncrementalLoader import IncrementalLoader
from dags.lib.PineconeManager import PineconeManager, prepare_data

load_dotenv()

True

In [4]:
def create_spark_session(
    app_name="LetsTalk",
    gcs_keyfile="/Users/alfio/projects/upc/BDMP2/docker/gcs.json",
    gcs_connector_jar="../docker/gcs-connector-hadoop.jar",
    master="local[*]",
):
    """Build (or fetch) a Spark session configured for Delta Lake and GCS access.

    All arguments default to the values that used to be hard-coded, so calling
    ``create_spark_session()`` with no arguments produces the same configuration
    as before. Pass different values to run on another machine without editing
    this function.

    Args:
        app_name: Spark application name.
        gcs_keyfile: Path to the Google service-account JSON key used by the
            GCS Hadoop connector.
        gcs_connector_jar: Path to the GCS Hadoop connector jar added through
            ``spark.jars``.
        master: Spark master URL.

    Returns:
        The ``SparkSession`` returned by ``getOrCreate()``.
    """
    conf = (
        pyspark.conf.SparkConf()
        .setAppName(app_name)
        .setMaster(master)
        # Delta Lake: catalog implementation and SQL extensions.
        .set(
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        )
        .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        # Google Cloud Storage: gs:// filesystem and service-account credentials.
        .set("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
        .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
        .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", gcs_keyfile)
        .set("spark.sql.shuffle.partitions", "4")
        .set("spark.jars", gcs_connector_jar)
    )

    # The app name already travels inside `conf`, so it is not repeated on the builder.
    builder = pyspark.sql.SparkSession.builder.config(conf=conf)
    return configure_spark_with_delta_pip(builder).getOrCreate()

# Session shared by every cell below.
spark = create_spark_session()
# Turn on the REPL eager-evaluation option for this session.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Two roots for the same trusted-zone data: a local copy and the GCS bucket.
absolute_path_to_landing = "/Users/alfio/projects/upc/BDMP2/data/letstalk_trusted_zone_bdma"
absolute_path_to_cloud = "gs://letstalk_trusted_zone_bdma"

In [4]:
# Location of the matches table relative to the base path.
table_subpath = 'delta_sports/matches'
# NOTE(review): IncrementalLoader is project code. The log line emitted by this
# cell ("CDF not available — doing full load") suggests it reads only changed
# rows when a change data feed exists and otherwise loads everything — confirm
# in dags.lib.IncrementalLoader.
loader = IncrementalLoader(spark, absolute_path_to_landing, table_subpath)
matches = loader.get_new_data()
# Preview the first five rows only.
matches.head(5)


25/05/29 19:07:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

INFO:dags.lib.pt_utils:CDF not available — doing full load


[Row(fixture_id=1196675, status_long='not started', league=368, fixture_date='2025-04-19T10:00:00+00:00', period_first=None, period_second=None, referee=None, status_elapsed=None, status_extra=None, timestamp='2025-04-19T12:00:00+02', venue_id='jalan_besar_stadium', team_away_id=4203, team_home_id=4208, goals_away=None, goals_home=None),
 Row(fixture_id=1196676, status_long='first half', league=368, fixture_date='2025-04-19T12:15:00+00:00', period_first='2025-04-19T14:15:00+02', period_second=None, referee=None, status_elapsed=1, status_extra=None, timestamp='2025-04-19T14:15:00+02', venue_id='stadium_sultan_hassanal_bolkiah', team_away_id=4200, team_home_id=4202, goals_away=0, goals_home=0),
 Row(fixture_id=1196676, status_long='not started', league=368, fixture_date='2025-04-19T12:15:00+00:00', period_first=None, period_second=None, referee=None, status_elapsed=None, status_extra=None, timestamp='2025-04-19T14:15:00+02', venue_id='stadium_sultan_hassanal_bolkiah', team_away_id=4200, 

In [5]:
# From here on, read from the cloud bucket instead of the local copy.
# NOTE: this rebinds a name that other cells also read, so any cell executed
# after this one sees the gs:// location.
absolute_path_to_landing = absolute_path_to_cloud

# URIs are joined with "/" explicitly: os.path.join uses the host OS separator,
# which would yield backslashes inside a gs:// URI on Windows.
sports_path = f"{absolute_path_to_landing.rstrip('/')}/delta_sports"


def _read_delta_table(table_name):
    """Load the Delta table stored at ``<sports_path>/<table_name>``."""
    return spark.read.format("delta").load(f"{sports_path}/{table_name}")


matches = _read_delta_table("matches")
# Lookup tables: keep only the key and the display name used downstream.
league = _read_delta_table("leagues").select("league_id", "league_name")
teams = _read_delta_table("teams").select("team_id", "team_name")
venues = _read_delta_table("venues").select("venue_id", "venue_name")

In [6]:
# The teams table is joined twice (once per side), so each use gets its own
# alias to keep the two `team_id` references distinguishable.
home_teams = teams.alias("home_teams")
away_teams = teams.alias("away_teams")

# Join keys and match-state columns that are not carried into the final frame.
columns_to_discard = [
    "league", "league_id", "team_id", "team_away_id", "team_home_id",
    "venue_id", "period_first", "period_second", "referee", "status_elapsed", "status_extra",
]

# Stage 1: attach league, home team, away team and venue names to each match.
# Each rename runs right after its join, while only one `team_name` column exists.
matches_with_names = (
    matches
    .join(league, on=(col("league_id") == col("league")))
    .join(home_teams, on=(col("home_teams.team_id") == col("team_home_id")))
    .withColumnRenamed("team_name", "team_home")
    .join(away_teams, on=(col("away_teams.team_id") == col("team_away_id")))
    .withColumnRenamed("team_name", "team_away")
    .join(venues, on=(matches.venue_id == venues.venue_id))
    .drop(*columns_to_discard)
)

# Stage 2: derive the human-readable date and the sentence that gets embedded,
# e.g. "<league> <home> - <away> <dd MMMM yyyy>".
match_date_expr = date_format(to_timestamp(col("fixture_date")), "dd MMMM yyyy")
text_to_embed_expr = concat(
    col("league_name"), lit(" "),
    col("team_home"), lit(" - "), col("team_away"),
    lit(" "), col("match_date"),
)

# Stage 3: keep finished matches only, then drop the columns used just for
# filtering and date derivation.
enriched_df = (
    matches_with_names
    .withColumn("match_date", match_date_expr)
    .withColumn("text_to_embed", text_to_embed_expr)
    .where(col("status_long") == "match finished")
    .drop("status_long", "fixture_date")
)

In [7]:
# `prepare_data` is imported with the other project imports in the first cell.
# NOTE(review): the arguments are presumably the id column, the timestamp
# column and the list of text fields to embed — confirm against
# dags.lib.PineconeManager.prepare_data, including what `reg` holds.
data, reg = prepare_data(enriched_df, "fixture_id", "timestamp", ["text_to_embed"])

In [3]:
# The API key comes from the environment (load_dotenv() runs in the import
# cell); os.getenv returns None when PINECONE_API is not set.
pinecone_key = os.getenv('PINECONE_API')
index_name = "letstalkvector"
namespace = "letstalk-ns"

# Project wrapper bound to the index and namespace above; used by the cells below.
pi = PineconeManager(index_name, namespace, pinecone_key)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Ad-hoc sanity check: search the index for "italy".
# NOTE(review): the second argument is presumably the number of hits to
# return — confirm in PineconeManager.query.
pi.query("italy", 10)

{'result': {'hits': [{'_id': 'https://www.autoevolution.com/news/bmw-flexes-all-new-m2-cs-before-concorso-d-eleganza-villa-d-este-2025-ducktail-incoming-251877.html',
                      '_score': 0.2705961763858795,
                      'fields': {'author': 'Benny Kirk',
                                 'content': 'on the pristine and scenic shores '
                                            'of lake como in north italy  the '
                                            'concorso d eleganza villa d este '
                                            'is one of the country s premiere '
                                            'concours d elegance events  it '
                                            'was also the first of its kind to '
                                            't         chars',
                                 'description': 'on the pristine and scenic '
                                                'shores of lake como in north '
                       

In [8]:
# NOTE(review): looks destructive — the stats printed by the following cell
# show zero vectors and no namespaces after this call, so presumably it wipes
# the index. Confirm in PineconeManager.reset_index before re-running.
pi.reset_index()

In [9]:
# Show index statistics (dimension, metric, namespaces, vector count — see the
# output below) to verify the state of the index.
pi.print_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [9]:
# Get the per-partition callable from the manager.
# NOTE(review): presumably it writes each partition's rows to Pinecone —
# confirm in PineconeManager.get_pinecone_loader.
func = pi.get_pinecone_loader()
# Spark calls `func` once per partition of the underlying RDD, passing an
# iterator over that partition's rows.
data.rdd.foreachPartition(func)

                                                                                

In [8]:
# One-off bootstrap: create the Pinecone index if it does not exist yet.
pinecone_api_key = os.getenv('PINECONE_API')
pc = Pinecone(api_key=pinecone_api_key)

index_name = "letstalkvector"

# Integrated-embedding settings: the hosted model and the mapping from its
# "text" input to the record field named "text_to_embed".
embed_settings = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "text_to_embed"},
}

index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    pc.create_index_for_model(
        name=index_name,
        cloud="gcp",
        region="europe-west4",
        embed=embed_settings,
    )