In [1]:
import sys
import os
import names
import random
from sklearn.feature_extraction.text import TfidfVectorizer
sys.path.append(os.path.abspath("../../")) 

# Standard libraries
import hashlib
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD

# Your project modules
from dags.lib.pt_utils import *
from dags.lib.IncrementalLoader import IncrementalLoader
from dags.lib.Processer import *

# Spark
from pyspark.sql.utils import AnalysisException
from delta import *

import math


In [24]:
# In-memory RDF graph that the later cells add triples to.
g = Graph()

# DBO is used below for predicates and classes, DBR for resource URIs.
DBO = Namespace("http://sdm_upc.org/ontology/")
DBR = Namespace("http://sdm_upc.org/resource/")

# Register the prefixes on the graph.
for prefix, namespace in (("dbo", DBO), ("dbr", DBR)):
    g.bind(prefix, namespace)



In [55]:
def consistent_hash(value):
    """Return a deterministic integer identifier for ``value``.

    ``value`` is converted with ``str()``, encoded with the default codec of
    ``str.encode`` and hashed with SHA-256; the digest is interpreted as one
    big-endian unsigned integer.
    """
    digest = hashlib.sha256(str(value).encode()).digest()
    return int.from_bytes(digest, "big")

In [None]:
# Number of synthetic user nodes generated for the graph.
USERS: int = 100
# Sampling fraction passed to DataFrame.sample() when the Delta tables are read.
PERCENTAGE: float = 0.1

In [None]:
# The flag is kept as text and parsed case-insensitively into a real boolean.
is_gcs_enabled = "False"
is_gcs_enabled = is_gcs_enabled.lower() == 'true'

# Project helper; presumably returns the Spark session and a storage base path
# for the chosen backend — confirm against dags.lib.pt_utils.
spark, base_path = get_spark_and_path(is_gcs_enabled)

# Built with os.path.join instead of a hand-written '..\..\data\...' literal:
# that literal relied on invalid escape sequences (\., \d, \l) and only worked
# on Windows. On Windows this yields the same string as before.
trusted_path = os.path.join("..", "..", "data", "letstalk_trusted_zone_bdma")

In [5]:
def keyword_extract(row, top_n=3):
    """Return the highest-scoring TF-IDF terms of one news row.

    The non-None values of ``row.content``, ``row.title`` and
    ``row.description`` are treated as separate documents; the TF-IDF scores
    of each term are summed across those documents and the ``top_n`` best
    terms are returned, best first.

    Args:
        row: Object exposing ``content``, ``title`` and ``description``
            attributes (each a string or None).
        top_n: Maximum number of keywords to return (default 3).

    Returns:
        A list of at most ``top_n`` terms. The list is empty when the row has
        no usable text (all fields None/blank, or only English stop words).
    """
    texts = [
        text
        for text in (row.content, row.title, row.description)
        if text is not None
    ]

    # Nothing to analyse: fit_transform() would raise on an empty corpus.
    if not any(str(text).strip() for text in texts):
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # the texts consist solely of stop words or punctuation.
        return []

    # Sum TF-IDF scores across all documents (one score per vocabulary term).
    scores = X.sum(axis=0).A1
    words = vectorizer.get_feature_names_out()

    # sorted() is stable, so equally scored terms keep vocabulary order.
    ranked = sorted(zip(words, scores), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:top_n]]
    
# The flag is kept as text and parsed case-insensitively into a real boolean.
is_gcs_enabled = "False"
is_gcs_enabled = is_gcs_enabled.lower() == 'true'

# Project helper; presumably returns the Spark session and a storage base path
# for the chosen backend — confirm against dags.lib.pt_utils.
spark, base_path = get_spark_and_path(is_gcs_enabled)

# Built with os.path.join instead of a hand-written '..\..\data\...' literal:
# that literal relied on invalid escape sequences (\., \d, \l) and only worked
# on Windows. On Windows this yields the same string as before.
trusted_path = os.path.join("..", "..", "data", "letstalk_trusted_zone_bdma")
subpath = 'movie'

path = os.path.join(trusted_path, subpath)
df = DeltaTable.forPath(spark, path).toDF().sample(fraction=PERCENTAGE, seed=42)
# Ids of the sampled films; reused further down when users are given liked movies.
film_ids = [row.film_id for row in df.select("film_id").distinct().collect()]


def _is_missing(value):
    """Return True when ``value`` is None or a NaN number."""
    if value is None:
        return True
    try:
        return math.isnan(value)
    except (TypeError, OverflowError):
        # Strings, dates and very large ints cannot be NaN.
        return False


# (predicate, row column, Python cast or None, XSD datatype) for every
# literal-valued movie property.
_MOVIE_LITERALS = (
    (DBO.movie_title, "title", str, XSD.string),
    # NOTE(review): the language predicate is fed from original_title, exactly
    # as before — confirm this is the intended column.
    (DBO.movie_language, "original_title", str, XSD.string),
    (DBO.movie_release_date, "release_date", None, XSD.date),
    (DBO.movie_revenue, "revenue", int, XSD.integer),
    (DBO.movie_budget, "budget", int, XSD.integer),
    (DBO.movie_runtime, "runtime", int, XSD.integer),
    # Was XSD.bolean (typo): that is not a term of the XSD namespace.
    (DBO.movie_adult, "adult", bool, XSD.boolean),
    # Were typed XSD.long although the values are floats; XSD.double matches.
    (DBO.movie_popularity, "popularity", float, XSD.double),
    (DBO.movie_vote_avg, "vote_average", float, XSD.double),
    (DBO.movie_vote_cnt, "vote_count", int, XSD.integer),
)

for row in df.toLocalIterator():
    subject = URIRef(DBR + f"film_{row.film_id}")

    for predicate, column, cast, datatype in _MOVIE_LITERALS:
        value = row[column]
        # Skip absent values instead of crashing in int()/float() or storing
        # the text "None"; previously only runtime was guarded this way.
        if _is_missing(value):
            continue
        lexical = value if cast is None else cast(value)
        g.add((subject, predicate, Literal(lexical, datatype=datatype)))

subpath = 'movie_genre'

path = os.path.join(trusted_path, subpath)
# NOTE(review): this link table is sampled independently of the movie table,
# so has_genre edges can point at films or genres that were not sampled —
# confirm whether dangling references are acceptable.
df = DeltaTable.forPath(spark, path).toDF().sample(fraction=PERCENTAGE, seed=42)

for row in df.toLocalIterator():
    subject = URIRef(DBR + f"film_{row.film_id}")
    # Named genre_uri rather than `object`, which shadowed the Python builtin
    # for the rest of the notebook session.
    genre_uri = URIRef(DBR + f"genre_{row.genre_id}")
    g.add((subject, DBO.has_genre, genre_uri))
    
subpath = 'genre'

path = os.path.join(trusted_path, subpath)
genre_table = DeltaTable.forPath(spark, path)
# NOTE(review): the genre table is sampled with the same fraction as the fact
# tables, so only part of the genres receive a genre_name — confirm intent.
df = genre_table.toDF().sample(fraction=PERCENTAGE, seed=42)

# One genre_name literal per sampled genre row.
for row in df.toLocalIterator():
    genre_label = Literal(str(row.genre), datatype=XSD.string)
    subject = URIRef(DBR + f"genre_{row.genre_id}")
    g.add((subject, DBO.genre_name, genre_label))
    

subpath = 'entertainment'

path = os.path.join(trusted_path, subpath)
# Uses the shared PERCENTAGE constant like the other tables (was a literal 0.1).
df = DeltaTable.forPath(spark, path).toDF().sample(fraction=PERCENTAGE, seed=42)

for row in df.toLocalIterator():
    # News items are identified by the hash of their URL.
    subject = URIRef(DBR + f"new_{consistent_hash(row.url)}")

    # rdf:type lives in the RDF namespace; RDFS defines no `type` term.
    g.add((subject, RDF.type, DBO.Entertainment_News))

    # A missing author or source must not become a shared "None" node.
    if row.author is not None:
        journalist = URIRef(DBR + f"journalist_{consistent_hash(row.author)}")
        g.add((subject, DBO.written_by, journalist))
        g.add((journalist, DBO.author_name, Literal(str(row.author), datatype=XSD.string)))

    for keyword in keyword_extract(row):
        keyword_node = URIRef(DBR + f"keyword_{consistent_hash(keyword)}")
        g.add((subject, DBO.related_keyword, keyword_node))
        g.add((keyword_node, DBO.keyword_text, Literal(str(keyword), datatype=XSD.string)))

    if row.source is not None:
        source_node = URIRef(DBR + f"source_{consistent_hash(row.source)}")
        g.add((subject, DBO.published_at, source_node))
        g.add((source_node, DBO.source_name, Literal(str(row.source), datatype=XSD.string)))

    if row.title is not None:
        g.add((subject, DBO.news_title, Literal(str(row.title), datatype=XSD.string)))

    # NOTE(review): the news date is read from `release_date`, as before —
    # confirm the entertainment table really exposes that column.
    if row.release_date is not None:
        g.add((subject, DBO.news_date, Literal(row.release_date, datatype=XSD.dateTime)))
   



# Seed the global generator so random.randint/random.choice below give the
# same likes on every run (matches the fixed seed=42 used for table sampling).
random.seed(42)

for i in range(USERS):
    subject = URIRef(DBR + f"user_{i}")
    g.add((subject, DBO.user_name, Literal(str(names.get_full_name()), datatype=XSD.string)))

    # Each user likes 0-3 films. random.choice() raises IndexError on an empty
    # sequence, so no likes are generated when the movie sample is empty.
    like_count = random.randint(0, 3) if film_ids else 0
    for _ in range(like_count):
        g.add((subject, DBO.likes_movie, URIRef(DBR + f"film_{random.choice(film_ids)}")))
    

INFO:dags.lib.pt_utils:False


INFO:dags.lib.pt_utils:False
INFO:py4j.clientserver:Error while receiving.
Traceback (most recent call last):
  File "c:\Users\josub\.conda\envs\spark311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=4736>
INFO:py4j.clientserver:Closing down clientserver connection
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\josub\.conda\envs\spark311\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: reentrant call inside <_io.BufferedReader name=4736>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\josub\.conda\envs\spark311\Lib\site-packages\py4j\java_gateway.py

Py4JError: An error occurred while calling z:io.delta.tables.DeltaTable.forPath

In [6]:
# The flag is kept as text and parsed case-insensitively into a real boolean.
is_gcs_enabled = "False"
is_gcs_enabled = is_gcs_enabled.lower() == 'true'

# Project helper; presumably returns the Spark session and a storage base path
# for the chosen backend — confirm against dags.lib.pt_utils.
spark, base_path = get_spark_and_path(is_gcs_enabled)

# Built with os.path.join instead of a hand-written '..\..\data\...' literal:
# that literal relied on invalid escape sequences (\., \d, \l) and only worked
# on Windows. On Windows this yields the same string as before.
trusted_path = os.path.join("..", "..", "data", "letstalk_trusted_zone_bdma")

INFO:dags.lib.pt_utils:False
INFO:dags.lib.pt_utils:False


In [29]:
from pyspark.sql.functions import col, concat_ws, when
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def extract_column_keywords(df, text_columns=('content', 'title', 'description'), top_n=10, per_doc=3):
    """Return the keywords that most often rank among each row's top TF-IDF terms.

    Every row's ``text_columns`` are concatenated into one document, a TF-IDF
    model is fitted on all documents, and each document votes for its
    ``per_doc`` best-scoring terms. The ``top_n`` terms with the most votes are
    returned.

    Args:
        df: Spark DataFrame containing the columns named in ``text_columns``.
        text_columns: Names of the text columns to combine (null values are
            replaced by empty strings).
        top_n: Number of keywords to return.
        per_doc: Number of top-scoring terms each document contributes.

    Returns:
        List of top keywords (without frequencies); empty when the frame has
        no usable text.
    """
    combined_df = df.withColumn(
        "combined_text",
        concat_ws(" ", *[
            when(col(c).isNotNull(), col(c)).otherwise("")
            for c in text_columns
        ]))

    # The whole text column is collected to the driver for scikit-learn.
    texts = [row.combined_text for row in combined_df.select("combined_text").collect()]

    # fit_transform() raises on an empty corpus; report "no keywords" instead.
    if not any(text.strip() for text in texts):
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # Empty vocabulary, e.g. the documents contain only stop words.
        return []

    feature_names = vectorizer.get_feature_names_out()
    keyword_votes = Counter()

    for i in range(X.shape[0]):
        scores = X[i].toarray().flatten()
        # Highest scores first; the slice also copes with per_doc == 0.
        top_indices = scores.argsort()[::-1][:per_doc]
        # A document with fewer than per_doc distinct terms used to vote for
        # arbitrary zero-score features; only terms it contains may count.
        keyword_votes.update(feature_names[j] for j in top_indices if scores[j] > 0)

    # Return just the words as a list
    return [word for word, _ in keyword_votes.most_common(top_n)]

In [4]:


# Built with os.path.join instead of a hand-written '..\..\data\...' literal:
# that literal relied on invalid escape sequences (\., \d, \l) and only worked
# on Windows. On Windows this yields the same string as before.
trusted_path = os.path.join("..", "..", "data", "letstalk_trusted_zone_bdma")

subpath = 'technology'

path = os.path.join(trusted_path, subpath)
# `spark` must already exist: run the Spark setup cell first, otherwise this
# line fails with NameError.
df = DeltaTable.forPath(spark, path).toDF()

top_keywords = extract_column_keywords(df, top_n=25)
print(top_keywords)


NameError: name 'spark' is not defined

In [None]:
from pyspark.ml.feature import CountVectorizer, RegexTokenizer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, concat_ws, when, posexplode, lit, array
import os
from delta.tables import DeltaTable

# Read the two Delta tables that find_movie_mentions() compares: the movie
# table (source of titles) and the entertainment news table (source of text).
path = os.path.join(trusted_path, "movie")
movies_table = DeltaTable.forPath(spark, path)
movies_df = movies_table.toDF()

path = os.path.join(trusted_path, "entertainment")
news_table = DeltaTable.forPath(spark, path)
news_df = news_table.toDF()

def find_movie_mentions(news_df, movies_df, text_columns=('content', 'title', 'description')):
    """Find news rows whose text contains a token equal to a movie title.

    The news text columns are concatenated, lower-cased and split on non-word
    characters; every token that equals a lower-cased movie title counts as a
    mention. Because matching is done token by token, only single-word titles
    can ever match.

    Args:
        news_df: Spark DataFrame with a ``title`` column plus ``text_columns``.
        movies_df: Spark DataFrame with a ``title`` column.
        text_columns: News columns whose text is searched.

    Returns:
        Spark DataFrame with the columns ``title`` (news title),
        ``movie_title`` (matched token) and ``count`` (occurrences of the
        token in that news row).
    """
    from itertools import chain

    from pyspark.ml.functions import vector_to_array
    from pyspark.sql.functions import create_map

    # Step 1: Prepare movie vocabulary (lowercase). A set gives O(1) lookups
    # in Step 6, and rows without a title are skipped instead of crashing.
    movie_vocab = {
        row.title.lower()
        for row in movies_df.select("title").distinct().collect()
        if row.title is not None
    }

    # Step 2: Combine text columns in news data
    news_combined = news_df.withColumn(
        "combined_text",
        concat_ws(" ", *[
            when(col(c).isNotNull(), col(c)).otherwise("")
            for c in text_columns
        ])
    )

    # Step 3: Tokenize text
    tokenizer = RegexTokenizer(
        inputCol="combined_text",
        outputCol="words",
        pattern=r"\W+",
        toLowercase=True
    )

    # Step 4: Configure CountVectorizer
    cv = CountVectorizer(
        inputCol="words",
        outputCol="movie_hits",
        vocabSize=100000,  # Large enough to capture all terms
        minDF=1.0
    )

    # Step 5: Build and run pipeline
    pipeline = Pipeline(stages=[tokenizer, cv])
    model = pipeline.fit(news_combined)

    # Step 6: Get vocabulary and find movie indices
    vocab = model.stages[-1].vocabulary
    movie_indices = [i for i, word in enumerate(vocab) if word in movie_vocab]

    # Step 7: Transform data and find matches.
    # movie_hits is an ML vector column; posexplode() only accepts arrays or
    # maps, so the vector is converted with vector_to_array() first.
    exploded = model.transform(news_combined).select(
        "title",
        posexplode(vector_to_array(col("movie_hits"))).alias("pos", "count")
    )

    if not movie_indices:
        # No vocabulary term is a movie title: return an empty frame with the
        # usual columns rather than building an empty lookup map.
        return (
            exploded
            .filter(lit(False))
            .withColumn("movie_title", lit(None).cast("string"))
            .select("title", "movie_title", "count")
        )

    # `pos` is an index into the full vocabulary, so the title lookup must be
    # keyed by that index. The previous array held only the matching terms and
    # was indexed with `pos`, which returned the wrong title or null.
    index_to_title = create_map(*chain.from_iterable(
        (lit(i), lit(vocab[i])) for i in movie_indices
    ))

    result = exploded.filter(
        (col("pos").isin(movie_indices)) &
        (col("count") > 0)
    ).withColumn(
        "movie_title",
        index_to_title[col("pos")]
    )

    return result.select("title", "movie_title", "count")

# Run the mention search on the loaded frames and print the matches with
# untruncated column values.
matches_df = find_movie_mentions(news_df=news_df, movies_df=movies_df)
matches_df.show(truncate=False)