In [None]:
# Python Version: "3.10.5"
# Java Version: "1.8.0_421"

In [None]:
# Install the project's dependencies into the kernel's environment from the
# shared requirements file located two directories above this notebook.
%pip install -r ../../requirements.txt

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
# It runs in local mode and requests the Spark NLP artifact through
# spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Spark NLP Example")
    .master("local[*]")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.4.1")
    .getOrCreate()
)
    

In [None]:
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetectorDLModel, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Report the Spark NLP and Apache Spark versions in use.
for label, value in (
    ("Spark NLP version:", sparknlp.version()),
    ("Apache Spark version:", spark.version),
):
    print(label, value)

In [None]:
# Execute the shared UtilityClustering file so that its definitions become
# available in this notebook.
# NOTE(review): the PCA / clustering / plotting helpers called further down
# (e.g. pipelineStandardPCA, defineClustering) are not defined in this
# notebook and presumably come from this file or from UtilityNLP — confirm.
%run ../../Common/UtilityClustering

In [None]:
# Execute the shared UtilityNLP file so that its definitions become available
# in this notebook.
# NOTE(review): helpers such as nlp_pipeline_bert_sentence_embedding,
# convert_sentence_embedding_in_col and top_n_words are not defined in this
# notebook and presumably come from this file — confirm.
%run ../../Common/UtilityNLP

In [None]:
import os

# Show which Java installation the environment points at.
# The previous `echo %JAVA_HOME%` was not Python: it only ran where IPython
# exposes an `echo` shell alias and the shell expands %VAR% (Windows cmd).
# Reading the environment directly behaves the same on every platform and
# makes an unset variable explicit instead of echoing the raw placeholder.
java_home = os.environ.get("JAVA_HOME")
print(java_home if java_home else "JAVA_HOME is not set")


In [None]:
# Read the example sentences (the first CSV row is the header), mark the
# DataFrame as cached for the actions that follow, and preview it.
df = (
    spark.read
    .option("header", True)
    .csv('../dati/input/esempio_frasi_1.csv')
    .cache()
)
df.show()

In [None]:
# Build the sentence-embedding pipeline, fit it on the input DataFrame and
# apply the fitted model to the same DataFrame.
# NOTE(review): nlp_pipeline_bert_sentence_embedding is not defined in this
# notebook; "descrizione" is its only argument and is presumably the name of
# the text column to embed — confirm against the helper.
model = nlp_pipeline_bert_sentence_embedding("descrizione").fit(df)
result_bert = model.transform(df)

In [None]:
# Mark the transformed DataFrame as cached (cache() returns the same
# DataFrame) and render it.
# NOTE(review): DataFrame.display() is not part of the open-source PySpark
# API; it is available on Databricks or if a helper adds it — confirm it
# exists in the environment this notebook targets.
result_bert.cache().display()

In [None]:
# Compare the row count of the input DataFrame with the row count after the
# NLP pipeline has been applied.
for template, frame in (
    ("# dataset originale: {}", df),
    ("# dataset nuovo: {}", result_bert),
):
    print(template.format(frame.count()))

In [None]:
# Expand the sentence embeddings into individual columns, passing along the
# list of identifying columns, then render the result.
# NOTE(review): the exact output schema is defined by
# convert_sentence_embedding_in_col, which is not visible here.
result_df_exp = convert_sentence_embedding_in_col(
    result_bert,
    ["idcase", "descrizione", "dataapertura"],
)
result_df_exp.display()

In [None]:
# Every column from the fifth one onwards is used as a feature.
# NOTE(review): the offset 4 presumably skips idcase, descrizione,
# dataapertura and a fourth non-feature column (likely "sentence") — confirm
# against the schema of result_df_exp.
col_features = result_df_exp.columns[4:]

# Rows containing any null value are dropped (despite the "_filled" name,
# nothing is imputed here).
result_df_exp_filled = result_df_exp.dropna()

# NOTE(review): 30 is presumably the number of principal components to keep —
# confirm against pipelineStandardPCA.
result, pca_model, loadings = pipelineStandardPCA(
    result_df_exp_filled,
    col_features,
    30,
)

In [None]:
# Inspect the fitted PCA model.
# NOTE(review): judging by its name this reports/plots the cumulative
# explained variance per principal component — confirm against the helper.
cumulativePCwithVariance(pca_model)

In [None]:
# Evaluate K-Means on the "pca_features" column to help choose the number of
# clusters.
# NOTE(review): m, n and i presumably are the lower bound, upper bound and
# step of the k values tried — confirm against silhouetteClusteringKMeans.
silhouetteClusteringKMeans(result,"pca_features",m=2,n=20,i=2)

In [None]:
# Fit the final clustering and keep both the per-row predictions and the
# fitted model.
# NOTE(review): 12 is presumably the chosen number of clusters (picked from
# the silhouette analysis) — confirm against defineClustering.
predictions_cluster_final, final_model=defineClustering(result, 12)

In [None]:
# Show each distinct (case, text, date, sentence) row with its assigned
# cluster.
(
    predictions_cluster_final
    .select("idcase", "descrizione", "dataapertura", "sentence", "prediction")
    .distinct()
    .display()
)

In [None]:
# Interactive 3D PCA plot restricted to rows whose dataapertura is on or
# after 2024-06-01.
# NOTE(review): if dataapertura is still a string column this is a
# lexicographic comparison, which only matches date order for ISO
# yyyy-MM-dd values — confirm the column format.
plotPCA3DInterattivo(
    predictions_cluster_final.where("dataapertura >= '2024-06-01'"),
    features='pca_features',
    predictions='prediction',
    additional_column='descrizione',
)

In [None]:
# Interactive 3D cluster plot restricted to rows whose dataapertura is on or
# after 2024-06-01.
# NOTE(review): if dataapertura is still a string column this is a
# lexicographic comparison, which only matches date order for ISO
# yyyy-MM-dd values — confirm the column format.
plotClustering3DInterattivo(
    predictions_cluster_final.where("dataapertura >= '2024-06-01'"),
    features='pca_features',
    predictions='prediction',
    additional_column='descrizione',
)

In [None]:
# concat_ws, collect_list and col are not imported anywhere else in this
# notebook; import them explicitly so the cell does not depend on names that
# happen to be left in the namespace by the %run utility files.
from pyspark.sql.functions import col, collect_list, concat_ws

# One row per cluster: all of the cluster's sentences joined with single
# spaces into one 'Doc' string.
docs_per_topic = (
    predictions_cluster_final
    .groupby('prediction')
    .agg(concat_ws(' ', collect_list(col("sentence"))).alias('Doc'))
)

# Rank terms per cluster using n-grams with ngram=3, keeping N=10 per cluster.
# NOTE(review): the exact scoring is defined by top_n_words, which is not
# visible in this notebook.
topN = top_n_words(
    docs_per_topic,
    inputCol="Doc",
    outputCol="features",
    ngram=3,
    N=10,
    targetCol="prediction",
)

topN.display()

In [None]:
# Same ranking as the 3-gram cell, but with ngram=4.
# docs_per_topic (one 'Doc' string per cluster) is already defined by the
# preceding cell with exactly the same aggregation, so it is reused here
# instead of repeating the copy-pasted groupby/agg expression.
topN_4gram = top_n_words(
    docs_per_topic,
    inputCol="Doc",
    outputCol="features",
    ngram=4,
    N=10,
    targetCol="prediction",
)

topN_4gram.display()

In [None]:
# Number of rows assigned to each cluster.
(
    predictions_cluster_final
    .groupBy("prediction")
    .count()
    .display()
)

In [None]:
# Attach the 3-gram top-words result of each cluster to every case via a left
# join on the cluster id, then render the result.
(
    predictions_cluster_final
    .select("idcase", "descrizione", "prediction")
    .join(topN, "prediction", "left")
    .display()
)