In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for this notebook.
session_builder = SparkSession.builder.appName("UserStory8_IntentClassification")
spark = session_builder.getOrCreate()

# Load the interaction log, treating the first row as the header and letting
# Spark infer each column's type from the data.
data_path = r'/content/trading_bot_dataset_with_user_profiles1.csv'
df = spark.read.options(header=True, inferSchema=True).csv(data_path)

In [None]:
# Print the column names and inferred types to confirm the CSV was parsed as expected.
df.printSchema()

root
 |-- Interaction ID: integer (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- User Query: string (nullable = true)
 |-- Intent Detected: string (nullable = true)
 |-- Bot Response: string (nullable = true)
 |-- Response Time (ms): integer (nullable = true)
 |-- Prediction Accuracy (%): integer (nullable = true)
 |-- Entity Extraction Accuracy (%): integer (nullable = true)
 |-- User Sentiment: string (nullable = true)
 |-- User Feedback: string (nullable = true)
 |-- Conversation Success: string (nullable = true)
 |-- User ID: string (nullable = true)
 |-- User Type: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Device Type: string (nullable = true)
 |-- Account Age (months): integer (nullable = true)
 |-- Is Premium: boolean (nullable = true)



In [None]:
from pyspark.sql.functions import when, lower, col, lit, avg
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

# Obtain the Spark session for the sentiment-classification steps.
spark = SparkSession.builder.appName("SentimentClassifier").getOrCreate()

# Location of the interaction-log CSV.
csv_path = r'/content/trading_bot_dataset_with_user_profiles1.csv'

# Read the CSV with a header row. No schema inference is requested here, so
# every column is loaded as a string.
# Later steps rely on the columns: User Feedback, User Sentiment, User ID, Interaction ID.
raw_df = spark.read.csv(csv_path, header=True)

# Sanity check: print the schema to verify the load worked.
raw_df.printSchema()


root
 |-- Interaction ID: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- User Query: string (nullable = true)
 |-- Intent Detected: string (nullable = true)
 |-- Bot Response: string (nullable = true)
 |-- Response Time (ms): string (nullable = true)
 |-- Prediction Accuracy (%): string (nullable = true)
 |-- Entity Extraction Accuracy (%): string (nullable = true)
 |-- User Sentiment: string (nullable = true)
 |-- User Feedback: string (nullable = true)
 |-- Conversation Success: string (nullable = true)
 |-- User ID: string (nullable = true)
 |-- User Type: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Device Type: string (nullable = true)
 |-- Account Age (months): string (nullable = true)
 |-- Is Premium: string (nullable = true)



In [None]:
# Preview the first five rows without truncating long cell values.
raw_df.show(5, truncate=False)

+--------------+----------------+----------------------------------+----------------+-------------------------------+------------------+-----------------------+------------------------------+--------------+-----------------+--------------------+-------+-------------+-------------+-----------+--------------------+----------+
|Interaction ID|Timestamp       |User Query                        |Intent Detected |Bot Response                   |Response Time (ms)|Prediction Accuracy (%)|Entity Extraction Accuracy (%)|User Sentiment|User Feedback    |Conversation Success|User ID|User Type    |Region       |Device Type|Account Age (months)|Is Premium|
+--------------+----------------+----------------------------------+----------------+-------------------------------+------------------+-----------------------+------------------------------+--------------+-----------------+--------------------+-------+-------------+-------------+-----------+--------------------+----------+
|1             |2020-0

In [None]:
# Text-featurisation stages for the "User Feedback" column, in pipeline order:
# split into words -> drop stop words -> hash into 1000 term-count buckets -> IDF weighting.
tokenizer = Tokenizer(
    inputCol="User Feedback",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=1000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [None]:
# Chain the text-feature stages into a single pipeline.
feature_pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])

# Tokenizer fails on null input, and empty CSV cells are read as null, so a
# single missing "User Feedback" value would abort the whole job. Replace
# missing feedback with an empty string before featurising.
feedback_df = raw_df.na.fill("", subset=["User Feedback"])

# Fit once and keep the fitted pipeline model so the same IDF weights can be
# reused to featurise other data later.
feature_model = feature_pipeline.fit(feedback_df)
enriched_df = feature_model.transform(feedback_df)

In [None]:
# Encode the sentiment text as a numeric class label (case-insensitive):
#   "positive" -> 2.0, "neutral" -> 1.0, anything else -> 0.0.
sentiment_lc = lower(col("User Sentiment"))
label_expr = (
    when(sentiment_lc == "positive", 2.0)
    .when(sentiment_lc == "neutral", 1.0)
    .otherwise(0.0)
)
enriched_df = enriched_df.withColumn("label", label_expr)

In [None]:
# Keep just the model inputs: the feature vector and the numeric label.
training_data = enriched_df.select(col("features"), col("label"))

# Preview the first five rows, untruncated so the sparse vectors are visible.
print("=== Training data (features and label) sample ===")
training_data.show(n=5, truncate=False)

=== Training data (features and label) sample ===
+--------------------------------------------------------+-----+
|features                                                |label|
+--------------------------------------------------------+-----+
|(1000,[286],[0.5101598447800131])                       |2.0  |
|(1000,[678,740],[0.9147933520086514,0.9147933520086514])|0.0  |
|(1000,[678,740],[0.9147933520086514,0.9147933520086514])|0.0  |
|(1000,[678,740],[0.9147933520086514,0.9147933520086514])|1.0  |
|(1000,[678,740],[0.9147933520086514,0.9147933520086514])|0.0  |
+--------------------------------------------------------+-----+
only showing top 5 rows



In [None]:
# Logistic-regression classifier for the multi-class sentiment label,
# capped at 10 optimisation iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)

# Fit on the (features, label) pairs.
model = lr.fit(training_data)

# Score every row of the enriched dataset. NOTE: these are the same rows the
# model was fitted on, so the sample below is not a held-out evaluation.
predictions = model.transform(enriched_df)

print("=== Predictions sample ===")
preview_columns = ["User Feedback", "User Sentiment", "features", "label", "prediction", "probability"]
predictions.select(*preview_columns).show(5, truncate=False)

# The following cell derives an 'Adaptive_Action' from the predicted class:
#   0 (negative) -> "Simplify Response"
#   1 (neutral)  -> "Neutral - Monitor"
#   otherwise (2 = positive) -> "No Change"

=== Predictions sample ===
+-----------------+--------------+--------------------------------------------------------+-----+----------+----------------------------------------------------------------+
|User Feedback    |User Sentiment|features                                                |label|prediction|probability                                                     |
+-----------------+--------------+--------------------------------------------------------+-----+----------+----------------------------------------------------------------+
|Helpful          |Positive      |(1000,[286],[0.5101598447800131])                       |2.0  |2.0       |[1.3993849606908003E-4,1.1092330660680896E-4,0.9997491381973241]|
|Needs improvement|Negative      |(1000,[678,740],[0.9147933520086514,0.9147933520086514])|0.0  |0.0       |[0.7480047468142033,0.2518456483961935,1.4960478960322868E-4]   |
|Needs improvement|Negative      |(1000,[678,740],[0.9147933520086514,0.9147933520086514])|0.0  |0.0   

In [None]:
# Translate the predicted class into the action the bot should take:
# 0.0 -> "Simplify Response", 1.0 -> "Neutral - Monitor", anything else -> "No Change".
predicted_class = col("prediction")
action_expr = (
    when(predicted_class == 0.0, "Simplify Response")
    .when(predicted_class == 1.0, "Neutral - Monitor")
    .otherwise("No Change")
)
adaptive_actions = predictions.withColumn("Adaptive_Action", action_expr)

print("=== After adding Adaptive_Action column ===")
action_preview = adaptive_actions.select("User Feedback", "prediction", "Adaptive_Action")
action_preview.show(5, truncate=False)


=== After adding Adaptive_Action column ===
+-----------------+----------+-----------------+
|User Feedback    |prediction|Adaptive_Action  |
+-----------------+----------+-----------------+
|Helpful          |2.0       |No Change        |
|Needs improvement|0.0       |Simplify Response|
|Needs improvement|0.0       |Simplify Response|
|Needs improvement|0.0       |Simplify Response|
|Needs improvement|0.0       |Simplify Response|
+-----------------+----------+-----------------+
only showing top 5 rows



In [None]:
# Per-user mean of the predicted class (0 = negative ... 2 = positive).
interactions_by_user = adaptive_actions.groupBy("User ID")
user_sentiment_avg = interactions_by_user.agg(
    avg(col("prediction")).alias("Avg_Sentiment")
)

print("=== Average sentiment per user ===")
user_sentiment_avg.show(n=5, truncate=False)

=== Average sentiment per user ===
+-------+------------------+
|User ID|Avg_Sentiment     |
+-------+------------------+
|U10088 |1.0               |
|U10023 |0.8               |
|U10165 |0.75              |
|U10086 |1.6666666666666667|
|U10261 |0.6666666666666666|
+-------+------------------+
only showing top 5 rows



In [None]:
# Mark a user for automatic escalation when their average predicted class
# is below 0.5; every other user gets False.
is_low_sentiment = col("Avg_Sentiment") < 0.5
escalation_flags = user_sentiment_avg.withColumn(
    "Auto_Escalate",
    when(is_low_sentiment, True).otherwise(False),
)

print("=== Escalation flags for users ===")
escalation_flags.show(n=5, truncate=False)


=== Escalation flags for users ===
+-------+------------------+-------------+
|User ID|Avg_Sentiment     |Auto_Escalate|
+-------+------------------+-------------+
|U10088 |1.0               |false        |
|U10023 |0.8               |false        |
|U10165 |0.75              |false        |
|U10086 |1.6666666666666667|false        |
|U10261 |0.6666666666666666|false        |
+-------+------------------+-------------+
only showing top 5 rows



In [None]:
# Attach each user's escalation flag to every one of their interactions
# (left join, so all interaction rows are kept).
final_output = adaptive_actions.join(escalation_flags, on="User ID", how="left")

print("=== Final output with escalation flags ===")
summary_columns = [
    "Interaction ID",
    "User ID",
    "User Feedback",
    "User Sentiment",
    "prediction",
    "Adaptive_Action",
    "Auto_Escalate",
]
final_output.select(*summary_columns).show(5, truncate=False)

=== Final output with escalation flags ===
+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|Interaction ID|User ID|User Feedback    |User Sentiment|prediction|Adaptive_Action  |Auto_Escalate|
+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|1             |U10166 |Helpful          |Positive      |2.0       |No Change        |false        |
|2             |U10030 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
|3             |U10253 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
|4             |U10255 |Needs improvement|Neutral       |0.0       |Simplify Response|false        |
|5             |U10085 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
only showing top 5 rows



In [None]:
# Left-join the per-user escalation flags onto the interaction-level rows by
# "User ID", then print the default preview of the full joined frame.
final_output = adaptive_actions.join(
    other=escalation_flags,
    on="User ID",
    how="left",
)
final_output.show()


+-------+--------------+----------------+--------------------+--------------------+--------------------+------------------+-----------------------+------------------------------+--------------+-----------------+--------------------+-------------+-------------+-----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+-----------------+------------------+-------------+
|User ID|Interaction ID|       Timestamp|          User Query|     Intent Detected|        Bot Response|Response Time (ms)|Prediction Accuracy (%)|Entity Extraction Accuracy (%)|User Sentiment|    User Feedback|Conversation Success|    User Type|       Region|Device Type|Account Age (months)|Is Premium|               words|            filtered|         rawFeatures|            features|label|       rawPrediction|         probability|prediction|  Adaptive_Action|     Avg_Sentiment|Auto_Escalate|
+-

In [None]:
# Show the actionable columns for each interaction, without truncating values.
actionable_columns = [
    "Interaction ID",
    "User ID",
    "User Feedback",
    "User Sentiment",
    "prediction",
    "Adaptive_Action",
    "Auto_Escalate",
]
final_output.select(*actionable_columns).show(truncate=False)

+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|Interaction ID|User ID|User Feedback    |User Sentiment|prediction|Adaptive_Action  |Auto_Escalate|
+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|1             |U10166 |Helpful          |Positive      |2.0       |No Change        |false        |
|2             |U10030 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
|3             |U10253 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
|4             |U10255 |Needs improvement|Neutral       |0.0       |Simplify Response|false        |
|5             |U10085 |Needs improvement|Negative      |0.0       |Simplify Response|false        |
|6             |U10228 |Helpful          |Positive      |2.0       |No Change        |false        |
|7             |U10081 |Helpful          |Positive      |2.0       |No Change        |false

In [None]:
# Interactions belonging to users flagged for automatic escalation.
escalation_view = final_output.select(
    "Interaction ID",
    "User ID",
    "User Feedback",
    "User Sentiment",
    "prediction",
    "Adaptive_Action",
    "Auto_Escalate",
)
escalation_view.where(col("Auto_Escalate") == True).show(truncate=False)


+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|Interaction ID|User ID|User Feedback    |User Sentiment|prediction|Adaptive_Action  |Auto_Escalate|
+--------------+-------+-----------------+--------------+----------+-----------------+-------------+
|22            |U10279 |Needs improvement|Negative      |0.0       |Simplify Response|true         |
|28            |U10248 |Needs improvement|Negative      |0.0       |Simplify Response|true         |
|83            |U10233 |Needs improvement|Negative      |0.0       |Simplify Response|true         |
|86            |U10189 |Needs improvement|Neutral       |0.0       |Simplify Response|true         |
|94            |U10012 |Needs improvement|Negative      |0.0       |Simplify Response|true         |
|116           |U10155 |Needs improvement|Negative      |0.0       |Simplify Response|true         |
|120           |U10180 |Needs improvement|Neutral       |0.0       |Simplify Response|true 

In [None]:
# Session-level flag: True when the predicted class is 1.0 or lower
# (i.e. neutral or negative), False otherwise.
needs_attention = col("prediction") <= 1.0
final_output = final_output.withColumn(
    "Session_Escalate",
    when(needs_attention, True).otherwise(False),
)
final_output.show()

+-------+--------------+----------------+--------------------+--------------------+--------------------+------------------+-----------------------+------------------------------+--------------+-----------------+--------------------+-------------+-------------+-----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+-----------------+------------------+-------------+----------------+
|User ID|Interaction ID|       Timestamp|          User Query|     Intent Detected|        Bot Response|Response Time (ms)|Prediction Accuracy (%)|Entity Extraction Accuracy (%)|User Sentiment|    User Feedback|Conversation Success|    User Type|       Region|Device Type|Account Age (months)|Is Premium|               words|            filtered|         rawFeatures|            features|label|       rawPrediction|         probability|prediction|  Adaptive_Action|     Avg_Sentiment|

In [None]:
# Keep only the rows flagged at session level and print them untruncated.
session_flagged = final_output.where(col("Session_Escalate") == True)
session_flagged.show(truncate=False)


+-------+--------------+----------------+----------------------------------+-----------------+--------------------------------------+------------------+-----------------------+------------------------------+--------------+-----------------+--------------------+-------------+-------------+-----------+--------------------+----------+--------------------+--------------------+--------------------------+--------------------------------------------------------+-----+--------------------------------------------------------+-------------------------------------------------------------+----------+-----------------+------------------+-------------+----------------+
|User ID|Interaction ID|Timestamp       |User Query                        |Intent Detected  |Bot Response                          |Response Time (ms)|Prediction Accuracy (%)|Entity Extraction Accuracy (%)|User Sentiment|User Feedback    |Conversation Success|User Type    |Region       |Device Type|Account Age (months)|Is Premium|wor

In [None]:
# Sessions that need a better bot response: flagged either at user level
# (Auto_Escalate) or at session level (Session_Escalate).
is_flagged = (col("Auto_Escalate") == True) | (col("Session_Escalate") == True)
review_columns = [
    "Interaction ID",
    "User ID",
    "User Query",
    "Bot Response",
    "User Sentiment",
    "User Feedback",
    "Adaptive_Action",
]
sessions_to_improve = final_output.where(is_flagged).select(*review_columns)

sessions_to_improve.show(truncate=False)


+--------------+-------+----------------------------------+--------------------------------------+--------------+-----------------+-----------------+
|Interaction ID|User ID|User Query                        |Bot Response                          |User Sentiment|User Feedback    |Adaptive_Action  |
+--------------+-------+----------------------------------+--------------------------------------+--------------+-----------------+-----------------+
|2             |U10030 |What is your prediction for Forex?|Forex markets favor USD today.        |Negative      |Needs improvement|Simplify Response|
|3             |U10253 |Any news on Gold?                 |Investors are flocking to Gold.       |Negative      |Needs improvement|Simplify Response|
|4             |U10255 |Should I buy or sell WTI?         |WTI is showing high volatility.       |Neutral       |Needs improvement|Simplify Response|
|5             |U10085 |What's the trend with Tesla?      |Tesla shows bullish signs.            |Ne

In [None]:
# List the columns carried into the review set.
print(sessions_to_improve.columns)

['Interaction ID', 'User ID', 'User Query', 'Bot Response', 'User Sentiment', 'User Feedback', 'Adaptive_Action']


In [None]:
# Step 1: Keep only the user query and the bot's current response.
sessions_for_review = sessions_to_improve.select("User Query", "Bot Response")

# Step 2: Collect to the driver as a pandas DataFrame.
pandas_review_df = sessions_for_review.toPandas()

# Step 3: Add an empty column for reviewers to fill in a suggested response.
pandas_review_df["Improved Bot Response Suggestion"] = ""

# Step 4: Save to CSV for manual review.
# Written to the same absolute path that the later cells read back
# ('/content/bot_response_review_new.csv'), so the hand-off no longer depends
# on the notebook's current working directory.
output_path = r'/content/bot_response_review_new.csv'
pandas_review_df.to_csv(output_path, index=False)

# Step 5: Report where the file was written.
print(f"✅ CSV created for bot response improvement at:\n{output_path}")


✅ CSV created for bot response improvement at:
bot_response_review_new.csv


In [None]:
# Step 1: Load the exported review CSV back into Spark.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ImprovedBotResponse").getOrCreate()

# Read with a header row; all columns are loaded as strings.
csv_path = r'/content/bot_response_review_new.csv'
df = spark.read.csv(csv_path, header=True)

# Confirm the three expected columns are present.
df.printSchema()




root
 |-- User Query: string (nullable = true)
 |-- Bot Response: string (nullable = true)
 |-- Improved Bot Response Suggestion: string (nullable = true)



In [None]:
# Step 2: Convert the Spark DataFrame to pandas so the rows can be passed to
# the transformer model on the driver.
pandas_df = df.toPandas()

In [None]:
#🤖 Step 3: Use Hugging Face Transformers to Improve Responses
#We'll use transformers library (like t5-small or gpt2) to rephrase or improve bot responses given the query.
#Install required packages first:
#bash
#CopyEdit
!pip install transformers sentencepiece




In [None]:


# If the packages were just installed, restart the runtime
# (Runtime > Restart runtime) before running this cell.

# Imports for the paraphrasing step.
from transformers import pipeline
import pandas as pd

# Load the paraphrasing model as a text-to-text generation pipeline.
rephrase_pipeline = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")

def improve_response(query, bot_response):
    """Return a paraphrase of ``bot_response`` produced by ``rephrase_pipeline``.

    Args:
        query: The user's query. Not used by the paraphraser; accepted so
            callers can pass the full (query, response) pair.
        bot_response: The bot's original reply.

    Returns:
        ``""`` when ``bot_response`` is not a string or is empty/whitespace-only
        (e.g. NaN from a missing CSV cell). Otherwise the first sampled
        paraphrase that differs from the original, compared case-insensitively
        after stripping; if every sample matches the original, the original
        text with ``" (Rephrased)"`` appended.
    """
    # Nothing to paraphrase: missing value, non-text value, or blank text.
    if not isinstance(bot_response, str) or not bot_response.strip():
        return ""

    input_text = f"paraphrase: {bot_response} </s>"

    # Sample several candidates to encourage variation.
    # The output length is capped with max_new_tokens: passing max_length
    # alongside the pipeline's default max_new_tokens makes the library ignore
    # max_length, so the intended 60-token cap was never applied.
    outputs = rephrase_pipeline(
        input_text,
        max_new_tokens=60,
        num_return_sequences=3,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        clean_up_tokenization_spaces=True
    )

    # Return the first candidate that actually differs from the original.
    original_normalized = bot_response.strip().lower()
    for out in outputs:
        candidate = out['generated_text'].strip()
        if candidate.lower() != original_normalized:
            return candidate

    # No candidate differed: fall back to the original with a marker appended.
    return bot_response + " (Rephrased)"

# Read the review sheet into pandas.
csv_path = '/content/bot_response_review_new.csv'  # <-- change this to your file path
df = pd.read_csv(csv_path)


def _paraphrase_row(row):
    """Paraphrase the bot response held in one row of the review sheet."""
    return improve_response(row["User Query"], row["Bot Response"])


# Paraphrase every row and store the result in a new column.
df["Improved Bot Response"] = df.apply(_paraphrase_row, axis=1)

# Write the augmented sheet to a new CSV file.
output_path = '/content/bot_response_review.csv'  # change as needed
df.to_csv(output_path, index=False)

print(f"✅ Improved responses saved to {output_path}")

# Optional: print the first few rows to verify the result.
comparison_columns = ["User Query", "Bot Response", "Improved Bot Response"]
print(df[comparison_columns].head())


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes

✅ Improved responses saved to /content/bot_response_review.csv
                           User Query                     Bot Response  \
0  What is your prediction for Forex?   Forex markets favor USD today.   
1                   Any news on Gold?  Investors are flocking to Gold.   
2           Should I buy or sell WTI?  WTI is showing high volatility.   
3        What's the trend with Tesla?       Tesla shows bullish signs.   
4    What is your prediction for USD?              USD remains stable.   

                 Improved Bot Response  
0  Today, the Forex markets favor USD.  
1             Investors flock to gold.  
2           WTI shows high volatility.  
3        Tesla displays bullish signs.  
4              The USD remains stable.  
