Importing the necessary libraries

In [305]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



In [306]:
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
# Force download to the correct path
nltk.download('punkt', download_dir=r"C:\Users\anshk\AppData\Roaming\nltk_data")

# Append the correct nltk data path
nltk.data.path.append(r"C:\Users\anshk\AppData\Roaming\nltk_data")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [307]:
df = pd.read_excel('dataset.xls')
df.head()

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2
1,2,Can you tell me more about the UltraClean Vacu...,General Inquiry,,UltraClean Vacuum
2,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300
3,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam
4,5,Order #30903 for Vision LED TV is 13 days late...,Late Delivery,,Vision LED TV


In [308]:
# df.info()

In [309]:
df.isnull().sum().sort_values(ascending=False)

issue_type       76
ticket_text      55
urgency_level    52
ticket_id         0
product           0
dtype: int64

In [310]:
df.dropna(inplace=True)
df.isnull().sum().sort_values(ascending=False)

ticket_id        0
ticket_text      0
issue_type       0
urgency_level    0
product          0
dtype: int64

In [311]:
# duplicate = df[df.duplicated()]
# print(duplicate)

In [312]:
# df = df.drop_duplicates(keep=False)
# df.info()

In [313]:

# def clean_text(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation
#     text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
#     return text

# df['ticket_text'] = df['ticket_text'].apply(clean_text)
# df['issue_type'] = df['issue_type'].apply(clean_text)
# df['urgency_level'] = df['urgency_level'].apply(clean_text)
# df['product'] = df['product'].apply(clean_text)
# df.head()

In [314]:


# Initialize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply to your DataFrame
df['ticket_text'] = df['ticket_text'].apply(preprocess_text)
df['issue_type'] = df['issue_type'].apply(preprocess_text)
df['urgency_level'] = df['urgency_level'].apply(preprocess_text)
df.head()



Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product
0,1,payment issue smartwatch v underbilled order,billing problem,medium,SmartWatch V2
2,3,ordered soundwave got ecobreeze ac instead ord...,wrong item,medium,SoundWave 300
3,4,facing installation issue photosnap cam setup ...,installation issue,low,PhotoSnap Cam
5,6,tell photosnap cam warranty also available red,general inquiry,medium,PhotoSnap Cam
6,7,malfunction stopped working day,product defect,low,EcoBreeze AC


In [315]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['ticket_text'])


In [316]:
from textblob import TextBlob

df['text_length'] = df['ticket_text'].apply(len)
df['sentiment'] = df['ticket_text'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [317]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER sentiment lexicon (only once)
nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Function to extract sentiment score
def get_sentiment_score(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']  # Range: -1 (negative) to +1 (positive)

# Function to map sentiment score to urgency level
def predict_urgency_from_sentiment(text):
    score = get_sentiment_score(text)
    
    if score <= -0.4:
        return "high"
    elif score <= 0.2:
        return "medium"
    else:
        return "low"

# Apply to your DataFrame
df['sentiment_score'] = df['ticket_text'].apply(get_sentiment_score)
df['predicted_urgency'] = df['ticket_text'].apply(predict_urgency_from_sentiment)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\anshk\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [336]:
import numpy as np
from scipy.sparse import hstack

X_combined = hstack([X_tfidf, 
                     np.array(df['text_length']).reshape(-1, 1), 
                     np.array(df['sentiment']).reshape(-1, 1)])


In [319]:
from sklearn.preprocessing import LabelEncoder

issue_encoder = LabelEncoder()
urgency_encoder = LabelEncoder()

y_issue = issue_encoder.fit_transform(df['issue_type'])
y_urgency = urgency_encoder.fit_transform(df['urgency_level'])


In [334]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_test, y_issue_train, y_issue_test = train_test_split(X_combined, y_issue, test_size=0.2, random_state=42)

issue_model = LogisticRegression(max_iter=10000)
issue_model.fit(X_train, y_issue_train)
y_issue_pred = issue_model.predict(X_test)

print("Issue Type Classification Report:")
print(classification_report(y_issue_test, y_issue_pred, target_names=issue_encoder.classes_))


Issue Type Classification Report:
                    precision    recall  f1-score   support

    account access       1.00      1.00      1.00        23
   billing problem       1.00      1.00      1.00        19
   general inquiry       1.00      1.00      1.00        25
installation issue       1.00      1.00      1.00        29
     late delivery       1.00      1.00      1.00        17
    product defect       1.00      1.00      1.00        30
        wrong item       1.00      1.00      1.00        23

          accuracy                           1.00       166
         macro avg       1.00      1.00      1.00       166
      weighted avg       1.00      1.00      1.00       166



In [322]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


X_train_u, X_test_u, y_train_u, y_test_u = train_test_split(X_combined, y_urgency, test_size=0.2, random_state=42)


urgency_model = LogisticRegression(max_iter=10000)
urgency_model.fit(X_train_u, y_train_u)
y_pred_u = urgency_model.predict(X_test_u)

print("Urgency Level Classification Report:")
print(classification_report(y_test_u, y_pred_u, target_names=urgency_encoder.classes_))


Urgency Level Classification Report:
              precision    recall  f1-score   support

        high       0.40      0.35      0.37        66
         low       0.29      0.33      0.31        43
      medium       0.32      0.33      0.32        57

    accuracy                           0.34       166
   macro avg       0.33      0.34      0.33       166
weighted avg       0.34      0.34      0.34       166



In [331]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    # Predefined products and complaint keywords
    products_list = [
        "SmartWatch V2", "UltraClean Vacuum", "SoundWave 300", "PhotoSnap Cam", "Vision LED TV",
        "EcoBreeze AC", "RoboChef Blender", "FitRun Treadmill", "PowerMax Battery", "ProTab X1"
    ]
    complaint_keywords_list = [
    "no response", "charged twice", "showing blocked", "not working", "setup fails",
    "not received", "wrong item", "debited incorrectly", "not refunded", "mixed up",
    "captcha failed", "unknown issue", "reset required", "not able to install", "can’t log in",
    "can’t access"
]

    # Method 1: Predefined list matching
    found_products = [p for p in products_list if p.lower() in text.lower()]
    found_keywords = [kw for kw in complaint_keywords_list if kw in text.lower()]
    dates = re.findall(r'\d{1,2} [A-Za-z]+', text)

    # Method 2: spaCy NER and custom rules
    doc = nlp(text)
    ner_products = []
    ner_complaint_phrases = []

    # Extract products using NER
    for ent in doc.ents:
        if ent.label_ in ["PRODUCT", "ORG"]:
            ner_products.append(ent.text)

    # Extract complaint keywords using spaCy
    for token in doc:
        if token.dep_ == "ROOT" and token.pos_ in ["VERB", "ADJ"]:
            if token.lemma_.lower() in complaint_keywords_list:
                ner_complaint_phrases.append(token.text)

    # Combine results
    products = list(set(found_products)) if found_products else list(set(ner_products))
    complaint_keywords = list(set(found_keywords + ner_complaint_phrases))
    dates = dates if dates else ["No dates found"]

    return {
        "products": products,
        "dates": dates,
        "complaint_keywords": complaint_keywords
    }

In [329]:
def process_ticket(text):
    try:
        preprocessed = preprocess_text(text)
        tfidf_input = tfidf.transform([preprocessed])
        text_length = len(text)
        sentiment = TextBlob(text).sentiment.polarity

        combined_features = hstack([tfidf_input, [[text_length]], [[sentiment]]])

        issue_pred = issue_encoder.inverse_transform(issue_model.predict(combined_features))[0]
        urgency_pred = urgency_encoder.inverse_transform(urgency_model.predict(combined_features))[0]
        entities = extract_entities(text)

        return {
            "predicted_issue_type": issue_pred,
            "predicted_urgency_level": urgency_pred,
            "entities": entities
        }

    except Exception as e:
        return {
            "predicted_issue_type": "Error: " + str(e),
            "predicted_urgency_level": "Error: " + str(e),
            "entities": "Error: " + str(e)
        }


In [335]:
import gradio as gr

def gradio_interface(ticket_text):
    result = process_ticket(ticket_text)

    # Print to console for full debugging
    print("DEBUG:", result)

    return (
        result['predicted_issue_type'],
        result['predicted_urgency_level'],
        result['entities']
    )

    # return {
    #     "Predicted Issue Type": result['predicted_issue_type'],
    #     "Predicted Urgency Level": result['predicted_urgency_level'],
    #     "Extracted Entities": result['entities']
    # }

demo = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(lines=5, label="Enter Ticket Text"),
    outputs=[
        gr.Text(label="Predicted Issue Type"),
        gr.Text(label="Predicted Urgency Level"),
        gr.JSON(label="Extracted Entities")
    ],
    title="Customer Ticket Classifier",
    description="Paste a support ticket message to classify its issue type, urgency level, and extract key entities."
)

demo.launch()


* Running on local URL:  http://127.0.0.1:7891
* To create a public link, set `share=True` in `launch()`.




DEBUG: {'predicted_issue_type': 'billing problem', 'predicted_urgency_level': 'low', 'entities': {'products': ['EcoBreeze AC'], 'dates': ['No dates found'], 'complaint_keywords': ['charged twice']}}
DEBUG: {'predicted_issue_type': 'product defect', 'predicted_urgency_level': 'low', 'entities': {'products': ['FitRun Treadmill'], 'dates': ['2 days'], 'complaint_keywords': []}}
DEBUG: {'predicted_issue_type': 'wrong item', 'predicted_urgency_level': 'high', 'entities': {'products': [], 'dates': ['12 days', '15 May'], 'complaint_keywords': ['mixed up']}}
DEBUG: {'predicted_issue_type': 'wrong item', 'predicted_urgency_level': 'high', 'entities': {'products': ['UltraClean Vacuum'], 'dates': ['15 March'], 'complaint_keywords': ['not received', 'not working']}}
DEBUG: {'predicted_issue_type': 'wrong item', 'predicted_urgency_level': 'high', 'entities': {'products': ['UltraClean Vacuum'], 'dates': ['15 March'], 'complaint_keywords': ['not received', 'not working']}}
DEBUG: {'predicted_issue_ty