In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that the stop-word list and the lemmatizer rely on.
# These calls must run before `stopwords.words(...)` is evaluated below.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, held in a set for the membership test in clean_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a ticket description before TF-IDF vectorization.

    Lower-cases the text, removes every character that is not an ASCII letter
    or whitespace, drops English stop words and lemmatizes the remaining
    tokens.

    Args:
        text: Raw description. ``None`` and float NaN (the value pandas uses
            for an empty CSV cell) are treated as an empty description; any
            other non-string value is coerced with ``str``.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or
        ``""`` when nothing survives.
    """
    # Missing values must neither crash `.lower()` nor become a "nan" token.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

[nltk_data] Downloading package stopwords to C:\Users\Anand
[nltk_data]     Mall\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Anand
[nltk_data]     Mall\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

class TicketVectorizer:
    """Loads a ticket CSV and keeps a TF-IDF model fitted on its descriptions.

    Attributes set by ``__init__``:
        df: The loaded tickets plus a ``cleaned`` column.
        vectorizer: The fitted ``TfidfVectorizer``.
        tfidf_matrix: Result of fitting the vectorizer on ``df['cleaned']``.
    """

    def __init__(self, filepath):
        """Read the tickets, clean each description and fit the TF-IDF model.

        Args:
            filepath: Location of the CSV file; it must provide a
                ``description`` column.
        """
        tickets = pd.read_csv(filepath)
        tickets['cleaned'] = tickets['description'].apply(clean_text)
        self.df = tickets

        tfidf = TfidfVectorizer()
        self.tfidf_matrix = tfidf.fit_transform(tickets['cleaned'])
        self.vectorizer = tfidf

    def transform_query(self, query):
        """Clean ``query`` and project it with the fitted vectorizer.

        Returns:
            Whatever ``self.vectorizer.transform`` yields for a one-element
            list holding the cleaned query.
        """
        return self.vectorizer.transform([clean_text(query)])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

class TicketRecommender:
    """Recommends historical tickets whose descriptions resemble a query."""

    def __init__(self, vectorizer):
        """Keep a reference to the fitted vectorizer wrapper.

        Args:
            vectorizer: Object exposing ``transform_query(query)``,
                ``tfidf_matrix`` and ``df`` (for example a
                ``TicketVectorizer``).
        """
        self.vectorizer = vectorizer

    def recommend(self, query, top_k=3):
        """Return the ``top_k`` tickets most similar to ``query``.

        Args:
            query: Free-text problem description.
            top_k: Maximum number of tickets to return. Zero or a negative
                value yields an empty list.

        Returns:
            list[dict]: One dict per ticket with the keys ``ticket_id``,
            ``description``, ``resolution`` and ``similarity_score``, ordered
            from most to least similar.
        """
        query_vec = self.vectorizer.transform_query(query)

        similarities = cosine_similarity(
            query_vec,
            self.vectorizer.tfidf_matrix
        ).flatten()

        # Reverse first, then truncate. A `[-top_k:]` slice would return
        # every ticket for top_k=0, because `-0` is just `0`.
        top_indices = similarities.argsort()[::-1][:max(top_k, 0)]

        tickets = self.vectorizer.df
        results = []
        for idx in top_indices:
            row = tickets.iloc[idx]  # one positional lookup per hit
            results.append({
                "ticket_id": int(row["ticket_id"]),
                "description": row["description"],
                "resolution": row["resolution"],
                "similarity_score": float(similarities[idx])
            })

        return results

In [20]:
# Build the TF-IDF index from the ticket CSV and wrap it in a recommender.
vectorizer = TicketVectorizer("../enterprise_synthetic_tickets.csv")
recommender = TicketRecommender(vectorizer)

# Sample query used to eyeball the quality of the recommendations.
query = "Laptop running Windows 11 unable to connect to office WiFi network from home network"

# top_k is not passed, so the method's default applies.
recommendations = recommender.recommend(query)

# Print one result dict per line.
for r in recommendations:
    print(r)

{'ticket_id': 156, 'description': 'Laptop not connecting to office WiFi network', 'resolution': 'Update network drivers, restart router and verify network security policies', 'similarity_score': 0.5663007807253793}
{'ticket_id': 128, 'description': 'Laptop not connecting to office WiFi network', 'resolution': 'Update network drivers, restart router and verify network security policies', 'similarity_score': 0.5663007807253793}
{'ticket_id': 144, 'description': 'Laptop not connecting to office WiFi network', 'resolution': 'Update network drivers, restart router and verify network security policies', 'similarity_score': 0.5663007807253793}


In [None]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK corpora that the stop-word list and the lemmatizer rely on.
# These calls must run before `stopwords.words(...)` is evaluated below.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, held in a set for the membership test in clean_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()


# ==============================
# TEXT CLEANING
# ==============================
def clean_text(text):
    """Normalize a ticket description before TF-IDF vectorization.

    Lower-cases the text, removes every character that is not an ASCII letter
    or whitespace, drops English stop words and tokens of one or two
    characters, and lemmatizes what is left.

    Args:
        text: Raw description. ``None`` and float NaN (the value pandas uses
            for an empty CSV cell) are treated as an empty description; any
            other non-string value is coerced with ``str``.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or
        ``""`` when nothing survives.
    """
    # Without this guard, str() turns a missing value into the literal token
    # "nan" / "none", which would enter the TF-IDF vocabulary.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words and len(word) > 2
    )


# ==============================
# VECTORIZER
# ==============================
class TicketVectorizer:
    """Loads a ticket CSV, de-duplicates it and fits a TF-IDF model.

    Attributes set by ``__init__``:
        df: De-duplicated tickets with a fresh 0..n-1 index and a ``cleaned``
            column.
        vectorizer: The fitted ``TfidfVectorizer`` (unigrams and bigrams).
        tfidf_matrix: Result of fitting the vectorizer on ``df["cleaned"]``;
            row ``i`` corresponds to ``df`` row ``i``.
    """

    def __init__(self, filepath):
        """Read the tickets, drop repeated descriptions and fit the model.

        Args:
            filepath: Location of the CSV file; it must provide a
                ``description`` column.
        """
        tickets = pd.read_csv(filepath)

        # Remove duplicate descriptions, then renumber the rows. Without the
        # reset the surviving rows keep their old, gappy labels, and any
        # label-based access (e.g. `iterrows()`) stops matching the row
        # positions of the TF-IDF matrix.
        self.df = (
            tickets
            .drop_duplicates(subset=["description"])
            .reset_index(drop=True)
        )

        # Clean text
        self.df["cleaned"] = self.df["description"].apply(clean_text)

        # Use unigrams and bigrams so short phrases contribute to the match.
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_df=0.95,
            min_df=1
        )

        self.tfidf_matrix = self.vectorizer.fit_transform(self.df["cleaned"])

    def transform_query(self, query):
        """Clean ``query`` and project it with the fitted vectorizer.

        Returns:
            Whatever ``self.vectorizer.transform`` yields for a one-element
            list holding the cleaned query.
        """
        cleaned_query = clean_text(query)
        return self.vectorizer.transform([cleaned_query])


# ==============================
# RECOMMENDER
# ==============================
class TicketRecommender:
    """Ranks historical tickets against a free-text query.

    Scores are TF-IDF cosine similarities, scaled up when the ticket's
    category name occurs in the query and scaled down otherwise.
    """

    CATEGORY_BOOST = 1.25    # multiplier when the category is named in the query
    CATEGORY_PENALTY = 0.90  # multiplier applied to every other ticket

    def __init__(self, vectorizer):
        """Keep a reference to the fitted vectorizer wrapper.

        Args:
            vectorizer: Object exposing ``transform_query(query)``,
                ``tfidf_matrix`` and ``df``. ``df`` must provide the columns
                ``ticket_id``, ``description``, ``category``, ``priority``
                and ``resolution``.
        """
        self.vectorizer = vectorizer

    def recommend(self, query, top_k=3, threshold=0.30):
        """Return up to ``top_k`` tickets whose adjusted score meets ``threshold``.

        Args:
            query: Free-text problem description.
            top_k: Maximum number of tickets to return. Zero or a negative
                value yields an empty list.
            threshold: Minimum adjusted similarity a ticket needs in order to
                be returned.

        Returns:
            list[dict]: One dict per ticket with the keys ``ticket_id``,
            ``description``, ``category``, ``priority``, ``resolution`` and
            ``similarity_score`` (rounded to 4 decimals), ordered from most
            to least similar.
        """
        query_vec = self.vectorizer.transform_query(query)

        similarities = cosine_similarity(
            query_vec,
            self.vectorizer.tfidf_matrix
        ).flatten()

        tickets = self.vectorizer.df
        query_lower = query.lower()

        # Boost same-category matches. `similarities` is positional, so walk
        # the rows by position: DataFrame labels may be non-contiguous (for
        # instance after dropping duplicates) and must not be used as indices.
        for pos, category in enumerate(tickets["category"]):
            # A missing or blank category can never count as a match.
            named_in_query = (
                isinstance(category, str)
                and category.strip() != ""
                and category.lower() in query_lower
            )
            if named_in_query:
                similarities[pos] *= self.CATEGORY_BOOST
            else:
                similarities[pos] *= self.CATEGORY_PENALTY

        results = []
        if top_k <= 0:
            return results

        for pos in similarities.argsort()[::-1]:
            score = similarities[pos]
            if score < threshold:
                # Scores are visited in descending order, so nothing further
                # down the list can reach the threshold either.
                break

            row = tickets.iloc[pos]  # one positional lookup per hit
            results.append({
                "ticket_id": int(row["ticket_id"]),
                "description": row["description"],
                "category": row["category"],
                "priority": row["priority"],
                "resolution": row["resolution"],
                "similarity_score": round(float(score), 4)
            })

            if len(results) >= top_k:
                break

        return results


# Module-level alias so that `recommend(recommender, query, ...)` stays callable.
recommend = TicketRecommender.recommend


# ==============================
# RUN TEST
# ==============================
# Build the TF-IDF index from the ticket CSV and wrap it in a recommender.
vectorizer = TicketVectorizer("../enterprise_synthetic_tickets.csv")
recommender = TicketRecommender(vectorizer)

# Sample query used to eyeball the quality of the recommendations.
query = "Laptop running Windows 11 unable to connect to office WiFi network from home network"

# Ask for at most 3 tickets and pass an explicit score threshold of 0.35.
recommendations = recommender.recommend(query, top_k=3, threshold=0.35)

# Print one result dict per line.
for r in recommendations:
    print(r)

{'ticket_id': 128, 'description': 'Laptop not connecting to office WiFi network', 'category': 'WiFi', 'priority': 'High', 'resolution': 'Update network drivers, restart router and verify network security policies', 'similarity_score': 0.5579}
{'ticket_id': 27, 'description': 'Frequent VPN disconnections while working from home network', 'category': 'VPN', 'priority': 'High', 'resolution': 'Clear cached VPN credentials, reauthenticate and verify MFA configuration', 'similarity_score': 0.3575}


[nltk_data] Downloading package stopwords to C:\Users\Anand
[nltk_data]     Mall\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Anand
[nltk_data]     Mall\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
