In [8]:
!pip install nltk scikit-learn flask joblib



In [15]:
import pandas as pd

# Load the data
df = pd.read_csv("customer_support_tickets.csv")

# Preview first few rows
df.head()

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

In [17]:
df.isnull().sum()

Ticket ID                          0
Customer Name                      0
Customer Email                     0
Customer Age                       0
Customer Gender                    0
Product Purchased                  0
Date of Purchase                   0
Ticket Type                        0
Ticket Subject                     0
Ticket Description                 0
Ticket Status                      0
Resolution                      5700
Ticket Priority                    0
Ticket Channel                     0
First Response Time             2819
Time to Resolution              5700
Customer Satisfaction Rating    5700
dtype: int64

In [18]:
# Remove missing rows
df = df.dropna()

In [21]:
df.duplicated().sum()

0

In [24]:
df['Ticket Type'].value_counts()

Ticket Type
Refund request          596
Technical issue         580
Billing inquiry         544
Product inquiry         533
Cancellation request    516
Name: count, dtype: int64

In [25]:
df['Ticket Description'].str.contains("{product_purchased}", regex=False).sum()

2769

In [26]:
df['Ticket Description'] = df['Ticket Description'].str.replace("{product_purchased}", "", regex=False)

In [27]:
df['Ticket Description'].apply(lambda x: len(str(x).split())).describe()

count    2769.000000
mean       46.053810
std         8.308254
min        21.000000
25%        42.000000
50%        48.000000
75%        52.000000
max        62.000000
Name: Ticket Description, dtype: float64

In [32]:
df = df[df['Ticket Description'].apply(lambda x: len(str(x).split())) > 3]

In [34]:
df.to_csv("cleaned_support_tickets.csv", index=False)

In [35]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# Download resources
nltk.download('punkt')
nltk.download('stopwords')

# Load your dataset
df = pd.read_csv('cleaned_support_tickets.csv')  # Update if needed

# Use correct columns
questions = df['Ticket Description'].astype(str)
labels = df['Ticket Type'].astype(str)

# Preprocessing
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

questions_clean = questions.apply(preprocess)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(questions_clean)

# Label encoding
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
print(classification_report(y_test, model.predict(X_test), target_names=encoder.classes_))

# Save components
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(encoder, 'label_encoder.pkl')
joblib.dump(model, 'chatbot_model.pkl')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arpan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arpan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                      precision    recall  f1-score   support

     Billing inquiry       0.19      0.13      0.15       357
Cancellation request       0.17      0.18      0.18       327
     Product inquiry       0.19      0.18      0.19       316
      Refund request       0.20      0.23      0.22       345
     Technical issue       0.21      0.24      0.22       349

            accuracy                           0.19      1694
           macro avg       0.19      0.19      0.19      1694
        weighted avg       0.19      0.19      0.19      1694



['chatbot_model.pkl']

In [36]:
# Load components
vectorizer = joblib.load('vectorizer.pkl')
encoder = joblib.load('label_encoder.pkl')
model = joblib.load('chatbot_model.pkl')

# Preprocess function (reuse)
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Response function
def get_response(user_query):
    cleaned = preprocess(user_query)
    vect = vectorizer.transform([cleaned])
    pred = model.predict(vect)
    intent = encoder.inverse_transform(pred)[0]

    responses = {
        "Technical issue": "It seems you're facing a technical issue. Please check your device or contact support.",
        "Billing inquiry": "Billing-related issues can be resolved through your account billing section.",
        "General inquiry": "Feel free to ask general questions. I'm here to help!",
        "Account management": "You can manage your account settings from the dashboard.",
    }

    return responses.get(intent, f"I'm not sure, but it seems like a '{intent}' type of issue.")


In [37]:
# Test chatbot
query = "I can't log into my account"
print("User:", query)
print("Chatbot:", get_response(query))

User: I can't log into my account
Chatbot: I'm not sure, but it seems like a 'Refund request' type of issue.


In [None]:
print("Chatbot is ready! Type 'exit' to stop.\n")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['exit', 'quit']:
        print("Chatbot: Goodbye! 👋")
        break
    response = get_response(user_input)
    print("Chatbot:", response)

Chatbot is ready! Type 'exit' to stop.



You:  I can't access my dashboard


Chatbot: I'm not sure, but it seems like a 'Product inquiry' type of issue.


You:  I was charged twice!


Chatbot: I'm not sure, but it seems like a 'Product inquiry' type of issue.


In [7]:
import pandas as pd
import numpy as np
import nltk
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# nltk.download('stopwords')
# nltk.download('punkt')
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load dataset
df = pd.read_csv('customer_support_tickets.csv')

# Use proper columns
questions = df['Ticket Description'].astype(str)
labels = df['Ticket Type'].astype(str)

# Preprocessing
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

questions_clean = questions.apply(preprocess)

# Vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(questions_clean)

# Encode labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

# Save components
import joblib
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(encoder, 'label_encoder.pkl')
joblib.dump(model, 'chatbot_model.pkl')


                      precision    recall  f1-score   support

     Billing inquiry       0.19      0.13      0.15       357
Cancellation request       0.17      0.18      0.18       327
     Product inquiry       0.19      0.18      0.19       316
      Refund request       0.20      0.23      0.22       345
     Technical issue       0.21      0.24      0.22       349

            accuracy                           0.19      1694
           macro avg       0.19      0.19      0.19      1694
        weighted avg       0.19      0.19      0.19      1694



['chatbot_model.pkl']