In [18]:
import nltk
import pandas as pd
from string import punctuation
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
nltk.download("all")  # download packages required for text processing!

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\JAYENDRAARCHANA\AppData\Roaming\nltk_dat
[nltk_data]    |     a...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\JAYENDRAARCHANA\AppData\Roaming\nltk_dat
[nltk_data]    |     a...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\JAYENDRAARCHANA\AppData\Roaming\nltk_dat
[nltk_data]    |     a...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\JAYENDRAARCHANA\AppData\Roaming\nltk_dat
[nltk_data]    |     a...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_da

True

# Loading datasets
## data sources: kaggle

In [20]:
dataset_1 = pd.read_csv("datasets/customer_support_tickets.csv")
dataset_1

Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8464,8465,David Todd,adam28@example.net,22,Female,LG OLED,2021-12-08,Product inquiry,Installation support,My {product_purchased} is making strange noise...,Open,,Low,Phone,,,
8465,8466,Lori Davis,russell68@example.com,27,Female,Bose SoundLink Speaker,2020-02-22,Technical issue,Refund request,I'm having an issue with the {product_purchase...,Open,,Critical,Email,,,
8466,8467,Michelle Kelley,ashley83@example.org,57,Female,GoPro Action Camera,2021-08-17,Technical issue,Account access,I'm having an issue with the {product_purchase...,Closed,Eight account century nature kitchen.,High,Social media,2023-06-01 09:44:22,2023-06-01 04:31:22,3.0
8467,8468,Steven Rodriguez,fpowell@example.org,54,Male,PlayStation,2021-10-16,Product inquiry,Payment issue,I'm having an issue with the {product_purchase...,Closed,We seat culture plan.,Medium,Email,2023-06-01 18:28:24,2023-06-01 05:32:24,3.0


# Getting information about dataset

In [21]:
dataset_1.describe()

Unnamed: 0,Ticket ID,Customer Age,Customer Satisfaction Rating
count,8469.0,8469.0,2769.0
mean,4235.0,44.026804,2.991333
std,2444.934048,15.296112,1.407016
min,1.0,18.0,1.0
25%,2118.0,31.0,2.0
50%,4235.0,44.0,3.0
75%,6352.0,57.0,4.0
max,8469.0,70.0,5.0


# Checking for Null (NaN) values!

In [22]:
dataset_1.isnull().sum()

Ticket ID                          0
Customer Name                      0
Customer Email                     0
Customer Age                       0
Customer Gender                    0
Product Purchased                  0
Date of Purchase                   0
Ticket Type                        0
Ticket Subject                     0
Ticket Description                 0
Ticket Status                      0
Resolution                      5700
Ticket Priority                    0
Ticket Channel                     0
First Response Time             2819
Time to Resolution              5700
Customer Satisfaction Rating    5700
dtype: int64

# ðŸ“Œ NLP Pipeline for Classical Machine Learning Sentiment Analysis (Flowchart)

flowchart TD
    A[Raw Text] --> B[Tokenization]
    B --> C[Lowercasing]
    C --> D[Punctuation Removal]
    D --> E[Stop-Word Removal]
    E --> F[Lemmatization]
    F --> G[Number Handling (normalize/remove)]
    G --> H[Vectorization (CountVectorizer / TFâ€“IDF)]
    H --> I[Train/Test ML Model]
    I --> J[Sentiment Prediction]


In [7]:
def word_processor(text: str):
    lemmatizer = WordNetLemmatizer()  # lematization!
    raw_tokens = word_tokenize(text)
    tokens = [token for token in raw_tokens if token not in punctuation]
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return ' '.join(lemmatized_tokens)

In [8]:
def analyze_polarity(text: str):
    polarity_scores = SentimentIntensityAnalyzer().polarity_scores(text)
    del polarity_scores["compound"]
    return max(polarity_scores, key=polarity_scores.get)

In [None]:
tfidf_model = TfidfVectorizer()
dataset_1['Ticket Description'] = dataset_1['Ticket Description'].apply(word_processor)
transformed = tfidf_model.fit_transform(dataset_1['Ticket Description'])
print(tfidf_model.vocabulary_)