In [1]:
## Comcast Telecom Complaints Dataset

**Context** <br>
Estimation of Comcast Customer Top Complaints.

**Acknowledgements** <br>
Kaggle Datasets

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

In [3]:
import numpy as np
import pandas as pd

In [4]:
###!mkdir ~/.kaggle

In [5]:
###!cp /kaggle.json ~/.kaggle/

In [6]:
###!chmod 600 ~/.kaggle/kaggle.json

In [7]:
###!pip install keras-tuner

In [8]:
###!pip install kaggle

In [9]:
####! kaggle datasets download -d yasserh/comcast-telecom-complaints

In [10]:
###! unzip /content/comcast-telecom-complaints.zip

In [11]:
comcast = pd.read_csv("/content/Comcast.csv")

In [12]:
training_data = comcast.sample(frac=0.7, random_state=25)
testing_data = comcast.drop(training_data.index)

In [13]:
print(training_data.shape, testing_data.shape)

(1557, 11) (667, 11)


In [14]:
training_data.to_csv("train.csv")
testing_data.to_csv("test.csv")

In [15]:
training_data.columns

Index(['Ticket #', 'Customer Complaint', 'Date', 'Date_month_year', 'Time',
       'Received Via', 'City', 'State', 'Zip code', 'Status',
       'Filing on Behalf of Someone'],
      dtype='object')

In [16]:
training_data.Status.value_counts()

Solved     658
Closed     521
Open       266
Pending    112
Name: Status, dtype: int64

In [17]:
####! pip install unidecode

In [18]:
####! pip install nltk

In [19]:
import re, unidecode
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Needed only once
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text
def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result
def remove_slash_with_space(text):
    return text.replace('\\', " ")
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
def text_lowercase(text):
    return text.lower()
def remove_whitespace(text):
    return  " ".join(text.split())
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)
def stem_words(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stems)
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return ' '.join(lemmas)

In [20]:
# Perform preprocessing
def perform_preprocessing(text):
    text = remove_html_tags(text)
    text = remove_accented_chars(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = text_lowercase(text)
    text = remove_slash_with_space(text)
#    text = remove_punctuation(text)
    text = stem_words(text)
    text = lemmatize_words(text)
    text = remove_whitespace(text)
    return text

In [21]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [22]:
training_data.columns

Index(['Ticket #', 'Customer Complaint', 'Date', 'Date_month_year', 'Time',
       'Received Via', 'City', 'State', 'Zip code', 'Status',
       'Filing on Behalf of Someone'],
      dtype='object')

In [23]:
training_data['Customer_Complaint_corpus'] = training_data['Customer Complaint'].apply(perform_preprocessing)

In [24]:
testing_data['Customer_Complaint_corpus'] = testing_data['Customer Complaint'].apply(perform_preprocessing)

In [25]:
####! pip install pycaret==2.3.4

In [26]:
###! pip install jinja2

In [27]:
###! pip install markupsafe==2.0.1

In [28]:
###! pip install evalml

In [29]:
from pycaret.classification import *

In [30]:
exp_mclf101 = setup(data = training_data, target = 'Status', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Status
2,Target type,Multiclass
3,Target mapping,"Closed: 0, Open: 1, Pending: 2, Solved: 3"
4,Original data shape,"(1557, 12)"
5,Transformed data shape,"(1557, 12)"
6,Transformed train set shape,"(1089, 12)"
7,Transformed test set shape,"(468, 12)"
8,Ordinal features,2
9,Numeric features,1


In [31]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.6685,0.7877,0.6685,0.5478,0.585,0.4502,0.5157,0.343
ridge,Ridge Classifier,0.6538,0.0,0.6538,0.5648,0.577,0.431,0.4779,0.282
gbc,Gradient Boosting Classifier,0.6262,0.7892,0.6262,0.5918,0.5988,0.4173,0.4293,0.887
xgboost,Extreme Gradient Boosting,0.6235,0.7892,0.6235,0.6091,0.6087,0.4259,0.4318,0.587
rf,Random Forest Classifier,0.6226,0.7911,0.6226,0.6006,0.6031,0.4175,0.4239,0.649
catboost,CatBoost Classifier,0.618,0.7838,0.618,0.5971,0.6004,0.4143,0.4212,0.36
et,Extra Trees Classifier,0.6143,0.787,0.6143,0.6041,0.6036,0.4125,0.4158,0.516
lightgbm,Light Gradient Boosting Machine,0.6125,0.7904,0.6125,0.6035,0.6039,0.4142,0.4174,0.538
lda,Linear Discriminant Analysis,0.5758,0.7762,0.5758,0.5966,0.5737,0.3766,0.3888,0.353
dt,Decision Tree Classifier,0.5583,0.6827,0.5583,0.5663,0.5577,0.3504,0.3531,0.633


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [32]:
nb2 = create_model('nb')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6422,0.761,0.6422,0.5282,0.559,0.406,0.4757
1,0.6789,0.7976,0.6789,0.5288,0.5881,0.4728,0.5227
2,0.6606,0.7963,0.6606,0.5433,0.5773,0.4373,0.5075
3,0.6606,0.7784,0.6606,0.5214,0.5756,0.444,0.4887
4,0.7156,0.8421,0.7156,0.5846,0.6286,0.5283,0.5972
5,0.7064,0.8117,0.7064,0.5883,0.6226,0.5119,0.5878
6,0.6422,0.7483,0.6422,0.5552,0.5638,0.4014,0.4848
7,0.6697,0.7963,0.6697,0.5488,0.5873,0.4516,0.5143
8,0.6606,0.7638,0.6606,0.5447,0.579,0.4356,0.5009
9,0.6481,0.7813,0.6481,0.5348,0.5681,0.4134,0.4774


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [41]:
tune_model(nb2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6422,0.76,0.6422,0.5282,0.559,0.406,0.4757
1,0.6881,0.7872,0.6881,0.5487,0.5995,0.4858,0.5466
2,0.6606,0.7972,0.6606,0.5433,0.5773,0.4373,0.5075
3,0.6789,0.7762,0.6789,0.5401,0.5889,0.4699,0.5324
4,0.7156,0.8339,0.7156,0.5846,0.6286,0.5283,0.5972
5,0.7064,0.8103,0.7064,0.5883,0.6226,0.5119,0.5878
6,0.6514,0.7509,0.6514,0.5706,0.5724,0.416,0.5087
7,0.6789,0.7939,0.6789,0.561,0.5958,0.4661,0.5353
8,0.6606,0.76,0.6606,0.5519,0.5808,0.4349,0.5045
9,0.6574,0.7816,0.6574,0.5476,0.5768,0.428,0.4988


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [33]:
prediction = predict_model(nb2, data = testing_data)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.7031,0,0.7031,0.5859,0.6253,0.4783,0.5378


In [34]:
prediction.head(2)

Unnamed: 0,Ticket #,Customer Complaint,Date,Date_month_year,Time,Received Via,City,State,Zip code,Filing on Behalf of Someone,Customer_Complaint_corpus,Status,prediction_label,prediction_score
2,242732,Speed and Service,18-04-15,18-Apr-15,9:55:47 AM,Internet,Acworth,Georgia,30101,Yes,speed servic,0,Closed,0.9667
4,307175,Comcast not working and no service to boot,26-05-15,26-May-15,1:25:26 PM,Internet,Acworth,Georgia,30101,No,comcast work servic boot,3,Solved,0.6616


In [35]:
prediction.reset_index(inplace=True)

In [36]:
prediction.head(3)

Unnamed: 0,index,Ticket #,Customer Complaint,Date,Date_month_year,Time,Received Via,City,State,Zip code,Filing on Behalf of Someone,Customer_Complaint_corpus,Status,prediction_label,prediction_score
0,2,242732,Speed and Service,18-04-15,18-Apr-15,9:55:47 AM,Internet,Acworth,Georgia,30101,Yes,speed servic,0,Closed,0.9667
1,4,307175,Comcast not working and no service to boot,26-05-15,26-May-15,1:25:26 PM,Internet,Acworth,Georgia,30101,No,comcast work servic boot,3,Solved,0.6616
2,5,338519,ISP Charging for arbitrary data limits with ov...,06-12-15,06-Dec-15,9:59:40 PM,Internet,Acworth,Georgia,30101,No,isp charg arbitrari data limit overag fee,3,Solved,0.6557


In [37]:
prediction["prediction_label"].value_counts()

Solved    487
Closed    180
Name: prediction_label, dtype: int64