In [1]:
import re 
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [2]:
# Inhalt der CSV 'Comcast.csv' lesen
df = pd.read_csv ('Comcast.csv')
# Inhalt der Spalte 'Customer Complaint in Kleinbuchstaben umwandeln 
df['Customer Complaint'] = df['Customer Complaint'].str.lower()
print (df['Customer Complaint'].head(5))

0                        comcast cable internet speeds
1         payment disappear - service got disconnected
2                                    speed and service
3    comcast imposed a new usage cap of 300gb that ...
4           comcast not working and no service to boot
Name: Customer Complaint, dtype: object


In [3]:
# Wörter mithilfe der Funktion word-tokenize aus der Unterbibiliothek nltk.tokenize tokenisiert.
df['Customer Complaint'] = df['Customer Complaint'].apply(word_tokenize)
print (df['Customer Complaint'].head(5))


0                   [comcast, cable, internet, speeds]
1    [payment, disappear, -, service, got, disconne...
2                                [speed, and, service]
3    [comcast, imposed, a, new, usage, cap, of, 300...
4    [comcast, not, working, and, no, service, to, ...
Name: Customer Complaint, dtype: object


In [5]:
# eStopWords mit den englischen Stoppwörtern füllen
eStopWords = set(stopwords.words('english'))
#Comcast als zusätzliches Stopwort
eStopWords.add('comcast')
# Stoppwörter entfernen
df['Customer Complaint'] = df['Customer Complaint'].apply(lambda x: [word for word in x if word not in eStopWords])
print (eStopWords)

{'then', "isn't", "that'll", 'she', "weren't", 'him', 'than', 'yours', 'o', 'wouldn', 'each', 'itself', 'don', 'was', 'our', 'myself', 'mustn', 'whom', 'yourself', 'when', 'again', "she's", 'here', 'ours', 'shouldn', 'had', 'doesn', "hasn't", "you'd", 'y', "shouldn't", 'below', 'weren', 'won', 'having', 'they', 'comcast', 'them', 'couldn', 're', 'if', "you're", 'he', 'were', 'there', 'haven', 'more', 'will', 'my', "mustn't", 'how', 'from', 'further', 'this', 'm', 'have', 'hasn', 'which', 'is', 'who', 'both', 'hadn', 'on', "you'll", 'ourselves', "you've", 'theirs', 'that', 'shan', 'most', 'themselves', 'such', 'its', 'yourselves', 'of', 'while', 'aren', 'isn', 'their', 'being', 'against', 'am', 'be', 'by', 'some', 'between', 'd', 'ma', 'needn', "couldn't", "hadn't", 'own', 'ain', 'at', 'not', 'and', 'until', "mightn't", 'the', 'other', 'an', 'didn', 'been', "should've", 'me', 'only', 'with', 'after', 'should', 'we', 'those', 'because', 'what', 'over', 'her', 'about', 't', 'down', 'i', '

In [6]:
# Wörter in die Grundform bringen (Stemming, Lemmatisierung) 
lemmatizer = WordNetLemmatizer()
df['Customer Complaint'] = df['Customer Complaint'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print (df['Customer Complaint'].head(5))

0                             [cable, internet, speed]
1    [payment, disappear, -, service, got, disconne...
2                                     [speed, service]
3    [imposed, new, usage, cap, 300gb, punishes, st...
4                             [working, service, boot]
Name: Customer Complaint, dtype: object


In [18]:
# Bag-of-Words-Vektorisierers
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['Customer Complaint'].apply(' '.join))
print(X_bow[:5])

  (0, 201)	1
  (0, 671)	1
  (0, 1177)	1
  (1, 907)	1
  (1, 388)	1
  (1, 1132)	1
  (1, 563)	1
  (1, 394)	1
  (2, 1132)	1
  (2, 1176)	1
  (3, 618)	1
  (3, 828)	1
  (3, 1336)	1
  (3, 213)	1
  (3, 34)	1
  (3, 983)	1
  (3, 1192)	1
  (4, 1132)	1
  (4, 1393)	1
  (4, 181)	1


In [17]:
# TF-IDF Ansatz. Beide Ansätze können in Python mithilfe des scikit-learn-Pakets umgesetzt dafür wird pandas als pd importiert.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Customer Complaint'].apply(' '.join))
print(X_tfidf[:5])

  (0, 1177)	0.6430858133577498
  (0, 671)	0.35391408383463274
  (0, 201)	0.6791063671631227
  (1, 394)	0.4603374037517078
  (1, 563)	0.518022386486237
  (1, 1132)	0.17857763377991173
  (1, 388)	0.5456269029782118
  (1, 907)	0.43605457601797254
  (2, 1176)	0.8300146883261219
  (2, 1132)	0.5577415325783895
  (3, 1192)	0.4100980454274462
  (3, 983)	0.4752447865207828
  (3, 34)	0.36424856605761413
  (3, 213)	0.239253169628378
  (3, 1336)	0.2880374421376605
  (3, 828)	0.3598540285809728
  (3, 618)	0.4512010627314486
  (4, 181)	0.7609589602524502
  (4, 1393)	0.5990941919907783
  (4, 1132)	0.2490534278712626


In [38]:
# LSA
lsa = TruncatedSVD(n_components=3, algorithm='randomized', n_iter=15, random_state=42)
lsa_output = lsa.fit_transform(X_tfidf)
# Neue Spalte für jede Komponente im Data frame
for i in range(lsa_output.shape[1]):
    df[f'LSA Topic {i}'] = lsa_output[:, i]
print (lsa_output)


[[ 0.3317082  -0.1990938  -0.04354409]
 [ 0.08763926  0.0201744  -0.01508059]
 [ 0.39800402 -0.02014424 -0.07211096]
 ...
 [ 0.07726578  0.04768923 -0.00412558]
 [ 0.03602734  0.01963266 -0.00580637]
 [ 0.20847585 -0.12443873 -0.0312382 ]]


In [24]:
# LDA
lda = LatentDirichletAllocation(n_components=3, doc_topic_prior=0.9, topic_word_prior=0.9)
lda_output = lda.fit_transform(X_tfidf)
# Neue Spalte für jede Komponente im Data frame
for i in range(lda_output.shape[1]):
    df[f'LDA Topic {i}'] = lda_output[:, i]
print (lda_output)

[[0.24948728 0.53977452 0.2107382 ]
 [0.29020813 0.43831662 0.27147525]
 [0.52328637 0.2477159  0.22899772]
 ...
 [0.51069816 0.24466928 0.24463256]
 [0.24568816 0.5082064  0.24610544]
 [0.51496735 0.24529266 0.23973999]]


In [26]:
# Verzeichnis für die Themen erstellen
dictionary = Dictionary(df['Customer Complaint'])

In [28]:
# Umwandlung in eine vektorisierte Form durch Berechnung des "Frequency counts"
corpus = [dictionary.doc2bow(doc) for doc in df['Customer Complaint']]

In [29]:
# Themen extrahieren
n_top_words = 3 
topics = []
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
    top_features = [feature_names[i] for i in top_features_ind]
    topics.append(top_features)

print (topics)

[['internet', 'service', 'speed'], ['billing', 'service', 'internet'], ['data', 'cap', 'caps']]


In [36]:
# Coherence Score (Bestimmung der Anzahl von Themen)
coherence_model_lda = CoherenceModel(topics=topics, texts=df['Customer Complaint'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [39]:
#LDA Themen ausgeben
for i, topic in enumerate(topics):
    print(f"Beste Wörter für Thema {i+1}: {', '.join(topic)}")

Beste Wörter für Thema 1: internet, service, billing
Beste Wörter für Thema 2: billing, issues, practices
Beste Wörter für Thema 3: data, cap, caps
Beste Wörter für Thema 4: service, customer, poor


In [35]:
# Berechnung des Coherence Scores für LSA mithilfe von c_v measure
topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in lsa.components_]
coherence_model_lsa = CoherenceModel(topics=topics, texts=df['Customer Complaint'], dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.7028139925670622


In [41]:
#LSA Themen ausgeben
for i, topic in enumerate(topics):
    print(f"Beste Wörter für Thema {i+1}: {', '.join(topic)}")

Beste Wörter für Thema 1: internet, service, billing
Beste Wörter für Thema 2: billing, issues, practices
Beste Wörter für Thema 3: data, cap, caps
Beste Wörter für Thema 4: service, customer, poor


In [42]:
# Datei für die Ausgabe der Ergebnisse erzeugen
df.to_csv('Comcast_Ergebnisse.csv')