In [2]:
import re 
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [3]:
# Inhalt der CSV 'Comcast.csv' lesen
df = pd.read_csv ('Comcast.csv')
# Inhalt der Spalte 'Customer Complaint in Kleinbuchstaben umwandeln 
df['Customer Complaint'] = df['Customer Complaint'].str.lower()
print (df['Customer Complaint'].head(5))

0                        comcast cable internet speeds
1         payment disappear - service got disconnected
2                                    speed and service
3    comcast imposed a new usage cap of 300gb that ...
4           comcast not working and no service to boot
Name: Customer Complaint, dtype: object


In [4]:
# Wörter mithilfe der Funktion word-tokenize aus der Unterbibiliothek nltk.tokenize tokenisiert.
df['Customer Complaint'] = df['Customer Complaint'].apply(word_tokenize)
print (df['Customer Complaint'].head(5))


0                   [comcast, cable, internet, speeds]
1    [payment, disappear, -, service, got, disconne...
2                                [speed, and, service]
3    [comcast, imposed, a, new, usage, cap, of, 300...
4    [comcast, not, working, and, no, service, to, ...
Name: Customer Complaint, dtype: object


In [5]:
# eStopWords mit den englischen Stoppwörtern füllen
eStopWords = set(stopwords.words('english'))
#Comcast als zusätzliches Stopwort
eStopWords.add('comcast')
# Stoppwörter entfernen
df['Customer Complaint'] = df['Customer Complaint'].apply(lambda x: [word for word in x if word not in eStopWords])

In [6]:
# Wörter in die Grundform bringen (Stemming, Lemmatisierung) 
lemmatizer = WordNetLemmatizer()
df['Customer Complaint'] = df['Customer Complaint'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
print (df['Customer Complaint'].head(5))

0                             [cable, internet, speed]
1    [payment, disappear, -, service, got, disconne...
2                                     [speed, service]
3    [imposed, new, usage, cap, 300gb, punishes, st...
4                             [working, service, boot]
Name: Customer Complaint, dtype: object


In [19]:
# Bag-of-Words-Vektorisierers
bow_vectorizer = CountVectorizer()
X_bow = bow_vectorizer.fit_transform(df['Customer Complaint'].apply(' '.join))

In [18]:
# TF-IDF Ansatz. Beide Ansätze können in Python mithilfe des scikit-learn-Pakets umgesetzt dafür wird pandas als pd importiert.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['Customer Complaint'].apply(' '.join))

In [20]:
# LSA
lsa = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=15, random_state=42)
lsa_output = lsa.fit_transform(X)
# Neue Spalte für jede Komponente im Data frame
for i in range(lsa_output.shape[1]):
    df[f'LSA Topic {i}'] = lsa_output[:, i]

In [28]:
# LDA
lda = LatentDirichletAllocation(n_components=3, doc_topic_prior=0.9, topic_word_prior=0.9)
lda_output = lda.fit_transform(X)
# Neue Spalte für jede Komponente im Data frame
for i in range(lda_output.shape[1]):
    df[f'LDA Topic {i}'] = lda_output[:, i]

In [29]:
# Verzeichnis für die Themen erstellen
dictionary = Dictionary(df['Customer Complaint'])

In [30]:
# Umwandlung in eine vektorisierte Form durch Berechnung des "Frequency counts"
corpus = [dictionary.doc2bow(doc) for doc in df['Customer Complaint']]

In [39]:
# Themen extrahieren
n_top_words = 3 
topics = []
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
    top_features = [feature_names[i] for i in top_features_ind]
    topics.append(top_features)

print (topics)

[['data', 'cap', 'internet'], ['issue', 'fee', 'service'], ['service', 'internet', 'speed']]


In [42]:
# Coherence Score (Bestimmung der Anzahl von Themen)
coherence_model_lda = CoherenceModel(topics=topics, texts=df['Customer Complaint'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [44]:
#LDA Themen ausgeben
for i, topic in enumerate(topics):
    print(f"Beste Wörter für Thema {i+1}: {', '.join(topic)}")

Beste Wörter für Thema 1: data, cap, internet
Beste Wörter für Thema 2: issue, fee, service
Beste Wörter für Thema 3: service, internet, speed


In [45]:
# Berechnung des Coherence Scores für LSA mithilfe von c_v measure
topics = [[feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]] for topic in lsa.components_]
coherence_model_lsa = CoherenceModel(topics=topics, texts=df['Customer Complaint'], dictionary=dictionary, coherence='c_v')
coherence_lsa = coherence_model_lsa.get_coherence()
print('Coherence Score: ', coherence_lsa)

Coherence Score:  0.7719403828058999


In [47]:
#LSA Themen ausgeben
for i, topic in enumerate(topics):
    print(f"Beste Wörter für Thema {i+1}: {', '.join(topic)}")

Beste Wörter für Thema 1: internet, service, billing
Beste Wörter für Thema 2: cap, data, usage
Beste Wörter für Thema 3: billing, issue, practice
Beste Wörter für Thema 4: service, customer, poor


In [48]:
# Datei für die Ausgabe der Ergebnisse erzeugen
df.to_csv('Comcast_Ergebnisse.csv')