In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

from tqdm.notebook import tqdm_notebook

import spacy
import en_core_web_sm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

# Load data

In [None]:
# Read the raw messages; the file has no header row, so the single column is named 'text'.
df = pd.read_csv(
    'data/ciphix.csv',
    names=['text'],
    header=None,
)

In [None]:
# Display the freshly loaded frame (single 'text' column) for a first look at the raw messages.
df

# Look around

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of whitespace-separated tokens in each raw message.
tokens_per_message = df["text"].str.split()
df["wordcount"] = tokens_per_message.str.len()

In [None]:
# Histogram (20 bins) of the per-message word counts.
df['wordcount'].hist(bins=20)

In [None]:
# Show the frame again, now including the 'wordcount' column.
df

### Conclusions

Looking around, I noticed that:
- all messages start with some kind of username handle (`@name`)
- some contain multiple username handles
- some end with a different kind of tag: `^` followed by a user acronym
- the messages are written in different languages
- some contain smileys
- some contain URLs


In [None]:
#Lets check the counts of the first tag mentioned
# Capture the first @handle that is followed by a space/tab (not a line break),
# plus the remainder of that line. The handle character class includes '_' so
# that it agrees with the pattern used by remove_ats; without it a handle such
# as '@some_user' would be skipped and a later handle reported instead.
split = df['text'].str.extract(r'@(?P<tag>[a-zA-Z\d_]+)[^\S\r\n](?P<text>.*)')
# Rows where nothing matched come back as NaN in both columns; drop them.
split = split[['tag', 'text']].dropna(subset=['text'])
# Bar chart of the 20 most frequent first-mentioned handles.
split['tag'].value_counts().head(20).plot(kind='barh', figsize=(10, 8))
plt.title("Counts of tag first-mentioned", y=1.02);

# Clean

In [None]:
#Remove all @ tags
def remove_ats(text):
    """Strip every @-mention from ``text``.

    A mention is an ``@`` followed by one or more ASCII letters, underscores
    or digits. Whitespace around the mention is left untouched.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        ``text`` with all mentions replaced by the empty string.
    """
    # Raw string: \d must reach the regex engine rather than being treated as
    # an (invalid) string escape. No per-row printing; this runs for every message.
    return re.sub(r'@[a-zA-Z\d_]+', '', text)

#Remove all employee tags
#Tags occur at the end of the line with capital letters and prefix '-' or '^'
def remove_tag(text):
    """Strip a trailing employee tag such as ``^AB`` or ``-JK`` from ``text``.

    The tag is a ``^`` or ``-`` followed by one or more capital letters or
    digits at the very end of the text. Trailing whitespace after the tag is
    tolerated (and removed with it), because stripping a URL that followed the
    tag leaves a space behind that would otherwise hide the tag from ``$``.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        ``text`` without the trailing tag; unchanged when no tag is present.
    """
    # Raw string so that \^, \- and \d are regex escapes, not string escapes.
    return re.sub(r'[\^\-][A-Z\d]+\s*$', '', text)

#Remove URLS
def remove_urls(text):
    """Return ``text`` with every ``http(s)://`` or ``www.`` link stripped out.

    A link runs up to the next whitespace character; the whitespace around it
    is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

#Remove smileys
def remove_emoji(text):
    """Strip emoji and pictographic symbols from ``text``.

    Only symbol blocks are listed. The previous catch-all range
    U+24C2..U+1F251 also covered CJK ideographs, kana, Hangul and many other
    scripts, so messages in those languages were emptied; it is replaced by
    the specific symbol blocks below.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        ``text`` with every run of listed symbols removed.
    """
    emoji_pattern = re.compile("["
                            "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics (incl. flags), enclosed ideographs
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F700-\U0001F77F"  # alchemical symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\U00002600-\U000026FF"  # Miscellaneous Symbols
                            "\U00002702-\U000027B0"  # Dingbats
                            "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
                            "\U000024C2"             # circled M
                            "\U0000FE0F"             # emoji variation selector
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

#Remove newlines.
def remove_specialchars(text):
    """Collapse line breaks in ``text`` into a single space.

    Replacing a line break with nothing would glue the last word of one line
    to the first word of the next ("hello\\nworld" -> "helloworld") and
    corrupt tokenisation, so each run of CR/LF characters becomes one space.

    Args:
        text: the message to clean (a ``str``).

    Returns:
        ``text`` on a single line.
    """
    return re.sub(r'[\r\n]+', ' ', text)

#Remove only non-letters


In [None]:
# Run every raw message through the cleaning helpers, in this order:
# @-mentions, URLs, trailing employee tag, emoji, newlines.
df['clean_text'] = (
    df['text']
    .apply(remove_ats)
    .apply(remove_urls)
    .apply(remove_tag)
    .apply(remove_emoji)
    .apply(remove_specialchars)
)

In [None]:
# First 25 rows, to compare raw and cleaned text side by side.
df.head(25)

### Wrapup and count again

In [None]:
# Drop rows without cleaned text and report the remaining size.
df = df.dropna(subset='clean_text')
print(df.shape)
# Recompute the word counts, this time on the cleaned text, and plot them.
clean_tokens = df["clean_text"].str.split()
df["wordcount"] = clean_tokens.str.len()
df['wordcount'].hist(bins=20)

### Inspect special cases

In [None]:
pd.options.display.max_rows = 50
# First 20 messages with fewer than two words left after cleaning.
is_short = df["wordcount"] < 2
small = df[is_short].head(20)
small

In [None]:
#Remove remaining text not containing letters
# fullmatch anchors at both ends by itself. '*' rather than '+' also drops rows
# whose text became completely empty during cleaning (e.g. a message that was
# only an @-mention). Raw string so \s and \d are regex escapes.
# .copy() makes the result an independent frame, so columns assigned later are
# not written to a slice of the previous frame.
df = df[~df['clean_text'].str.fullmatch(r'[\s\d]*')].copy()

In [None]:
# Re-inspect the very short messages after the filter above.
df[df["wordcount"] < 2].head(20)

In [None]:
# The longest messages (more than 65 words) are only a handful of cases,
# which I accept for now.
df[df["wordcount"] > 65]

# Preprocessing

In [None]:
# Only tokenisation, POS tagging and lemmatisation are needed downstream,
# so the remaining pipeline components are switched off at load time.
disabled_components = ["parser", "ner", "textcat"]
nlp = en_core_web_sm.load(disable=disabled_components)


In [None]:
"""
I select for nouns, proper nouns and verbs since those will be the most useful for topic detection
in a customer service setting with the end goal of automation.
"""

def preprocess(texts, n_process=8):
    """Reduce each text to the lemmas of its content-bearing words.

    Every text is run through the module-level spaCy pipeline ``nlp``. Tokens
    are kept only when their coarse POS tag is PROPN, NOUN or VERB and they
    are not stop words; the lemmas of the kept tokens are joined with single
    spaces.

    Args:
        texts: iterable of strings (e.g. a pandas Series) to process.
        n_process: number of worker processes handed to ``nlp.pipe``. The
            default of 8 preserves the previous hard-coded value; pass 1 for
            small inputs where starting worker processes is not worthwhile.

    Returns:
        list[str]: one space-joined lemma string per input text, in input
        order. A text with no qualifying tokens yields ``""``.
    """
    keep_pos = {'PROPN', 'NOUN', 'VERB'}
    output = []
    for doc in nlp.pipe(texts, n_process=n_process):
        lemmas = (
            token.lemma_
            for token in doc
            if token.pos_ in keep_pos and not token.is_stop
        )
        output.append(" ".join(lemmas))
    return output


In [None]:
# Register tqdm's pandas integration. NOTE(review): the assignment below calls
# preprocess directly and no progress_* method is used in this cell, so
# presumably no progress bar is shown; confirm whether this call is still needed.
tqdm_notebook.pandas()
# Lemmatise and POS-filter the cleaned messages; stored as a new column.
df['processed_text'] = preprocess(df['clean_text'])
# Leftover smoke test on the first rows:
# test = preprocess(df.loc[:20,'clean_text'])


In [None]:
# Allow up to 100 characters per cell so the text columns stay readable.
pd.options.display.max_colwidth = 100

df

In [None]:
# Persist the frame, including the clean_text and processed_text columns.
# NOTE(review): the target path has no '.csv' extension; confirm this is intentional.
df.to_csv('data/ciphix_pre_processed')

# Topic model

In [None]:


# Creating a vectorizer
# - min_df=5 / max_df=0.9 are the document-frequency cut-offs for rare and
#   near-ubiquitous terms
# - token_pattern keeps tokens of at least three characters made of ASCII
#   letters and hyphens; it is a raw string so '\-' is a regex escape rather
#   than an invalid string escape
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)

# Learn the vocabulary and build the document-term count matrix.
data_vectorized = vectorizer.fit_transform(df["processed_text"])


In [None]:
# Used twice further down: as the number of NMF components (topics) and as the
# third argument of get_topics, where it is the number of top terms per topic.
topic_count = 10

In [None]:
# Fixed random_state so the factorisation, and therefore the topic numbering
# and weights shown below, is reproducible across kernel restarts.
nmf = NMF(n_components=topic_count, random_state=42)
# Rows: documents, columns: topic weights.
data_nmf = nmf.fit_transform(data_vectorized)

In [None]:
def get_topics(nmf, vectorizer, topic_count):
    """List the highest-weighted terms of every topic in a fitted model.

    Args:
        nmf: fitted model exposing ``components_`` (one row of term weights
            per topic).
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``components_``.
        topic_count: despite the name, the number of top terms to return for
            each topic (not the number of topics).

    Returns:
        A list with one entry per topic; each entry is a list of
        ``(term, weight)`` tuples sorted by descending weight.
    """
    # Look the vocabulary up once instead of once per reported term.
    feature_names = vectorizer.get_feature_names_out()
    res = []
    for topic in nmf.components_:
        # Indices of the largest weights, biggest first.
        top_idx = topic.argsort()[::-1][:topic_count]
        res.append([(feature_names[i], topic[i]) for i in top_idx])
    return res

In [None]:
# Top terms (with weights) for each fitted topic.
topic_classes = get_topics(
    nmf=nmf,
    vectorizer=vectorizer,
    topic_count=topic_count,
)

In [None]:
# One horizontal bar chart per topic in a two-column grid. The grid is sized
# from the number of topics, so changing topic_count neither drops topics nor
# leaves the layout hard-coded to 5 x 2.
n_cols = 2
n_rows = max(1, -(-len(topic_classes) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)

for topicnr, (ax, topic_class) in enumerate(zip(axs.flat, topic_classes)):
    resdf = pd.DataFrame(topic_class).rename(columns={0:'topic', 1:'score'})
    # Reverse so the highest-scoring term ends up at the top of the barh plot.
    resdf = resdf.set_index('topic').iloc[::-1]
    ax.set_title(f'topicnr={topicnr}')
    ax.barh(resdf.index, resdf['score'], align='center')

# Hide the spare axes that remain when the number of topics is odd.
for unused_ax in axs.flat[len(topic_classes):]:
    unused_ax.set_visible(False)
    

# Example inference new document

In [None]:
# Run one unseen document through the same cleaning, preprocessing,
# vectorising and NMF steps, reusing the already fitted models.
new_doc = pd.DataFrame(
    ["You should also treat your skills like cattle, not pets. Yes you specialized for 10yrs in a niche that is now threatened. Be grateful that you were able to milk that skill for 10yrs, but now it might be time to move on. There’s dignity in adaptation. It’s our human superpower."],
    dtype=str,
    columns=['text'],
)
display(new_doc)

new_doc['clean_text'] = (
    new_doc['text']
    .apply(remove_ats)
    .apply(remove_urls)
    .apply(remove_tag)
    .apply(remove_emoji)
    .apply(remove_specialchars)
)

new_doc['processed_text'] = preprocess(new_doc['clean_text'])

# transform only (no fitting): keep the vocabulary and topics learned from the corpus.
newdata_vectorized = vectorizer.transform(new_doc["processed_text"])
newdata_nmf = nmf.transform(newdata_vectorized)



In [None]:
# Show the lemma string that the new document was reduced to.
new_doc['processed_text']

In [None]:
# Index of the highest-weighted topic for each transformed document.
predicted_topic = [doc_weights.argsort()[-1] for doc_weights in newdata_nmf]

print(predicted_topic)

In [None]:
# Topic weights of the new document (output of nmf.transform).
print(newdata_nmf)

In [None]:
# Terms that describe the topic predicted for the first new document.
top_terms = [term for term, _weight in topic_classes[predicted_topic[0]]]
print(top_terms)