In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Load data

In [None]:
# The file is read without a header row; the single column is named 'text'.
df = pd.read_csv(
    'data/ciphix.csv',
    names=['text'],
    header=None,
)

# Look around

In [None]:
# Display the freshly loaded frame (single 'text' column) for a first look.
df

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Number of whitespace-separated tokens in each message.
df["wordcount"] = (
    df["text"]
    .str.split()  # split on runs of whitespace
    .str.len()    # length of each resulting token list
)

In [None]:
# Histogram of the per-message word counts, using 20 bins.
df['wordcount'].hist(bins=20)

In [None]:
# Display the frame again to inspect it alongside the 'wordcount' column.
df

### Conclusions
Looking around, I noticed:
- all messages start with some kind of username handle
- some have multiple username handles
- some end with a different kind of tag: `^` followed by the user's acronym
- different languages
- smileys
- URLs

In [None]:
# Let's check how often each handle appears as the first handle in a message.
# Regex groups: 0 = '@handle' plus its trailing separator, 1 = the handle,
# 2 = the separator (one whitespace character that is not CR/LF),
# 3 = the rest of the line after the separator.
split = (
    df['text']
    .str.extract(r'(@([a-zA-Z\d]+)([^\S\r\n]))(.*)')
    .rename(columns={1: 'tag', 3: 'text'})[['tag', 'text']]
    .dropna(subset='text')  # rows where the pattern did not match are all-NaN
)

# Horizontal bar chart of the 20 most frequent first handles.
split['tag'].value_counts().head(20).plot(kind='barh', figsize=(10, 8))
plt.title("Counts of tag first-mentioned", y=1.02);

# Clean + preprocess

In [None]:
# Display the frame once more before the cleaning steps below.
df

In [None]:
def remove_ats(text):
    """Remove every ``@handle`` mention from ``text``.

    A handle is an ``@`` followed by one or more ASCII letters or digit
    characters. Only the handle itself is deleted; surrounding whitespace is
    left in place. Characters outside that set (for example an underscore)
    end the match, so anything after them stays in the text.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        ``text`` with all matched handles removed.
    """
    # Raw string is required: in a plain literal '\d' is an invalid string
    # escape, which newer Python versions warn about at compile time.
    at_pattern = re.compile(r'@[a-zA-Z\d]+')
    return at_pattern.sub('', text)

def remove_tag(text):
    """Remove every ``^tag`` marker (the employee tags) from ``text``.

    A tag is a literal caret followed by one or more ASCII letters or digit
    characters. Only the tag itself is deleted; surrounding whitespace is
    left in place.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        ``text`` with all matched tags removed.
    """
    # Raw string is required: in a plain literal '\^' and '\d' are invalid
    # string escapes, which newer Python versions warn about at compile time.
    tag_pattern = re.compile(r'\^[a-zA-Z\d]+')
    return tag_pattern.sub('', text)

def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is either ``http://`` / ``https://`` or ``www.`` followed by a run
    of non-whitespace characters. The match is replaced by the empty string,
    so whitespace around it is preserved.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        ``text`` with all matched URLs removed.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Clean the message bodies in `split`: strip @handles, then URLs, then ^tags.
# The assignment aligns on the index, so rows that were dropped from `split`
# end up as NaN in 'clean_text'.
df['clean_text'] = (
    split['text']
    .apply(remove_ats)
    .apply(remove_urls)
    .apply(remove_tag)
)

In [None]:
# Preview the first 25 rows, including the new 'clean_text' column.
df.head(25)