In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# Load data

In [None]:
# The file is read without a header row; its column is named "text".
df = pd.read_csv("data/ciphix.csv", names=["text"], header=None)

In [None]:
# Show the frame that was just loaded.
df

# Look around

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of whitespace-separated tokens in each raw message.
df["wordcount"] = df["text"].str.split().str.len()

In [None]:
# Distribution of words per raw message. Title and axis labels are set so the
# figure can be told apart from the histogram drawn later on the cleaned text;
# the trailing ';' hides the return value's repr.
df['wordcount'].hist(bins=20).set(
    title="Words per message (raw text)",
    xlabel="Word count",
    ylabel="Number of messages",
);

In [None]:
# Show the frame again, now with the wordcount column.
df

### Conclusions
Looking around, I noticed:
- all messages start with some kind of username handle (`@...`)
- some contain multiple username handles
- some end with a different kind of tag: `^` followed by a user acronym
- messages are written in different languages
- smileys
- URLs

In [None]:
# Let's check how often each handle appears as the FIRST mention of a message.
# The handle class includes '_' (as remove_ats does): without it a leading
# handle such as "@Some_Handle" could not match and a later handle in the same
# message was counted as the first one instead.
# Named groups give the 'tag' / 'text' columns directly; [^\S\r\n] is
# "whitespace other than a line break".
split = (
    df['text']
    .str.extract(r'@(?P<tag>[a-zA-Z\d_]+)[^\S\r\n](?P<text>.*)')
    .dropna(subset=['text'])  # rows where no handle matched
)
split['tag'].value_counts().head(20).plot(kind='barh', figsize=(10, 8))
plt.title("Counts of tag first-mentioned", y=1.02);

# Clean + preprocess

In [None]:
#Remove all @ tags
def remove_ats(text):
    """Return ``text`` with every @-handle removed.

    A handle is an '@' followed by one or more ASCII letters, digits or
    underscores. Surrounding whitespace is left untouched.

    Args:
        text: The message to clean (str).

    Returns:
        The message without its @-handles.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return re.sub(r'@[a-zA-Z\d_]+', '', text)

#Remove all employee tags
#Tags occur at the end of the text with capital letters and prefix '-' or '^'
def remove_tag(text):
    """Return ``text`` without a trailing employee tag such as '^KC' or '-JB'.

    A tag is '^' or '-' followed by uppercase letters/digits. It is removed
    when only whitespace (or nothing) follows it, so a tag that is left with a
    trailing space is still caught. The whitespace itself is kept.

    Args:
        text: The message to clean (str).

    Returns:
        The message without its trailing tag.
    """
    # Raw string avoids the invalid '\^', '\-' and '\d' escapes; the lookahead
    # tolerates trailing whitespace without consuming it.
    return re.sub(r'[\^\-][A-Z\d]+(?=\s*$)', '', text)

#Remove URLs
def remove_urls(text):
    """Return ``text`` with every http://, https:// or www. link deleted.

    A link runs from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

#Remove smileys
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only symbol/emoji code point ranges are listed. A single catch-all range
    from U+24C2 to U+1F251 is deliberately NOT used: it spans CJK ideographs,
    kana and Hangul, so it would delete ordinary text written in those scripts.

    Args:
        text: The message to clean (str).

    Returns:
        The message without the listed emoji/symbol characters.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
        "\U0001F100-\U0001F1FF"  # enclosed alphanumerics, regional-indicator flags
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U000024C2"             # circled M
        "\U00002600-\U000027BF"  # Miscellaneous Symbols + Dingbats
        "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
        "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
        "]+"
    )
    return emoji_pattern.sub('', text)

#Remove newlines.
def remove_specialchars(text):
    """Return ``text`` with each run of line breaks replaced by one space.

    Replacing (instead of deleting) the line break keeps the words on either
    side separate: "hello\\nworld" becomes "hello world", not "helloworld".
    Carriage returns are treated as line breaks too.

    Args:
        text: The message to clean (str).

    Returns:
        The message on a single line.
    """
    return re.sub(r'[\r\n]+', ' ', text)

#Remove only non-letters


In [None]:
# Run every cleaner over each raw message, innermost call first:
# handles -> URLs -> trailing tag -> emoji -> newlines.
df['clean_text'] = df['text'].apply(
    lambda message: remove_specialchars(
        remove_emoji(remove_tag(remove_urls(remove_ats(message))))
    )
)

In [None]:
# Compare raw and cleaned text side by side for the first 25 rows.
df.head(25)

### Wrapup and count again

In [None]:
# Drop rows without cleaned text and recompute the word count on the cleaned text.
# .assign builds the column on the new frame rather than writing into a filtered
# one, and subset is given as a list so it also works on older pandas versions.
df = df.dropna(subset=['clean_text']).assign(
    wordcount=lambda frame: frame['clean_text'].str.split().str.len()
)
print(df.shape)
# Labelled so this figure can be told apart from the raw-text histogram.
df['wordcount'].hist(bins=20).set(
    title="Words per message (cleaned text)",
    xlabel="Word count",
    ylabel="Number of messages",
);

### Inspect special cases

In [None]:
# Allow up to 50 rows to be rendered, then look at the first 20 messages
# that have fewer than two words.
pd.set_option('display.max_rows', 50)
small = df[df["wordcount"] < 2].head(20)
small

In [None]:
#Remove remaining rows whose cleaned text is empty or only whitespace/digits.
# - raw string: '\s' and '\d' are invalid escapes in a normal string literal
# - fullmatch already anchors both ends, so '^' and '$' are not needed
# - '*' instead of '+' also drops texts that cleaning reduced to ''
df = df[~df['clean_text'].str.fullmatch(r'[\s\d]*')]

In [None]:
# Look again at the first 20 messages with fewer than two words.
df[df["wordcount"] < 2].head(20)

In [None]:
# Messages with more than 65 words seem to be only a few cases; accepted for now.
df[df["wordcount"] > 65]

# NLP