In [1]:
import pandas as pd
from pprint import pprint

import os, re, warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now


# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Gensim
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

# spacy for lemmatization
import spacy
# Run `python -m spacy download en_core_web_sm` in a terminal once to install the spaCy model
nlp = spacy.load("en_core_web_sm")
# NLTK Stop words
from nltk.corpus import stopwords
# Libraries for text preprocessing
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')
from nltk.tokenize import RegexpTokenizer
#nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer
from nltk.stem.porter import *
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91709\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91709\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Extend the NLTK English stop-word list with extra words to ignore.
# Note: re-running this cell appends the same words again (harmless for membership tests).
stop_words.extend(["would", "could", "use", "also", "make", "get", "look"])

In [3]:
# Load the data and keep only the rows whose CauseGroup is 'Ergonomic'.
# .copy() detaches the filtered rows from the full sheet so the column
# assignment below is not a chained assignment on a slice.
df = pd.read_excel(r'Mondelez_Cleaned.xlsx')
df = df.loc[df['CauseGroup'] == 'Ergonomic'].copy()

# Cast to str *before* deriving `documents`: preprocess() is mapped over
# documents['EventDescription'], so blank or non-text cells must already be
# strings by the time they get there.
df['EventDescription'] = df['EventDescription'].astype('str')

data_text = df[['EventDescription']].copy()
data_text['index'] = data_text.index
documents = data_text

In [4]:
# Pre-processing: work on the event descriptions as a plain list of strings
data = df.EventDescription.values.tolist()

# Remove e-mail addresses (raw string so the regex escapes are not parsed
# as invalid Python string escapes)
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse new lines and any other run of whitespace into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

pprint(data[:1])

['MOVING EMPTY TRAY, FELT SNAP, CAUGHT SELF FROM FALLING LEFT CALF STRAIN']


### Tokenize words and Clean-up text

In [5]:
# Tokenise every cleaned description; deacc=True is passed through to
# gensim's simple_preprocess.

def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each sentence is coerced to ``str`` and tokenised with
    ``gensim.utils.simple_preprocess(..., deacc=True)``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

data_words = [*sent_to_words(data)]

stemmer = SnowballStemmer('english')


def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Snowball-stem the result."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)


def preprocess(text):
    """Tokenise ``text`` and return the lemmatized, stemmed tokens.

    Tokens of three characters or fewer and tokens found in gensim's
    STOPWORDS are dropped before lemmatize_stemming() is applied.
    """
    return [
        lemmatize_stemming(tok)
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 3 and tok not in gensim.parsing.preprocessing.STOPWORDS
    ]


# One token list per event description
processed_docs = documents['EventDescription'].map(preprocess)


### Build the bigram and trigram models

In [6]:
# Phrase detectors: `bigram` is trained on the processed documents and
# `trigram` on those documents after the bigram model has been applied.
# NOTE(review): a bytes delimiter matches the gensim 3.x API; newer gensim
# releases expect a str - confirm against the installed version.
bigram = gensim.models.Phrases(
    processed_docs,
    min_count=1,
    threshold=2,  # higher threshold fewer phrases.
    delimiter=b' ',
)
trigram = gensim.models.Phrases(
    bigram[processed_docs],
    threshold=1,
    delimiter=b' ',
)

# Phraser wrappers: the faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return one token list per document with stop words removed.

    Each element of ``texts`` may be a raw string or an already tokenised
    list/tuple of words. Either way it is passed through
    ``simple_preprocess`` and every token found in the module-level
    ``stop_words`` is dropped.
    """
    # Set membership instead of scanning the stop-word list for every token.
    stop_set = set(stop_words)

    def as_text(doc):
        # Join token sequences explicitly; tokenising the repr() of a list
        # only works while no token contains characters that repr() escapes.
        if isinstance(doc, str):
            return doc
        if isinstance(doc, (list, tuple)):
            return " ".join(str(tok) for tok in doc)
        return str(doc)

    return [
        [word for word in simple_preprocess(as_text(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply ``bigram_mod`` to every document and return the results as a list."""
    phrased = []
    for doc in texts:
        phrased.append(bigram_mod[doc])
    return phrased

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document."""
    phrased = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        phrased.append(trigram_mod[with_bigrams])
    return phrased

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only the allowed parts of speech.

    Every document (a sequence of tokens) is joined with spaces, parsed by the
    module-level spaCy pipeline ``nlp``, and reduced to the ``lemma_`` of each
    token whose ``pos_`` is in ``allowed_postags``.

    Tag reference: https://spacy.io/api/annotation
    """
    def keep_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(sent) for sent in texts]

In [8]:
# Remove Stop Words
# NOTE(review): processed_docs holds tokens that preprocess() has already
# lemmatized and stemmed, so everything below operates on stems - confirm intended.
data_words_nostops = remove_stopwords(processed_docs)

# Reload the spaCy pipeline without the parser and NER components;
# lemmatization() only reads token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Form Bigrams
data_words_bigrams = make_bigrams(data_lemmatized)

# Form Trigrams (bigram model followed by trigram model)
data_words_trigrams = make_trigrams(data_lemmatized)


In [9]:
# Keep only the detected phrases of each document. Phrases are joined with a
# single space, so a token with exactly one space is a bigram and a token with
# exactly two spaces is a trigram.
# The sentences have already been through bigram_mod / trigram_mod, so they
# are filtered directly instead of being pushed through the slower `bigram`
# Phrases model again (the trigram loop did that too).
# NOTE(review): assumes re-applying `bigram` to already-phrased tokens merges
# nothing further - confirm if the phrase models are retrained differently.
new_bigram = [
    [tok for tok in sent if tok.count(' ') == 1]
    for sent in data_words_bigrams
]
new_trigram = [
    [tok for tok in sent if tok.count(' ') == 2]
    for sent in data_words_trigrams
]

### Create dictionary & corpus
* Bag-of-words dictionary - maps each unique word to an integer id; the corpus then stores, for every document, how many times each word id occurs in that document.

In [10]:
# Create Dictionary - each word given an id
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized

# Term Document Frequency - for each row, convert the words into a "bow" format
corpus = [id2word.doc2bow(text) for text in texts]

# Human-readable preview of the first two documents as (word, count) pairs.
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and thrown away. `token_id` avoids shadowing the builtin `id`.
pprint([[(id2word[token_id], freq) for token_id, freq in doc_bow] for doc_bow in corpus[:2]])

# Build LDA model: 3 topics over the bag-of-words corpus, seeded via random_state
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha='auto',
    per_word_topics=True,
    random_state=100,
)

In [11]:
# Formatted topics from the fitted model; parsed into rows in the next cell
lis = lda_model.print_topics()

In [12]:
# Getting data ready for the flat file read by Power BI: one row per
# (topic, weight, word). The second element of each entry in `lis` is parsed
# as 'weight*"word"' terms separated by '+'.
final = []
for ind, li in enumerate(lis):
    if not li:
        continue
    for term in str(li[1]).split("+"):
        term = term.strip()
        if not term:
            continue
        # partition() always yields exactly one weight and one word part
        weight, _, word = term.partition("*")
        final.append([ind, weight.strip(), word])

# NOTE: this rebinds `df`, replacing the event DataFrame loaded earlier.
cols = ["topic", "weights", "word"]
df = pd.DataFrame(final, columns=cols)

# Drop the quotes and the padding left around each word by the '+' split, so
# the exported words carry no stray leading or trailing spaces.
df['word'] = df['word'].str.replace('"', "").str.strip()

df['weights'] = df['weights'].astype(float)
df['weights'] = df['weights'] * 1000

df.to_csv('lda.csv', index=False)

### Direct n-gram frequency

In [13]:
# Build adjacent-word bigrams and trigrams for every document. Each entry of
# bigram_list / trigram_list holds the space-joined n-grams of one document.
bigram_list = []
trigram_list = []
for text in data_words_nostops:
    unigrams = text

    # zip() stops at the shortest input, so no explicit end trimming is needed.
    # The loop names avoid reusing `bigram` / `trigram`, the phrase models.
    bigrams = [" ".join(pair) for pair in zip(unigrams, unigrams[1:])]
    bigram_list.append(bigrams)

    trigrams = [" ".join(triple) for triple in zip(unigrams, unigrams[1:], unigrams[2:])]
    trigram_list.append(trigrams)

## Unigram

In [14]:
from collections import Counter

# Flatten into a separate list. Rebinding data_words_nostops to the flat list
# would make a re-run of this cell, or of the n-gram cell above, iterate over
# the characters of each word instead of the words of each document.
unigram_list = [word for doc in data_words_nostops for word in doc]
counts = Counter(unigram_list)
# most_common() already returns (word, count) pairs by descending count
uni_counts = counts.most_common()

In [15]:
# One row per (word, count) pair from uni_counts; the columns are named in the next cell
uni_df = pd.DataFrame(uni_counts)

In [16]:
# Name the two columns and write the unigram frequencies to unigram.csv
uni_df.columns = ["Word", "Frequency"]
uni_df.to_csv('unigram.csv', index= False)

## Bigram

In [17]:
from collections import Counter

# Flatten under a new name: rebinding bigram_list to the flat list would make
# a second run of this cell split every bigram string into characters.
bigram_flat = [phrase for doc_bigrams in bigram_list for phrase in doc_bigrams]
counts2 = Counter(bigram_flat)
# most_common() already returns (bigram, count) pairs by descending count
bi_counts = counts2.most_common()
# Passing the column names to the constructor also works when there are no bigrams
bi_df = pd.DataFrame(bi_counts, columns=['Words', "Frequency"])
bi_df.to_csv('bigram.csv', index=False)

## Trigram

In [18]:
# Flatten under a new name: rebinding trigram_list to the flat list would make
# a second run of this cell split every trigram string into characters.
trigram_flat = [phrase for doc_trigrams in trigram_list for phrase in doc_trigrams]
counts3 = Counter(trigram_flat)
# most_common() already returns (trigram, count) pairs by descending count
tri_counts = counts3.most_common()
# Passing the column names to the constructor also works when no document is
# long enough to produce a trigram
tri_df = pd.DataFrame(tri_counts, columns=['Words', "Frequency"])
tri_df.to_csv('trigram.csv', index=False)

## K-means clustering

In [19]:
# One space-joined string per document. TfidfVectorizer treats every element
# of its input as a document, so flattening to single words made each
# "document" one word long and K-means ended up clustering individual words
# rather than event descriptions.
data_lemmatized1 = [" ".join(doc) for doc in data_lemmatized]

In [20]:
# TF-IDF features of the texts in data_lemmatized1
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data_lemmatized1)

true_k = 3
# random_state makes the single initialisation (n_init=1) repeatable, so a
# Restart & Run All reproduces the same clusters.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=100)
model.fit(X)

# For every centroid, feature indices ordered from largest to smallest weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Top 10 terms per cluster as [cluster number (1-based), word]
word_list = []
for i in range(true_k):
    for ind in order_centroids[i, :10]:
        word_list.append([i+1, terms[ind]])


In [22]:
# Export the cluster terms: one row per [cluster number, word] entry of word_list
cluster_cols = ["Cluster No", "Word"]
df5 = pd.DataFrame(word_list)
df5.columns = cluster_cols
df5.to_csv('cluster.csv', index= False)