In [72]:
import os
import numpy as np
import pandas as pd
import ast
import string
import random
import string
import random
import nltk
import pickle
import wordcloud
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import defaultdict
from gensim import corpora, models, similarities

In [73]:
# THIS NEEDS TO BE RUN ONCE ON ANY GIVEN SYSTEM
#nltk.download('wordnet')

# Cleaning

In [74]:
def removeDodgyEntries(data):
    """
    Drop all rows that are not able to be literal_eval'ed
    """
    dodgyEntries = []
    for i in data.index:
        try:
            ast.literal_eval(data.loc[i]["text"])
        except:
            dodgyEntries.append(i)
    print("Removed {} rows due to literal_eval issues".format(len(dodgyEntries)))
    return data.drop(dodgyEntries)

    
def loadScrapeOutput():
    """
    Load the various scarping outpus, join them in one DF, return
    """
    dataFb = pd.read_csv("data/facebookArticlesClean.csv", encoding="ISO-8859-1")
    dataG = pd.read_csv("data/googlePagesClean.csv", encoding="ISO-8859-1")
    data = dataG[dataG.columns[2:]].append(dataFb[dataFb.columns[2:]], ignore_index=True)
    
    # Very basic initial scraping
    def filtering(s):
        s = s.lower()
        s = ''.join([c for c in s if c in string.printable]) # Get rid of non ascii whitespace chars, e.g. japanese
        s = s.strip() # Get rid of whitespace AFTER removing chars
        return s
    

    data["text"] = data["TRANSLATED_CONTENT"].apply(filtering)
    data = removeDodgyEntries(data)
    
    # Evaluate the string representation of the list
    data["text_list"] = data["text"].apply(ast.literal_eval)
    
    data.drop(["CONTENT", "TRANSLATED_CONTENT", "FIRST_PARAGRAPH"], axis=1, inplace=True)
    
    return data

data = loadScrapeOutput()

Removed 1 rows due to literal_eval issues


In [75]:
def explode(s):
    return " ".join(s)

data["text_full"] = data["text_list"].apply(explode)
data.drop("text", axis=1, inplace=True)

### remove punctuation

In [76]:
def splitcol(s):
    # All puncuation characters to be ignored
    exclude = set(string.punctuation)
    # In string form
    excludestr = ''.join(exclude)

    s = ''.join([c for c in s if c not in exclude]) # Removes punctuation
    s = [x for x in s.split(" ") if x != ""] # Accounts for double whitespace
    return s

def spaceseparate(s):
    s = ' '.join(s)
    return s

# Create a column that is every word, split. 
data["text_split"] = data["text_full"].apply(splitcol)
data["text_full"] = data["text_split"].apply(spaceseparate)

In [79]:
def printCorpusInfo(data):
    documents = data["text_full"].values
    documentsSplit = data["text_split"].values

    words = []

    # Convert to one giant list
    for i in documentsSplit:
        words.extend(i)

    uniqueWords = set(words)

    print("Number of documents in the corpus:\t", len(data))
    print("Number of words in the corpus:\t\t", len(words))
    print("Number of UNIQUE words in the corpus: \t", len(uniqueWords))
    print("Mean document Length: \t\t\t", len(words)/len(data))
    
printCorpusInfo(data)

Number of documents in the corpus:	 2607
Number of words in the corpus:		 1699654
Number of UNIQUE words in the corpus: 	 64525
Mean document Length: 			 651.957805907173


# Preparation - Creating the Corpos

In this notebook the data is further prepared. The main task is to identify the tokenisation of the words and lemmatise them.

### Lemmatisation Examples:

- Guns -> Gun
- Swimming -> Swim
- Swum -> Swim

### Pos_tag the words, giving us tags that indicate "noun", "verb", etc

In [78]:
data["text_tagged"] = data["text_split"].apply(nltk.pos_tag)

In [80]:
def get_wordnet_pos(treebank_tag):
    """Convert the NLTK tokenisation to wordnet tags. This allows them to be interpreted by the wordnet lemmatiser"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [81]:
def lemmatise(s):
    # Instantiate a lemmatizer
    wnLem = nltk.stem.WordNetLemmatizer()
    """Lemmatization function that expects pos information (whether a word is verb, noun, etc.)"""
    """Input: [word, posInfo]"""
    #print(s)
    #print([x[0] for x in s])
    
    def lemWord(w):
        tag = get_wordnet_pos(w[1])
        if tag != None:
            return wnLem.lemmatize(w[0], pos=tag)
        else:
            return wnLem.lemmatize(w[0])

    return [lemWord(x) for x in s]

In [82]:
data["text_lem"] = data["text_tagged"].apply(lemmatise)

In [83]:
# We only care about this column now
corpus = data[["text_lem"]]

In [84]:
corpus.to_csv("data/corpus_df.csv")

In [87]:
def getRankedFrequency(data):
    frequency = defaultdict(int)
    for doc in data["text_lem"]:
        for word in doc:
            frequency[word] += 1

    filt = dict(frequency)

    f = sorted(filt.items(), key=lambda x: x[1])
    f.reverse()
    return f

In [88]:
getRankedFrequency(corpus)

[('the', 75229),
 ('and', 55626),
 ('be', 50469),
 ('to', 48533),
 ('a', 46750),
 ('of', 41707),
 ('in', 28074),
 ('for', 19283),
 ('you', 18483),
 ('it', 18305),
 ('that', 15767),
 ('your', 15561),
 ('with', 14878),
 ('have', 13391),
 ('on', 11718),
 ('or', 10127),
 ('can', 9979),
 ('this', 9096),
 ('i', 8167),
 ('from', 7900),
 ('at', 6705),
 ('we', 6248),
 ('by', 5948),
 ('but', 5862),
 ('not', 5670),
 ('if', 5560),
 ('more', 5383),
 ('an', 5374),
 ('all', 5119),
 ('also', 4876),
 ('will', 4860),
 ('hong', 4813),
 ('say', 4788),
 ('our', 4762),
 ('use', 4730),
 ('do', 4572),
 ('get', 4529),
 ('make', 4497),
 ('one', 4493),
 ('up', 4051),
 ('so', 4023),
 ('kong', 4011),
 ('when', 3924),
 ('about', 3772),
 ('which', 3726),
 ('out', 3682),
 ('time', 3641),
 ('my', 3628),
 ('help', 3572),
 ('they', 3570),
 ('their', 3440),
 ('health', 3399),
 ('like', 3337),
 ('take', 3332),
 ('body', 3248),
 ('there', 3238),
 ('some', 3188),
 ('may', 3086),
 ('her', 3054),
 ('people', 3006),
 ('he', 29

In [89]:
# These words only appear <5 times throughout the entire corpus, 
# they are not able to influence the LDA in a meaningful way, 
# so they are removed to aid in computation
def filterRareWords(doc):
    return [x for x in doc if frequency[x] > 4]

corpus["text_lem"] = corpus["text_lem"].apply(filterRareWords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [90]:
getRankedFrequency(corpus)

[('the', 75229),
 ('and', 55626),
 ('be', 50469),
 ('to', 48533),
 ('a', 46750),
 ('of', 41707),
 ('in', 28074),
 ('for', 19283),
 ('you', 18483),
 ('it', 18305),
 ('that', 15767),
 ('your', 15561),
 ('with', 14878),
 ('have', 13391),
 ('on', 11718),
 ('or', 10127),
 ('can', 9979),
 ('this', 9096),
 ('i', 8167),
 ('from', 7900),
 ('at', 6705),
 ('we', 6248),
 ('by', 5948),
 ('but', 5862),
 ('not', 5670),
 ('if', 5560),
 ('more', 5383),
 ('an', 5374),
 ('all', 5119),
 ('also', 4876),
 ('will', 4860),
 ('hong', 4813),
 ('say', 4788),
 ('our', 4762),
 ('use', 4730),
 ('do', 4572),
 ('get', 4529),
 ('make', 4497),
 ('one', 4493),
 ('up', 4051),
 ('so', 4023),
 ('kong', 4011),
 ('when', 3924),
 ('about', 3772),
 ('which', 3726),
 ('out', 3682),
 ('time', 3641),
 ('my', 3628),
 ('help', 3572),
 ('they', 3570),
 ('their', 3440),
 ('health', 3399),
 ('like', 3337),
 ('take', 3332),
 ('body', 3248),
 ('there', 3238),
 ('some', 3188),
 ('may', 3086),
 ('her', 3054),
 ('people', 3006),
 ('he', 29

In [91]:
# Using wordcloud STOPWORDS, a fairly extenive stopword list
def filterStopWords(doc):
    return [x for x in doc if x not in wordcloud.STOPWORDS]

corpus["text_lem"] = corpus["text_lem"].apply(filterStopWords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [92]:
dictionary = corpora.Dictionary(corpus["text_lem"].values)
print(dictionary)
for i in dictionary.keys()[:5]:
    print()
    print(i)
    print(dictionary.get(i))

Dictionary(13515 unique tokens: ['official', 'name', 'codonopsis', 'radix', 'english']...)

0
official

1
name

2
codonopsis

3
radix

4
english


In [94]:
getRankedFrequency(corpus)

[('will', 4860),
 ('hong', 4813),
 ('say', 4788),
 ('use', 4730),
 ('make', 4497),
 ('one', 4493),
 ('kong', 4011),
 ('time', 3641),
 ('help', 3572),
 ('health', 3399),
 ('take', 3332),
 ('body', 3248),
 ('may', 3086),
 ('people', 3006),
 ('day', 2953),
 ('go', 2895),
 ('find', 2697),
 ('good', 2609),
 ('food', 2605),
 ('work', 2414),
 ('youre', 2340),
 ('need', 2327),
 ('know', 2309),
 ('look', 2280),
 ('include', 2247),
 ('new', 2211),
 ('year', 2070),
 ('life', 2037),
 ('even', 2030),
 ('way', 2019),
 ('best', 2004),
 ('well', 1988),
 ('many', 1958),
 ('offer', 1849),
 ('want', 1810),
 ('u', 1784),
 ('feel', 1780),
 ('eat', 1764),
 ('come', 1763),
 ('cause', 1751),
 ('skin', 1716),
 ('dont', 1676),
 ('high', 1662),
 ('start', 1646),
 ('back', 1634),
 ('much', 1634),
 ('keep', 1595),
 ('image', 1542),
 ('see', 1532),
 ('first', 1528),
 ('give', 1517),
 ('experience', 1511),
 ('product', 1484),
 ('study', 1483),
 ('woman', 1465),
 ('try', 1445),
 ('healthy', 1444),
 ('every', 1423),
 

In [95]:
def vectorise(s):
    return dictionary.doc2bow(s)
    
corpus["text_vec"] = corpus["text_lem"].apply(vectorise)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [96]:
# The text_vec column is the actual corpus, words are encoded as in the dictionary
corpora.MmCorpus.serialize('data/corpus.mm', corpus["text_vec"])

In [97]:
dictionary.save('data/dictionary.dict')

In [98]:
corpus.to_csv("data/corpus_df.csv")

In [99]:
corpus

Unnamed: 0,text_lem,text_vec
0,"[official, name, codonopsis, radix, english, n...","[(0, 4), (1, 16), (2, 3), (3, 2), (4, 4), (5, ..."
1,"[tell, u, story, become, interested, public, h...","[(9, 4), (37, 2), (40, 1), (46, 1), (47, 2), (..."
2,"[please, make, sure, enter, valid, complete, n...","[(1, 1), (148, 2), (193, 1), (194, 1), (195, 1..."
3,"[hong, kong, association, organize, event, ele...","[(193, 1), (201, 1), (203, 2), (204, 4), (205,..."
4,"[electric, vehicle, charge, station, pop, ever...","[(40, 2), (78, 1), (101, 1), (112, 1), (142, 1..."
5,"[traditional, chinese, medicine, tcm, 2000, ye...","[(6, 1), (13, 3), (31, 1), (45, 1), (49, 1), (..."
6,"[select, one, following, topic, learn, chinese...","[(13, 4), (26, 1), (73, 1), (90, 1), (135, 1),..."
7,"[western, physiology, urine, consider, fluid, ...","[(13, 6), (19, 5), (28, 1), (30, 9), (31, 1), ..."
8,"[coriolus, versicolor, know, versicolor, name,...","[(1, 2), (11, 7), (13, 6), (31, 10), (32, 1), ..."
9,"[mushroom, health, supplement, combine, benefi...","[(1, 2), (6, 1), (11, 8), (13, 13), (31, 17), ..."
