In [95]:
import os
import numpy as np
import pandas as pd
import ast
import string
import random
from googleapiclient.discovery import build
import string
import random
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import defaultdict
from gensim import corpora, models, similarities



# Cleaning

In [57]:
dataFb = pd.read_csv("data/facebookArticlesClean.csv", encoding="ISO-8859-1")

In [58]:
dataG = pd.read_csv("data/googlePagesClean.csv", encoding="ISO-8859-1")

In [59]:
data = dataG[dataG.columns[2:]].append(dataFb[dataFb.columns[2:]], ignore_index=True)

In [60]:
def filtering(s):
    s = s.lower()
    s = ''.join([c for c in s if c in string.printable]) # Get rid of non ascii whitespace chars, e.g. japanese
    s = s.strip() # Get rid of whitespace AFTER removing chars
    return s

data["text"] = data["TRANSLATED_CONTENT"].apply(filtering)

In [61]:
ast.literal_eval(data.iloc[0]["text"])

['official name codonopsis radix english name tangshen scientific name codonpsis pilosula franch nannf codonopsis pilosula nannf var modesta nannf l t shen or codonopsis tangshen oliv chinese read more',
 'official name cinnamomi ramulus english name cassia twig scientific name cinnamomum cassia presl chinese name ?? chinese phonetic name gu zh?',
 'official name angelicae dahuricae radix english name dahurian angelica root scientific name angelica dahurica fisch ex hoffm benth et hook f angelica dahurica fisch ex hoffm benth read more',
 'official name fructus forsythiae english name weeping forsythia capsule scientific name forsythia suspensa thunb vahl chinese name ?? chinese phonetic name lian qiao source the dried fruit read more',
 'benefits of yunzhi and lingzhi  there are approximately 100 thousand known mushroom species in the world which have been a rich medicinal reservoir that continues today read more']

### Interpret string representation of array

In [67]:
# This entry was dodgy
data.drop(733, inplace=True)

In [69]:
data["text_list"] = data["text"].apply(ast.literal_eval)

In [74]:
data.drop(["CONTENT", "TRANSLATED_CONTENT", "FIRST_PARAGRAPH"], axis=1, inplace=True)

In [76]:
def explode(s):
    return " ".join(s)

In [78]:
data["text_full"] = data["text_list"].apply(explode)

In [80]:
data.drop("text", axis=1, inplace=True)

### remove punctuation

In [84]:
# All puncuation characters to be ignored
exclude = set(string.punctuation)
# In string form
excludestr = ''.join(exclude)

def splitcol(s):
    s = ''.join([c for c in s if c not in exclude]) # Removes punctuation
    s = [x for x in s.split(" ") if x != ""] # Accounts for double whitespace
    return s

# Create a column that is every word, split. 
data["text_split"] = data["text_full"].apply(splitcol)

In [87]:
data.head()

Unnamed: 0,text_list,text_full,text_split
0,[official name codonopsis radix english name t...,official name codonopsis radix english name ta...,"[official, name, codonopsis, radix, english, n..."
1,[tell us your story how did you become interes...,tell us your story how did you become interest...,"[tell, us, your, story, how, did, you, become,..."
2,[please make sure you enter a valid and comple...,please make sure you enter a valid and complet...,"[please, make, sure, you, enter, a, valid, and..."
3,[the hong kong automotive association and auto...,the hong kong automotive association and autos...,"[the, hong, kong, automotive, association, and..."
4,[electric vehicle charging stations are poppin...,electric vehicle charging stations are popping...,"[electric, vehicle, charging, stations, are, p..."


In [90]:
def spaceseparate(s):
    s = ' '.join(s)
    return s

data["text_full"] = data["text_split"].apply(spaceseparate)

In [94]:
documents = data["text_full"].values
documentsSplit = data["text_split"].values

words = []

# Convert to one giant list
for i in documentsSplit:
    words.extend(i)
    
uniqueWords = set(words)

print("Number of documents in the corpus:\t", len(data))
print("Number of words in the corpus:\t\t", len(words))
print("Number of UNIQUE words in the corpus: \t", len(uniqueWords))
print("Mean document Length: \t\t\t", len(words)/len(data))

Number of documents in the corpus:	 2607
Number of words in the corpus:		 1699654
Number of UNIQUE words in the corpus: 	 64525
Mean document Length: 			 651.957805907173


# Preparation - Creating the Corpos

In this notebook the data is further prepared. The main task is to identify the tokenisation of the words and lemmatise them.

### Lemmatisation Examples:

- Guns -> Gun
- Swimming -> Swim
- Swum -> Swim

### Pos_tag the words, giving us tags that indicate "noun", "verb", etc

In [100]:
data["text_tagged"] = data["text_split"].apply(nltk.pos_tag)

In [101]:
data.head()

Unnamed: 0,text_list,text_full,text_split,text_tagged
0,[official name codonopsis radix english name t...,official name codonopsis radix english name ta...,"[official, name, codonopsis, radix, english, n...","[(official, JJ), (name, NN), (codonopsis, NN),..."
1,[tell us your story how did you become interes...,tell us your story how did you become interest...,"[tell, us, your, story, how, did, you, become,...","[(tell, VB), (us, PRP), (your, PRP$), (story, ..."
2,[please make sure you enter a valid and comple...,please make sure you enter a valid and complet...,"[please, make, sure, you, enter, a, valid, and...","[(please, VB), (make, VB), (sure, JJ), (you, P..."
3,[the hong kong automotive association and auto...,the hong kong automotive association and autos...,"[the, hong, kong, automotive, association, and...","[(the, DT), (hong, NN), (kong, RB), (automotiv..."
4,[electric vehicle charging stations are poppin...,electric vehicle charging stations are popping...,"[electric, vehicle, charging, stations, are, p...","[(electric, JJ), (vehicle, NN), (charging, VBG..."


In [102]:
def get_wordnet_pos(treebank_tag):
    """Convert the NLTK tokenisation to wordnet tags."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [111]:
# Instantiate a lemmatizer
wnLem = nltk.stem.WordNetLemmatizer()

In [115]:
def lemmatise(s):
    """Lemmatization function that expects pos information (whether a word is verb, noun, etc.)"""
    """Input: [word, posInfo]"""
    #print(s)
    #print([x[0] for x in s])
    
    def lemWord(w):
        tag = get_wordnet_pos(w[1])
        if tag != None:
            return wnLem.lemmatize(w[0], pos=tag)
        else:
            return wnLem.lemmatize(w[0])
        

    return [lemWord(x) for x in s]
    

for i in data.index[:1]:
    print()
    print(lemmatise(data.loc[i]["text_tagged"]))


['official', 'name', 'codonopsis', 'radix', 'english', 'name', 'tangshen', 'scientific', 'name', 'codonpsis', 'pilosula', 'franch', 'nannf', 'codonopsis', 'pilosula', 'nannf', 'var', 'modesta', 'nannf', 'l', 't', 'shen', 'or', 'codonopsis', 'tangshen', 'oliv', 'chinese', 'read', 'more', 'official', 'name', 'cinnamomi', 'ramulus', 'english', 'name', 'cassia', 'twig', 'scientific', 'name', 'cinnamomum', 'cassia', 'presl', 'chinese', 'name', 'chinese', 'phonetic', 'name', 'gu', 'zh', 'official', 'name', 'angelicae', 'dahuricae', 'radix', 'english', 'name', 'dahurian', 'angelica', 'root', 'scientific', 'name', 'angelica', 'dahurica', 'fisch', 'ex', 'hoffm', 'benth', 'et', 'hook', 'f', 'angelica', 'dahurica', 'fisch', 'ex', 'hoffm', 'benth', 'read', 'more', 'official', 'name', 'fructus', 'forsythiae', 'english', 'name', 'weep', 'forsythia', 'capsule', 'scientific', 'name', 'forsythia', 'suspensa', 'thunb', 'vahl', 'chinese', 'name', 'chinese', 'phonetic', 'name', 'lian', 'qiao', 'source', 

In [117]:
data["text_lem"] = data["text_tagged"].apply(lemmatise)

In [157]:
# We only care about this column now
corpus = data[["text_lem"]]

In [158]:
corpus.to_csv("data/corpus_df.csv")

In [159]:
frequency = defaultdict(int)
for doc in data["text_lem"]:
    for word in doc:
        frequency[word] += 1
        
filt = dict(frequency)

f = sorted(filt.items(), key=lambda x: x[1])
f.reverse()

In [160]:
frequency["health"]

3399

In [161]:
def filterSingletons(doc):
    return [x for x in doc if frequency[x] > 1]

corpus["text_lem"] = corpus["text_lem"].apply(filterSingletons)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [162]:
corpus.head()

Unnamed: 0,text_lem
0,"[official, name, codonopsis, radix, english, n..."
1,"[tell, u, your, story, how, do, you, become, i..."
2,"[please, make, sure, you, enter, a, valid, and..."
3,"[the, hong, kong, association, and, organize, ..."
4,"[electric, vehicle, charge, station, be, pop, ..."


In [164]:
# update frequencies
frequency = defaultdict(int)
for doc in corpus["text_lem"]:
    for word in doc:
        frequency[word] += 1
        
filt = dict(frequency)

f = sorted(filt.items(), key=lambda x: x[1])
f.reverse()

In [166]:
stoplist = set('for a of the and to in it be'.split())

def filterStopWords(doc):
    return [x for x in doc if x not in stoplist]

corpus["text_lem"] = corpus["text_lem"].apply(filterStopWords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [168]:
dictionary = corpora.Dictionary(corpus["text_lem"].values)
print(dictionary)
for i in dictionary.keys()[:5]:
    print()
    print(i)
    print(dictionary.get(i))

Dictionary(26867 unique tokens: ['stirfry', 'gmail', 'retirement', 'freaky', 'faulty']...)

20246
stirfry

16149
gmail

7167
retirement

16699
freaky

20349
faulty


In [169]:
def vectorise(s):
    return dictionary.doc2bow(s)
    
corpus["text_vec"] = corpus["text_lem"].apply(vectorise)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [173]:
# The text_vec column is the actual corpus, words are encoded as in the dictionary
corpora.MmCorpus.serialize('data/corpus.mm', corpus["text_vec"])

In [174]:
dictionary.save('data/dictionary.dict')

In [175]:
corpus.to_csv("data/corpus_df.csv")

In [178]:
corpus

Unnamed: 0,text_lem,text_vec
0,"[official, name, codonopsis, radix, english, n...","[(0, 1), (1, 1), (2, 3), (3, 1), (4, 1), (5, 1..."
1,"[tell, u, your, story, how, do, you, become, i...","[(4, 6), (5, 2), (26, 2), (34, 1), (39, 1), (5..."
2,"[please, make, sure, you, enter, valid, comple...","[(74, 1), (139, 1), (147, 2), (179, 1), (268, ..."
3,"[hong, kong, association, organize, an, electr...","[(4, 1), (17, 1), (92, 1), (119, 1), (179, 1),..."
4,"[electric, vehicle, charge, station, pop, up, ...","[(17, 2), (26, 1), (39, 2), (79, 1), (141, 1),..."
5,"[traditional, chinese, medicine, tcm, have, ov...","[(4, 4), (6, 3), (26, 6), (34, 2), (44, 1), (4..."
6,"[by, select, one, following, topic, you, can, ...","[(6, 4), (17, 1), (73, 1), (92, 1), (114, 1), ..."
7,"[western, physiology, urine, consider, fluid, ...","[(4, 8), (5, 2), (6, 6), (17, 2), (21, 1), (23..."
8,"[coriolus, versicolor, also, know, versicolor,...","[(4, 14), (5, 2), (6, 6), (18, 1), (26, 10), (..."
9,"[mushroom, health, supplement, that, combine, ...","[(4, 9), (5, 5), (6, 13), (18, 12), (26, 5), (..."
