# POC Model

__Install packages__

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
from nltk.corpus import stopwords
import glob
from datetime import datetime
import statistics 
import jmespath
import json
from rake_nltk import Rake
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from gensim.test.utils import datapath
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


### Converting Tree to JSON format

In [2]:
# Please run this code cell once only as multiple runs will affect the same file 
with open('Clean_mind_map.json') as json_file:
    data = json.load(json_file)
    
itemsKeyedById = {i["ID"]: i for i in data}

#iterate through each item in the `myJson` list.
for item in data[1:]:
    #does the item have a parent?
    if "Parent" in item:
        #get the parent item
        parent = itemsKeyedById[item['Parent']]
        #if the parent item doesn't have a "children" member, 
        #we must create one.
        if "children" not in parent:
            parent["children"] = []
        #add the item to its parent's "children" list.
        parent["children"].append(item)
topLevelItems = [item for item in data if "Parent" not in item]
taxonomy_data = {'data': data}

In [3]:
with open('data.json', 'w') as outfile:
    json.dump(taxonomy_data, outfile)

Examples are found on this link: http://jmespath.org/tutorial.html

In [4]:
expression = jmespath.compile("data[?Title =='Lean of Peak'].children[].Title")
expression.search(data)

### Data Sources Integration

In [5]:
data = []
for filename in glob.glob('*.csv'):
    data.append(filename)
data

['COPA_blog.csv',
 'COPA_forum_replies.csv',
 'COPA_forum_threads.csv',
 'COPA_Wiki.csv']

In [81]:
# magazine = pd.read_csv(data[])
# magazine_date_list = list(magazine['Magazine'].unique())
# # Get the month of magazine from its name
# # Since the the COPA magazines are released monthly, so it only has year and month
# # in its date column. In this case we just assume all magazines were released on the 
# # first day of that month.
# magazine_date_dict = {magazine_date_list[0]: '2015-01-01', magazine_date_list[1]: '2019-01-01',
#                       magazine_date_list[2]: '2015-07-01', magazine_date_list[3]: '2019-04-01',
#                       magazine_date_list[4]: '2019-06-01', magazine_date_list[5]: '2018-01-01',
#                       magazine_date_list[6]: '2017-09-01', magazine_date_list[7]: '2016-09-01',
#                       magazine_date_list[8]: '2017-11-01', magazine_date_list[9]: '2015-05-01',
#                       magazine_date_list[10]: '2015-09-01', magazine_date_list[11]: '2006-09-01', 
#                       magazine_date_list[12]: '2018-06-01', magazine_date_list[13]: '2019-07-01',
#                       magazine_date_list[14]: '2017-04-01', magazine_date_list[15]: '2016-04-01',
#                       magazine_date_list[16]: '2018-11-01', magazine_date_list[17]: '2015-09-01',
#                       magazine_date_list[18]: '2019-03-01', magazine_date_list[19]: '2006-11-01',
#                       magazine_date_list[20]: '2016-01-01', magazine_date_list[21]: '2019-05-01',
#                       magazine_date_list[22]: '2016-03-01', magazine_date_list[23]: '2016-05-01',
#                       magazine_date_list[24]: '2006-07-01', magazine_date_list[25]: '2017-06-01',
#                       magazine_date_list[26]: '2018-07-01', magazine_date_list[27]: '2018-09-01',
#                       magazine_date_list[28]: '2017-01-01', magazine_date_list[29]: '2012-11-01'}
# magazine['Magazine'] = magazine['Magazine'].map(magazine_date_dict)
# magazine.columns = ['Date', 'Title', 'Content', 'Author']
# magazine['Like'] = 0
# magazine['Comment'] = 0
# magazine['Resource'] = 'Magazine'

In [82]:
blog = pd.read_csv(data[2], encoding = 'unicode_escape')
blog = blog[['Date', 'Title', 'Body', 'Author', 'Like','Comment']]
blog.columns = ['Date', 'Title', 'Content', 'Author', 'Like','Comment']
blog['Date'] = [d.split(' ')[0].split('/')[2] + '-' + d.split(' ')[0].split('/')[0] for d in blog['Date']]
blog['Date'] = blog['Date'].map(lambda x: x + '-01')
blog['Resource'] = 'Blog'

In [6]:
pdList = [magazine, wiki, blog]
# Combine three datasets together
df = pd.concat(pdList)
df.reset_index(drop = True, inplace = True)

In [7]:
#Ranking the source, blog is the most important source so have 3
raw_data = {'Resource': ['Wiki','Blog','Magazine'], 'SourceScore': [1, 2, 3]}
Source_tb = pd.DataFrame(raw_data, columns = ['Resource', 'SourceScore'])
#add source score to the main table
df = pd.merge(df, Source_tb, left_on = 'Resource', right_on = 'Resource')

# Generate Attributes

__Generate Recency Score__

In [8]:
def  Calculate_RecencyScore(date):
    '''
    Recency Rate = log( 1 + 1/(days between the post date and current date + 1))
    '''
    date_datetime = datetime.strptime(date, '%Y-%m-%d').date()
    rececncy_rate = math.log10(1/(((datetime.date(datetime.now()))-date_datetime).days+1)+1)
    return rececncy_rate
    

df['RecencyRate'] = df['Date'].map(lambda y: Calculate_RecencyScore(y))

__Generate Author Score__

In [9]:
def Calculate_AuthorScore(author):
    '''
    AuthorScore = Number of posts of specified content source * pre-determined weight
    '''
    author_score = df[df['Author'] == author]['SourceScore'].sum()
    return author_score

author_list = list(df['Author'].unique())
df['AuthorScore'] = df['Author'].map(lambda x: Calculate_AuthorScore(x), author_list)

__Generate Content Score__

In [7]:
forum = pd.read_csv(data[1])
data = forum.FormattedBody.values.tolist()

# Create a model for topic classification based on the dataset we have 
# (It may cause some problems because the model will be applied to the dataset later to give each content a topic.)
# (The problem can be solved when we get the entire dataset since we can take part of the contents as corpus.)

# Get some stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           passes=10)

pprint(lda_model.print_topics(num_words = 10))
# Get 20 related topics
doc_lda = lda_model[corpus]

[(0,
  '0.047*"car" + 0.038*"com" + 0.036*"drive" + 0.023*"love" + 0.021*"family" + '
  '0.021*"kid" + 0.019*"wife" + 0.018*"live" + 0.018*"eat" + 0.017*"home"'),
 (1,
  '0.018*"turn" + 0.018*"video" + 0.016*"approach" + 0.014*"set" + '
  '0.013*"mode" + 0.013*"switch" + 0.012*"course" + 0.011*"hold" + '
  '0.011*"head" + 0.010*"autopilot"'),
 (2,
  '0.015*"state" + 0.013*"report" + 0.011*"require" + 0.011*"say" + '
  '0.011*"rule" + 0.009*"case" + 0.009*"may" + 0.009*"provide" + 0.009*"claim" '
  '+ 0.008*"legal"'),
 (3,
  '0.066*"pilot" + 0.025*"fly" + 0.019*"training" + 0.016*"accident" + '
  '0.015*"flight" + 0.013*"cirrus" + 0.013*"cap" + 0.012*"airplane" + '
  '0.011*"plane" + 0.011*"aircraft"'),
 (4,
  '0.052*"fly" + 0.025*"get" + 0.023*"go" + 0.018*"time" + 0.014*"take" + '
  '0.014*"good" + 0.013*"plane" + 0.013*"much" + 0.013*"ice" + 0.013*"really"'),
 (5,
  '0.072*"cirrus" + 0.047*"aircraft" + 0.043*"sell" + 0.041*"jet" + '
  '0.041*"owner" + 0.038*"buy" + 0.032*"new" + 0.02

In [90]:
# Save model to disk.
temp_file = datapath("lda.model")
lda_model.save(temp_file)

# Load lda.model
lda = gensim.models.ldamodel.LdaModel.load(temp_file)

In [14]:
# pd.DataFrame(lda_model.print_topics(num_words = 10), columns = ['Topic', 'Keywords']).to_csv('LDA_vector.csv')

__Normalization__

In [10]:
def Normalization(col):
    nomolized_col = (df[col] - df[col].mean()) / df.loc[:, col].std()
    return nomolized_col

df['RecencyRate'] = Normalization('RecencyRate')
df['AuthorScore'] = Normalization('AuthorScore')

In [11]:
df.head()

Unnamed: 0,Date,Title,Content,Author,Like,Comment,Resource,SourceScore,RecencyRate,AuthorScore
0,2015-01-01,President's Column,JANUARY FEBRUARY 20154CIRRUS PILOTAs this is b...,,0,0,Magazine,3,-0.465959,-1.209951
1,2015-01-01,COPA News,JANUARY FEBRUARY 20156CIRRUS PILOTWith this is...,,0,0,Magazine,3,-0.465959,-1.209951
2,2015-01-01,Regional News,JANUARY FEBRUARY 201512CIRRUS PILOTby GIL WILL...,GIL WILLIAMSON,0,0,Magazine,3,-0.465959,0.21105
3,2015-01-01,Cirrus Perspective,JANUARY FEBRUARY 201518CIRRUS PILOTIts hard to...,,0,0,Magazine,3,-0.465959,-1.209951
4,2015-01-01,Member Spotlight,JANUARY FEBRUARY 201522CIRRUS PILOTCirrus Pilo...,KIM BLONIGEN,0,0,Magazine,3,-0.465959,0.21105


__Input Searching query__

For Search Results Ranking

- "Cirrus Parachute First Responders"
- "Different between AATD and FTD"
- "Activate vector to final"
- "Turbo normalize vs turbo charge"
- "Shooting ILS approach"

In [10]:
taxonomy_terms = [tree['Title'] for tree in taxonomy_data['data'][1:]]

topic_vector = {}
for i in range(len(taxonomy_terms)):
    topic_info = taxonomy_data['data'][1:][i] # Sub level of all terms other than COPA
    title = re.findall(r'Title\': \'(.*?)\'', str(topic_info)) # Extract all children titles under specific term
    if len(title) > 1: # If the term is among the bottom level, then just assign itself to the term
        topic_vector[taxonomy_terms[i].lower()] = title[1:]
    else:
        topic_vector[taxonomy_terms[i].lower()] = [taxonomy_terms[i]]

Think about which fuzzywuzzy method we should use:
For each vector in topic_vector: which level we should match to.
What kind of vector we want to get in the end? 
Current version: Combine all the match vectors together.


In [11]:
query = "Shooting ILS approach"

In [12]:
threshold = 80  # Maybe adjusted to improve performance

In [13]:
# Compare query to each leaf node, if the partial match score is higher than the threshold, then return a combination vector
# that contains all words in those leaf nodes.
def fuzzywuzzy_match(query,threshold):
    
    # Generate a list contains all leaf nodes, each sublist contains all words in that leaf node.
    leaf_node = list()
    for item in topic_vector:
        if len(topic_vector[item]) == 1:
            leaf_node.append(topic_vector[item][0].split())
            
    reference_vector = list()
    
    for node in leaf_node:
        query_len = len(query)
        node_len = len(node)
        
        if query_len >= node_len: # If query is longer
            for i in range(node_len):
                score = fuzz.partial_ratio(node[i],query)
                if score > threshold:
                    reference_vector.append(node) 
                    
        else: # If leaf node is longer
            for j in range(query_len):
                score = fuzz.partial_ratio(query[j],node)
                if score > threshold:
                    reference_vector.append(node)
    
    reference_vector = list(list(i) for i in set(map(tuple, reference_vector))) # Deduplication
    
    return reference_vector

In [14]:
fuzzywuzzy_match(query,threshold)

[['Not', 'Over', 'a', 'Runaway'],
 ['Ground', 'controlled', 'approach', '(GCA)'],
 ['LOC', 'Approach'],
 ['Terminal', 'Radar', 'Approach', 'Control', '(TRACON)'],
 ['Instrument', 'Approach', 'Procedures', 'Charts', '(IAPs)'],
 ['GPSS', 'Approach'],
 ['Precision', 'approach', 'radar', '(PAR)'],
 ['Straight-In', 'ILS', 'Approach'],
 ['Straight-In', 'Back', 'Course', 'Approach'],
 ['Over', 'a', 'Runaway'],
 ['approach', 'with', 'vertical', 'guidance', '(APV)'],
 ['LNAV/VNAV', 'approaches'],
 ['ILS', 'Approach'],
 ['Joint', 'Precision', 'Approach', 'and', 'Landing', 'System', '(JPALS)'],
 ['Experience', 'in', 'type'],
 ['Straight-In', 'VOR', 'Approach'],
 ['Surveillance', 'radar', 'approach', '(SRA)'],
 ['Straight-In', 'LOC', 'Approach'],
 ['Cirrus', 'Approach'],
 ['Back', 'Course', 'Approach']]

In [9]:
def extract_keywords(search_query):
    '''
    Get the most important keyword from searching query 
    '''
    stop_words = stopwords.words('english')
    stop_words.append('vs')
    r = Rake(stopwords = stop_words)
    
    r.extract_keywords_from_text(search_query)
    rank = r.get_ranked_phrases_with_scores()
    itemMaxValue = max(rank, key=lambda x: x[0])[0]
    
    # Find the most important keyword
    listOfKeys = list()
    for value, key in rank:
        if value == itemMaxValue:
            listOfKeys.append(key.lower())
    
    return listOfKeys

In [10]:
# search_query = ["Cirrus Parachute First Responders", "Difference between AATD and FTD", "Activate vector to final", "Turbo normalize vs turbo charge", "Shooting ILS approach"]
# extract_keywords("Difference between AATD and FTD")

In [12]:
def exact_match(search_query):
    '''
    The searching query can be exact matched by taxonomy tree.
    '''
    if search_query in taxonomy_terms:
        taxonomy_vector = topic_vector[search_query.lower()]
    return taxonomy_vector

In [13]:
# exact_match('Tire Brands')

In [14]:
# Create a vector that stores all the phrases(more than one word) that should be 
# token as a whole.
def generate_phrases_tokenizer(topic_vector): # Return a list of tuples containing phrases.
    all_multiword_tokens = list()
    for item in topic_vector: 
        for phrase in topic_vector[item]:
            phrase = phrase.replace('(','') # Eliminate '(' and ')' in the phrase
            phrase = phrase.replace(')','')
           #phrase = re.sub(r'( \(.*\))','',phrase) #exclude all contents in '()'
            phrase_lower = phrase.lower() # Convert all characters into lower case.
            #phrase_lower.replace('-', ' ')
            word_lst = phrase_lower.split()
            if len(word_lst) >= 2: 
                word_tuple = tuple(word_lst)
                all_multiword_tokens.append(word_tuple)  
    all_multiword_tokens = list(set(all_multiword_tokens)) # Deduplication
    tokenizer = MWETokenizer(all_multiword_tokens,separator=' ')
    return tokenizer

In [15]:
def any_match(search_query,tokenizer):
    '''
    Any words in searching query can be exact matched by taxonomy tree.
    '''
    # Split input string into words or phrases according to taxonomy tree.
    words = tokenizer.tokenize(search_query.split())
    taxonomy_vector = []
    for word in words:
        try:
            taxonomy_vector.append(topic_vector[word])
        except:
            continue
    return taxonomy_vector

In [16]:
phrase_tokenizer = generate_phrases_tokenizer(topic_vector)
any_match("Turbo normalize vs turbo charge", phrase_tokenizer)

[['Turbo']]

In [17]:
def keyword_match(keywords):
    '''
    The most important keyword of searching query can be matched by taxonomy tree.
    '''
    taxonomy_vector = []
    for keyword in keywords:
        try:
            taxonomy_vector.append(topic_vector[keyword])
        except:
            continue
    return taxonomy_vector

In [18]:
# keywords = extract_keywords("Engine doesn't work.")
# keyword_match(keywords), keywords

In [19]:
def topic_match(search_query):
    '''
    Extract the topic of searching query by LDA model trained by forum posts.
    '''
    texts1 = [[word for word in doc.lower().split() if word not in stop_words] for doc in [search_query]]
    corpus1 = [id2word.doc2bow(t) for t in texts1]
    result = lda_model.get_document_topics(corpus1)
    LDA_topic = max(result[0], key=lambda x: x[1])[0]
    return LDA_topic

In [82]:
# topic_match('Turbo normalize vs turbo charge'), lda_model.print_topics(num_words = 10)[12]

(12,
 (12,
  '0.101*"altitude" + 0.083*"high" + 0.065*"climb" + 0.041*"low" + 0.034*"performance" + 0.032*"gun" + 0.030*"turbo" + 0.028*"level" + 0.028*"foot" + 0.027*"rate"'))