In [2]:
import operator
import os

import gensim
import numpy as np
from gensim.models import KeyedVectors
from scipy import spatial

# Directory that holds the pre-trained word-vector files.
# Set the WORDVEC_MODEL_DIR environment variable to point somewhere else; the
# original local path is kept as the default so existing setups keep working.
# os.path.join(..., '') guarantees a trailing separator, so code that builds a
# file path by concatenating model_path with a file name stays correct.
model_path = os.path.join(
    os.environ.get('WORDVEC_MODEL_DIR', '/Users/javid/projects/enaibl/models/wordvec/'),
    '',
)

In [3]:
def load_wordvec_model(modelName, modelFile, flagBin):
    """Load a word2vec-format vector file from the model directory.

    Args:
        modelName: Human-readable name, used only in the progress messages.
        modelFile: File name of the vectors, resolved against ``model_path``.
        flagBin: Passed through as ``binary=``; True for the binary word2vec
            format, False for the text format.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for the file.
    """
    print('Loading ' + modelName + ' model...')
    # os.path.join instead of string concatenation: works whether or not
    # model_path ends with a path separator.
    model_file_path = os.path.join(model_path, modelFile)
    model = KeyedVectors.load_word2vec_format(model_file_path, binary=flagBin)
    print('Finished loading ' + modelName + ' model...')
    return model

# Load the GoogleNews Word2Vec vectors (binary file, hence flagBin=True); the
# resulting model_word2vec is what the examples at the end of the notebook use.
model_word2vec = load_wordvec_model('Word2Vec', 'GoogleNews-vectors-negative300.bin.gz', True)
# Optional alternative, left disabled: FastText vectors from a text-format .vec file (flagBin=False).
#model_fasttext = load_wordvec_model('FastText', 'fastText_wiki_en.vec', False)

Loading Word2Vec model...
Finished loading Word2Vec model...


In [4]:
# BUILD YOUR OWN TAXONOMY BASED ON LDA and MANUAL DATA EXPLORATION
topic_taxonomy = {
    "executive leadership manager CEO":
    {
        "Executive Appointment":  "appointed named picked joins",
        "Executive Resignation":  "quits resigns retires departs leaves",
        "Executive Compensation": "paid compensated receives bonus payraise payout dividend"
    },
    "business company":
    {
        "Business Expansion":    "to acquire buy acquisition merge invest purchase spend subsidize win secure contract venture",
        "Business Reorganization": "bankruptcy withdraw divest exit pullout sale sell outflow writeoff split",
        "Business Partnership":  "partner team collaboration alliance deal contract",
        "Operational Outage":    "operational outage disruption halt end cancellation delay",
        "Operations Resume":     "restore restart resume maintain operations",
        "Business Statement":    "shareholder issues releases statement results policy denial confirmation",
        "Business Settlement":   "settlement refund moneyback compensate reimburse",
        "Business Competition":  "tops leads to rival competitor entrant competitive advantage"
    },
    "location branch plant store":
    {
        "Facility Opening":    "open expand facility",
        "Facility Relocation": "closure move shutdown relocate"
    },
    "employee worker":
    {
        "Workforce Expansion":  "jobs hiring openings recruitment looking to hire",
        "Workforce Downsizing": "job cut layoff lay off reorganization termination downsizing",
        "Workforce Discontent": "discontent dissatisfaction labor crew union strike dispute unrest boycott revolt",
        "Workforce Agreement":  "pension deal agreement contract negotiation arbitration resolve"
    },
    "marketing":
    {
        "PR Meeting":    "host presentation summit convention conference symposium meeting show exposition",
        "PR Ceremony":   "loyalty award recognize celebrity honor ceremony mark anniversary prize",
        "PR Marketing":  "advertising commercial campaign exclusive offer",
        "PR Charity":    "donate sponsor contribute benefit grant aid"
    },
    "legal"	:
    {
        "Legal Lawsuit":        "conviction guilty sue battle in court ruling sentencing class action lawsuit litigation settlement appeals trial testify",
        "Legal Investigation":  "law FBI probe allegation indictment investigation face charges grilled over",
        "Legal Scandal":        "scandal revelation whistleblower accusing accusation"
    },
    "regulation standards":
    {
        "Regulatory Impact":    "compliance safety feds audit fine debt owe penalty privacy discrimination certification government blessing taxation import duties tarriff legislation bill",
        "Environmental Regulation": "environment climate change water quality pollution emission oil spill"
    },
    "incident" :
     {
        "Financial Crime":        "insider trading activity financial crime corruption fraud bribe FCPA money laundering",
        "Crime General":          "police shot killed murder robbery burglary steal thief damage arrest kidnapping smuggling drugs cartel",
        "Security Breach":        "security data confidential information breach hacking",
        "Safety Incident":        "safety accident fire gas leak arson radiation",
        "Healthcare Casualty":    "injury death casualty ambulance healthcare epidemic hurt",
        "Automotive Emergency":   "accident car Tesla crash bus overturned",
        "Aviation Emergency":     "airplane helicopter crash landing skids off runway",
        "Violent Incident":       "war military strike killed shot terrorist attack shooting explosion bombing",
        "Natural Disaster":       "earthquake flood rainstorm snowstorm hurricane tornado"
     },
    "earnings revenue sales stock profits income":
    {
        "Revenue Growth":  "up post gain increase growth climb rise jump surge soar boost",
        "Revenue Decline": "decline down drop fall lose value decrease stumble plunge slip plummet"
    },
    "market expansion":
    {
        "New Technology":   "new technology system development invention patent driverless self-driving automation",
        "New Project":      "new project initiative field discovery exploration",
        "New Product":      "new unveil release announce launch build introduce produced concept model"

    },
    "product":
    {
        "Product Failure":  "issue problem recall citing failure withdrawal discontinue defect flaw"
    },
    "rating forecast outlook prediction rank" :
    {
        "Forecast Positive": "upgrade outperform strong growing positive optimistic high up profitable",
        "Forecast Negative": "pessimistic warn tough times decline weak negative down low downgrade fail drop"
    },
    "consumer feedback sentiment study":
    {
        "Feedback Positive": "positive best like praise upbeat success improving customer service",
        "Feedback Negative": "slam dislike complaint bad negative perception poor service harbinger stop",
        "Opinion Feedback":  "feeling question maybe opinion safe interview view blog says quote complaint comment"
    },
    "sport":
    {
        "Sport": "sports tournament match score"
    }
}

In [5]:
def vec_similarity(input1, input2, vectors):
    """Cosine similarity between the summed word vectors of two strings.

    Each string is split on whitespace, every token that ``vectors`` knows is
    looked up and the vectors are summed; tokens missing from ``vectors`` are
    ignored.

    Args:
        input1: First whitespace-separated string.
        input2: Second whitespace-separated string.
        vectors: Mapping-like object supporting ``vectors[token]`` and raising
            ``KeyError`` for unknown tokens. If it exposes ``vector_size`` that
            is used as the dimensionality, otherwise 300 is assumed.

    Returns:
        The cosine similarity, or 0 when it is undefined because one of the
        inputs has no known tokens (or its vectors sum to zero).
    """
    # Prefer the model's advertised dimensionality; 300 is the historical default.
    dim = getattr(vectors, 'vector_size', 300)

    summed = []
    for text in (input1, input2):
        total = np.zeros(dim)
        for token in text.split():
            try:
                total += vectors[token]
            except KeyError:
                # Out-of-vocabulary token: contributes nothing to the sum.
                continue
        summed.append(total)

    # Cosine similarity is undefined for a zero vector; report "no similarity"
    # instead of letting NaN (or a division warning) escape to the caller.
    if not (np.any(summed[0]) and np.any(summed[1])):
        return 0

    result = 1 - spatial.distance.cosine(summed[0], summed[1])
    # A real NaN check; the previous ``result is 'nan'`` could never be true.
    if np.isnan(result):
        return 0
    return result

In [6]:
def vocab_check(vectors, words):
    """Return the words that are present in the model's vocabulary.

    Args:
        vectors: Word-vector model. Its vocabulary is read from
            ``key_to_index`` when that attribute exists (gensim 4.x) and from
            ``vocab`` otherwise (gensim 3.x).
        words: Iterable of candidate words.

    Returns:
        List of the whitespace-stripped words found in the vocabulary, in
        input order.
    """
    vocabulary = getattr(vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = vectors.vocab

    # Strip before the membership test so the word that is checked is the same
    # word that is returned.
    stripped_words = (word.strip() for word in words)
    return [word for word in stripped_words if word in vocabulary]

In [7]:
def calc_similarity(input1, input2, vectors):
    """Similarity between two strings using a particular word-vector model.

    Both strings are split on whitespace, reduced to the distinct words that
    the model knows (via ``vocab_check``), and compared with
    ``vectors.n_similarity``.

    Args:
        input1: First string.
        input2: Second string.
        vectors: Word-vector model providing ``n_similarity``.

    Returns:
        The value of ``vectors.n_similarity`` for the two word lists, or 0.0
        when either string contains no in-vocabulary word.
    """
    # sorted(set(...)) de-duplicates like the previous set-based code but gives
    # a fixed word order, so results do not depend on set iteration order.
    s1words = sorted(set(vocab_check(vectors, input1.split())))
    s2words = sorted(set(vocab_check(vectors, input2.split())))

    # n_similarity cannot score an empty word list; treat that as "no similarity".
    if not s1words or not s2words:
        return 0.0

    return vectors.n_similarity(s1words, s2words)

In [8]:
def classify_topics(input, vectors, top_n=3):
    """Score a string against every topic in ``topic_taxonomy``.

    For each taxonomy group, every label is scored by the similarity between
    the input and "<group context words> <label keywords>"; only the
    best-scoring label of each group is kept. The kept labels are then ranked
    across groups.

    Args:
        input: Text to classify (the name shadows the builtin but is kept for
            backward compatibility with keyword callers).
        vectors: Word-vector model forwarded to ``calc_similarity``.
        top_n: Number of results to return; defaults to 3.

    Returns:
        List of up to ``top_n`` ``(label, score)`` tuples, highest score first.
    """
    feed_score = {}
    for key, value in topic_taxonomy.items():
        if not value:
            # A group without labels has nothing to contribute.
            continue

        label_scores = {
            label: float(calc_similarity(input, (key + ' ' + keywords).strip(), vectors))
            for label, keywords in value.items()
        }
        # max() keeps the first label on ties, matching a stable descending sort.
        best_label = max(label_scores, key=label_scores.get)
        feed_score[best_label] = label_scores[best_label]

    ranked = sorted(feed_score.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[:top_n]

In [9]:
if __name__ == '__main__':
    # Example 1: classify a headline with the Word2Vec model and print the
    # ranked (label, score) pairs returned by classify_topics.
    output1 = classify_topics('Walmart’s ambitious plan to beat Amazon on free one-day shipping is here', model_word2vec)
    print(output1)
    # Example 2: a second headline, also scored with model_word2vec (no
    # FastText model is loaded in this notebook).
    # Original author's note: when a FastText model is used instead, the input
    # string needs to be non-capitalized.
    output2 = classify_topics('White House Has Plan To Force Tillerson Out Replace With CIA Chief', model_word2vec)
    print(output2)
    

[('Business Competition', 0.4393536150455475), ('Forecast Positive', 0.4269000291824341), ('New Product', 0.4226176142692566)]
[('Executive Resignation', 0.2827241122722626), ('Regulatory Impact', 0.26587608456611633), ('Business Statement', 0.2430115044116974)]
