# IEMS308 HW4
## Author: Taige Hong

In [1]:
import numpy as np
import re
import math
import nltk
import os

from datetime import datetime, timedelta
from collections import Counter
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, query
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Template questions, one per supported question type. Incoming questions are
# scored against each of these templates to decide which handler to run.
(p1, p2, p3, p4) = (
    "Which company went bankrupt in?",
    "What affects GDP?",
    "What percentage of drop or increase is associated with?",
    "Who is the CEO of?",
)

In [3]:
# English stop words from NLTK, stored as a set for fast membership checks.
STOP = set(stopwords.words("english"))


def remove_stop(t):
    """Return ``t`` with stop words removed.

    The text is split on whitespace and a token is dropped when its
    lower-cased form is in ``STOP``. Surviving tokens keep their original
    casing and are re-joined with single spaces. Punctuation attached to a
    token is part of the token, so e.g. ``"in?"`` is not treated as ``"in"``.
    """
    return " ".join(word for word in t.split() if word.lower() not in STOP)

Use cosine similarity to determine which question is asked.

In [4]:
def similarity(q, sq):
    """Return the cosine similarity between two questions.

    Each text has its stop words removed (``remove_stop``), every remaining
    whitespace-separated token is Porter-stemmed, and the lower-cased result
    is split into ``\\w+`` terms. The two term-count vectors are then compared
    with the cosine measure.

    Parameters
    ----------
    q : str
        The question being classified.
    sq : str
        The template question to compare against.

    Returns
    -------
    float
        Cosine similarity of the two term-count vectors. ``0.0`` is returned
        when either text has no terms left after preprocessing (for example
        an empty string or a question made only of stop words), instead of
        dividing by zero.
    """
    stemmer = nltk.PorterStemmer()

    def term_counts(text):
        # Same pipeline for both inputs: stop-word removal -> stemming -> \w+ terms.
        stemmed = " ".join(stemmer.stem(word) for word in remove_stop(text).split())
        return Counter(re.findall(r'\w+', stemmed.lower()))

    # Build each Counter once instead of rebuilding it for every term.
    q_counts = term_counts(q)
    sq_counts = term_counts(sq)

    shared_terms = q_counts.keys() & sq_counts.keys()
    dot_product = sum(q_counts[term] * sq_counts[term] for term in shared_terms)
    q_norm = math.sqrt(sum(count ** 2 for count in q_counts.values()))
    sq_norm = math.sqrt(sum(count ** 2 for count in sq_counts.values()))

    denominator = q_norm * sq_norm
    if denominator == 0:
        # One of the vectors is empty, so there is nothing to be similar to.
        return 0.0
    return dot_product / denominator

Load the files and dump them into Elasticsearch

In [5]:
# Load every article found in the 2013/ and 2014/ directories.
alltext = []

for folder in ('2013', '2014'):
    # os.listdir returns names in arbitrary order; sorting makes the document
    # ids assigned below reproducible from one run to the next.
    for filename in sorted(os.listdir(folder)):
        # The context manager closes each file handle as soon as it is read.
        with open(os.path.join(folder, filename), 'rb') as handle:
            alltext.append(handle.read().decode('ISO-8859-1'))

# Set up Elasticsearch; a node must already be running at localhost:9200.
es = Elasticsearch()

# NOTE(review): datestart is not referenced anywhere else in the visible notebook.
datestart = datetime(2013,1,1)

# One bulk action per article; the position in alltext is used as the document id.
docs = [
    {
        '_index': "article",
        '_type': "article",
        '_id': i,
        'content': text,
    }
    for i, text in enumerate(alltext)
]
helpers.bulk(es, docs)

(730, [])

Across the three questions, we use the same logic to process the question: find the keywords and extract all sentences that contain them. We then use a regular expression to pull out the candidate words and a Counter to find the one that appears most frequently.

In [6]:
def answer1(q):
    """Answer a "Which company went bankrupt in <month> <year>?" question.

    The question must contain a four-digit year and a capitalised English
    month name. Articles matching bankruptcy-related terms, the month and the
    year are fetched from the ``article`` index (at most 200 hits). Sentences
    that mention "bankrupt" and the year are scanned for capitalised phrases,
    and the most frequent phrase that is not a stop word, a bankruptcy term or
    a month name is returned.

    Parameters
    ----------
    q : str
        The user's question.

    Returns
    -------
    str
        The most frequently mentioned candidate name, or an apology message
        when the year or month is missing from the question or when no
        candidate name is found.
    """
    monthdict = {'January':1, 'February':2, 'March':3, 'April':4,
                'May':5, 'June':6, 'July':7, 'August':8,
                'September':9 , 'October':10 , 'November':11 , 'December':12}

    # Validate the cheap, regex-based year first so that a question without a
    # year is rejected before any tokenisation or search work is done.
    years = re.findall(r'\d{4}', q)
    if not years:
        return "Sorry, cannot find year"
    year = years[0]

    tokens = nltk.word_tokenize(q)
    months = [name for name in monthdict if name in tokens]
    if not months:
        # Previously this case crashed with an IndexError.
        return "Sorry, cannot find month"
    month = months[0]

    qc = query.Q("query_string", query = "bankrupt bankruptcy liquidate declare")
    qm = query.Q("query_string", query =  month)
    qy = query.Q("query_string", query =  year)

    # Adding queries combines them so that all three must match.
    q1 =  qc + qm + qy
    search = Search(using = es, index = "article")
    search = search.query(q1)
    search = search[:200]
    result = search.execute()

    # Keep only sentences that talk about a bankruptcy in the requested year.
    sents = []
    for hit in result.hits:
        sents.extend(
            sentence
            for sentence in nltk.tokenize.sent_tokenize(hit.content)
            if "bankrupt" in sentence.lower() and year in sentence
        )

    # Capitalised phrases are the candidate company names.
    STOP1 = ["chapter", "bankrupt", "bankruptcy", "liquidated"]
    namelist = []
    for sentence in sents:
        for candidate in re.findall(r" (?:[A-Z]+[A-Za-z0-9']+ ?)+", sentence):
            candidate = candidate.strip()
            lowered = candidate.lower()
            if lowered not in STOP and lowered not in STOP1 and candidate not in monthdict:
                namelist.append(candidate)

    if not namelist:
        # Previously an empty candidate list crashed with an IndexError.
        return "Sorry, cannot find the company."
    return Counter(namelist).most_common(1)[0][0]

In [7]:
    # Exploratory step for "What affects GDP?": fetch up to 50 articles that
    # mention GDP together with "affect"/"effect", then list the 100 unigrams
    # and bigrams with the largest summed TF-IDF weight.
    q2 = query.Q("query_string", query = "GDP") + query.Q("query_string", query = "affect effect")
    search = Search(using = es, index = "article")
    search = search.query(q2)
    search = search[:50]
    result = search.execute()
    articles = [hit.content for hit in result.hits]
    # Recent scikit-learn releases validate stop_words and accept 'english',
    # a list or None, so the STOP set is passed as a list.
    algo = TfidfVectorizer(ngram_range = (1,2), stop_words = list(STOP))
    applied = algo.fit_transform(articles)
    # Column sums give one aggregate weight per term across all articles.
    freqs = np.sum(applied, axis = 0)
    index = np.argsort(freqs)[:,-100:]
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    feature_names = getattr(algo, "get_feature_names_out", None) or algo.get_feature_names
    words = np.array(feature_names())[index]
    words
    

array([['however', 'stocks', 'fund', 'around', 'banks', 'three',
        'likely', 'funds', 'think', 'global', 'recent', 'higher', 'next',
        'way', 'expected', 'says', 'today', 'low', 'day', 'federal',
        'going', 'risk', 'big', 'quarter', 'tax', '2013', 'debt',
        'interest', 'sales', 'still', '2014', 'make', 'companies',
        'capital', 'see', 'report', 'amp', 'month', 'back', 'week',
        'term', 'well', 'inflation', 'price', '10', 'policy', 'get',
        'investment', 'many', 'long', '000', 'high', 'much', 'according',
        'fed', 'china', 'oil', 'rates', 'money', 'reuters', 'stock',
        'investors', 'data', 'two', 'world', 'even', 'economy',
        'markets', 'government', 'prices', 'first', 'business', 'rate',
        'million', 'us', 'bank', 'economic', 'like', 'since', 'may',
        'financial', 'company', 'could', 'people', 'free', 'billion',
        'last', 'years', 'time', 'growth', 'percent', 'also',
        'free appdownload', 'appdownload',

Here the best way to find the factors affecting GDP is to pick them out of this list of words by hand, as that is more efficient than expanding the stop-word set. I also tried keeping only "NN" and "NNS" tokens, but that turned out not to be very effective. Thus, the answer to "What affects GDP?" is compiled manually.

In [8]:
def answer2():
    """Return the hand-picked list of factors that affect GDP.

    The list is fixed and takes no input; a new list object is built on
    every call, so callers may modify the result freely.
    """
    factors = (
        "spending",
        "stock",
        "tax",
        "debt",
        "interest",
        "inflation",
        "policy",
        "investor",
        "market",
        "price",
        "growth",
    )
    return list(factors)

In [10]:
def answer3(q):
    """Answer a "what percentage ... is associated with <factor>?" question.

    The first factor from ``answer2()`` that occurs in the lower-cased
    question is used as the keyword. Up to 50 articles matching both "GDP"
    and the keyword are fetched, sentences mentioning both are scanned for
    percentage expressions, and the most frequent expression is returned.

    Parameters
    ----------
    q : str
        The user's question.

    Returns
    -------
    str
        The most common percentage expression (number followed by ``%``,
        `` percent`` or `` percentage point``), or an apology message when the
        question names no known factor or no percentage is found.
    """
    lowered_question = q.lower()
    keyword = [factor for factor in answer2() if factor in lowered_question]
    if not keyword:
        return "Sorry, I don't know about that."
    key = keyword[0]

    q3 = query.Q("query_string", query = "GDP") + query.Q("query_string", query = key)
    search = Search(using = es, index = "article")
    search = search.query(q3)
    search = search[:50]
    result = search.execute()

    sents = []
    for hit in result.hits:
        sents.extend(
            sentence
            for sentence in nltk.tokenize.sent_tokenize(hit.content)
            # key is lower-case, so compare it against the lower-cased sentence.
            if "GDP" in sentence and key in sentence.lower()
        )

    # Group 1 is the number (digits with an optional decimal part, or a
    # spelled-out number up to ten); group 2 is the unit. The previous pattern
    # used a malformed character class and silently dropped the decimal part,
    # so "1.5%" was reported as "1%".
    pattern = (
        r'\b([0-9]+(?:\.[0-9]+)?|one|two|three|four|five|six|seven|eight|nine|ten)'
        r'(%| percentage point| percent)s?'
    )
    namelist = []
    for sentence in sents:
        namelist.extend(number + unit for number, unit in re.findall(pattern, sentence))

    if not namelist:
        # Previously an empty candidate list crashed with an IndexError.
        return "Sorry, I don't know about that."
    return Counter(namelist).most_common(1)[0][0]
  

In [11]:
def answer4(q):
    """Answer a "Who is the CEO of <company>?" question.

    The last capitalised phrase in the question is taken as the company name.
    Up to 50 articles matching "CEO" and the company are fetched, sentences
    mentioning both are scanned for capitalised phrases, and the most frequent
    phrase (with the tokens "CEO" and the company name removed) is returned.

    Parameters
    ----------
    q : str
        The user's question.

    Returns
    -------
    str
        The candidate CEO name, or an apology message when the question names
        no company or no candidate name is found.
    """
    not_found = "Sorry, I can't find the CEO of this company."
    name_pattern = r" (?:[A-Z]+[A-Za-z0-9']+ ?)+"

    mentions = re.findall(name_pattern, q)
    if not mentions:
        # No capitalised phrase at all; previously this raised an IndexError.
        return not_found
    # strip() rather than lstrip(): the pattern may capture a trailing space,
    # which would make the substring checks below fail.
    comname = mentions[-1].strip()
    if comname == "CEO":
        # The only capitalised phrase is "CEO" itself, so no company was named.
        return not_found

    q3 = query.Q("query_string", query = "CEO") + query.Q("query_string", query = comname)
    search = Search(using = es, index = "article")
    search = search.query(q3)
    search = search[:50]
    result = search.execute()

    sents = []
    for hit in result.hits:
        sents.extend(
            sentence
            for sentence in nltk.tokenize.sent_tokenize(hit.content)
            if "CEO" in sentence and comname in sentence
        )

    namelist = []
    for sentence in sents:
        for candidate in re.findall(name_pattern, sentence):
            candidate = candidate.strip()
            # STOP holds lower-case words, so compare the lower-cased candidate.
            if candidate.lower() not in STOP and candidate != "CEO" and candidate != comname:
                namelist.append(candidate)

    if not namelist:
        return not_found
    common = Counter(namelist).most_common(1)[0][0]

    # Drop the title and the company name from phrases such as "<company> CEO <name>".
    common_token = nltk.tokenize.word_tokenize(common)
    answer = " ".join(token for token in common_token if token != "CEO" and token != comname)
    # An empty result means the phrase held nothing but the title/company.
    return answer or not_found


I set the cutoff value of the cosine similarity to 0.42 by trial and error. This prevents the system from answering nonsense questions.

In [12]:
def answer_question(q, threshold=0.42):
    """Route a question to the handler of the most similar template.

    The question is scored against the templates ``p1``..``p4`` with
    ``similarity``. If the best score is below ``threshold`` the question is
    rejected; otherwise the handler paired with the best-scoring template is
    called. On a tie the earlier template wins.

    Parameters
    ----------
    q : str
        The user's question.
    threshold : float, optional
        Minimum similarity required to attempt an answer. Defaults to 0.42.

    Returns
    -------
    str or list
        Whatever the selected handler returns (``answer2`` returns a list,
        the others return a string), or a refusal message when no template is
        similar enough.
    """
    # Template/handler pairs in priority order.
    handlers = [
        (p1, lambda: answer1(q)),
        (p2, answer2),
        (p3, lambda: answer3(q)),
        (p4, lambda: answer4(q)),
    ]
    scores = [similarity(q, template) for template, _ in handlers]

    # max() returns the first index holding the maximum, which keeps the
    # p1 > p2 > p3 > p4 precedence on ties.
    best = max(range(len(scores)), key=scores.__getitem__)
    if scores[best] < threshold:
        return "I can't answer this question."
    return handlers[best][1]()

In [13]:
# Bankruptcy question that names both a month and a four-digit year.
answer_question("Which company went bankrupt in September 2008?")

'Lehman Brothers'

In [14]:
# Bankruptcy question worded with "declared" where template p1 says "went".
answer_question("Which company declared bankrupt in October 2009?")

'Chrysler'

In [15]:
# Bankruptcy question whose year matches one of the loaded directory names (2013).
answer_question("Which company went bankrupt in February 2013?")

'MF Global'

In [16]:
# Question text is identical to template p2.
answer_question("What affects GDP?")

['spending',
 'stock',
 'tax',
 'debt',
 'interest',
 'inflation',
 'policy',
 'investor',
 'market',
 'price',
 'growth']

In [17]:
# Percentage question that mentions GDP and the factor "tax".
answer_question("By what percentage is GDP associated with tax?")

'1%'

In [18]:
# Percentage question that names the factor "inflation" without mentioning GDP.
answer_question("What percentage of increase or drop is related to inflation?")

'3%'

In [19]:
# Question about the factor "spending" that does not contain the word "percentage".
answer_question("By how much is GDP associated with the increase and drop of spending?")

'4%'

In [20]:
# CEO question for a single-word company name.
answer_question("Who is the CEO of Amazon?")

'Jeff Bezos'

In [21]:
# CEO question for a single-word company name.
answer_question("Who is the CEO of Tesla?")

'Elon Musk'

In [22]:
# CEO question for a single-word company name.
answer_question("Who is the CEO of Facebook?")

'Mark Zuckerberg'