# IEMS 308 HW3
## Author: Taige Hong

In [13]:
import os
import re
import nltk
import pandas as pd
import numpy as np

from nltk.corpus import stopwords, words
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from sklearn.metrics import confusion_matrix

## Cleaning training data

The raw data we have for the samples of each category is messy and needs to be cleaned. We want to put them into the same format so we can easily detect them in the article.

In [4]:
ceo_names = []
with open("all/ceo.csv", encoding = "ISO-8859-1") as f:
    ceo_names = f.readlines()

We put them in the form of Firstname Lastname, and remove the empty strings

In [5]:
def cleanceo(text):
    text = text.strip()
    text = text.rstrip()
    textlist = text.split(",")
    textlist = list(filter(None, textlist))
    
    if len(textlist) == 1:
        return textlist[0]
    else:
        return " ".join(textlist)


In [6]:
ceo_names = map(cleanceo, ceo_names)
ceo_names = list(set(ceo_names))

In [7]:
company_names = []
with open("all/companies.csv") as f:
    company_names = f.readlines()

We remove the extra space and period from the names

In [8]:
company_names = map(lambda x: x.rstrip(), company_names)
company_names = company_names.replace(".", "")
company_names = list(set(company_names))

We put all the text into a single string:

In [14]:
alltext = []

for file in os.listdir('2013/'):
    alltext.append(open(os.path.join('2013', file), 'rb').read().decode('ISO-8859-1'))
for file in os.listdir('2014/'):
    alltext.append(open(os.path.join('2014', file), 'rb').read().decode('ISO-8859-1'))

TrainTest splitting:

In [15]:
round(len(alltext)*0.7)

511

In [16]:
traintext = alltext[0:round((len(alltext)*0.7))]
testtext = alltext[round((len(alltext)*0.7)):]

## CEO

First we want to find all the pairs with capital letter as the first:

In [37]:
conseCAP = []
for i in range(len(alltext)):
    conseCAP.append(re.findall(r'([A-Z]\w+) ([A-Z]\w+)',alltext[i]))
print(conseCAP[0])

[('And', 'Zerohedge'), ('Dragon', 'Boat'), ('But', 'Diana'), ('Lombard', 'Street'), ('Ting', 'Lu'), ('China', 'Banking'), ('Regulatory', 'Commission'), ('Charlene', 'Chu'), ('Lehman', 'China'), ('Das', 'Boot'), ('Michael', 'McDonough'), ('Twitter', 'Lu'), ('The', 'New'), ('York', 'Times'), ('Adam', 'Bryant'), ('Senior', 'Vice'), ('People', 'Operations'), ('Laszlo', 'Bock'), ('Chinese', 'Flash'), ('FinViz', 'This'), ('ReutersHSBC', 'China'), ('Flash', 'PMI'), ('The', 'HSBC'), ('China', 'Flash'), ('Manufacturing', 'PMI'), ('Hongbin', 'Qu'), ('Markit', 'Economics'), ('Markit', 'Economics'), ('HSBC', 'China'), ('The', 'National'), ('Statistics', 'PMI'), ('The', 'Nikkei'), ('The', 'FOMC'), ('Ben', 'Bernanke'), ('The', 'Nikkei'), ('Yahoo', 'Finance'), ('Jonanthan', 'ErnstYields'), ('Federal', 'Reserve'), ('Federal', 'Reserve'), ('Chairman', 'Ben'), ('Maury', 'Harris'), ('Bloomberg', 'TelevisionBloomberg'), ('Trish', 'Regan'), ('Bill', 'GrossPIMCO'), ('Bill', 'Gross'), ('Bloomberg', 'Televisi

We need to expand the stop word set for the CEO names as some of the words are unlikely to be names. The words we should focus on are those capped but not names. 

In [38]:
stop_words=sorted(set(stopwords.words("english")))
stopwordceo = ["facebook", "twitter", "yahoo", "instagram", "apple", "samsung", "google", "amazon", "fed", "federal", "government", "republican", "republic", "democratic", "democrat", "president", "national", "social", "paper", "chair", "finance", "chief", "officer", "tax", "business", "financial", "capital", "company", "citi", "jp", "jpmorgan", "index", "corp", "inc", "co", "corporation", "ltd", "bank" "group", "ceo", "nasdaq", "world", "usa","us", "european", "europe", "japan", "china", "chinese", "america", "american", "ny", "nyc", "state", "city", "beijing", "shanghai", "eu", "london", "paris", "asia", "asian", "mexico", "mexican", "russia", "england", "arab", "university", "college", "school", "professor", "prof", "police", "news", "bank", "airline", "south", "north", "east","one", "first", "two", "second", "II", "three", "third", "III", "alpha", "beta", "theta", "kappa", "gamma", "mr", "nba", "cbs", "news", "fbi", "times", "today"] + stop_words
stopwordceo2 = ["hong kong", "white house", "new york", "san francisco", "wall street", "los angeles", "warner bros", "goldman sachs"]
conseCAP = []
for i in range(len(traintext)):
    pairs = re.findall(r'([A-Z]\w+) ([A-Z]\w+)',traintext[i])
    pairs = [t for t in pairs if ((t[0].lower() not in stopwordceo) & (t[1].lower() not in stopwordceo) & ((t[0] + " " + t[1]).lower() not in stopwordceo2))]
    for k in range(len(pairs)):
        conseCAP.append(pairs[k])
conseCAP = list(conseCAP)

In [40]:
insample = [t for t in conseCAP if (t[0] + ' ' + t[1]) in ceo_names]
len(insample)

7719

The postive and negative samples are so imbalanced so we have to ensure that negative sample list has the same length as the positive sample list. Then we removed the duplicate names in 'insample'.

In [41]:
negsample = [t for t in conseCAP if (t[0] + ' ' + t[1]) not in ceo_names]
insampleid = list(set(insample))
len(insampleid)

646

In [17]:
tokenizer = RegexpTokenizer("[A-Za-z]+'?[A-Za-z0-9]?")

The list of features I used includes:

* If __ceo__ is in the sentence
* If __chief, executive, officer__ is in the sentence respectively
* If there are numbers in the sentence
* The length of the sentence(number of words)
* Length of the first name
* Length of the last name
* Proportion of capital letter
* If the previous word has capital letter in the beginning
* If the next word has capital letter in the beginning

In [175]:
def featureceo(sent, CAPS):
#We want to generate a list of features for a word showing up in a sentence. Ideally the features do not describe the
#pair of stop words itself in too much detail, as it will not provide useful information about future prediction. 
#Before running this function we have to make sure CAPS is in sent.
    word = tokenizer.tokenize(sent)
    lwords = map(lambda x : x.lower(), word)
    features = {"words": CAPS}
    features["insample"] = (CAPS in insample) + 0
    features["ceo_in_sent"] = ("ceo" in lwords) + 0
    features["chief_in_sent"] = ("chief" in lwords) + 0
    features["exec_in_sent"] = ("executive" in lwords) + 0
    features["officer_in_sent"] = ("officer" in lwords) + 0
    
    features["num_in_sent"] = (re.search(r'[0-9]+', sent) != None) + 0
    features["length_of_sent"] = len(word)
    features["length_of_first"] = len(CAPS[0])
    features["length_of_last"] = len(CAPS[1])
    
    wordlist = "".join(CAPS)
    features["portion_of_CAP"] = sum(map(lambda x: x.isupper(), wordlist))/len(wordlist)
    
    index = word.index(CAPS[0])
    features["prev_capped"] = ((index != 0) & (word[index-1][0].isupper())) + 0
    features["next_capped"] = ((index != len(word) - 2) & (word[-len(word) + index + 2][0].isupper())) + 0
    
    return features
    

In [18]:
trainsent = " ".join(traintext)
trainsent = sent_tokenize(trainsent)

Here we make sure the length of positive list is about the same as the negative list

In [176]:
poscount = 0
negcount = 0
featuremat = []
for i in range(len(trainsent)):
    twoCAPS = re.findall(r" ([A-Z][A-Za-z0-9']+) ([A-Z][A-Za-z0-9']+)", trainsent[i])
    for j in range(len(twoCAPS)):
        if (twoCAPS[j] in insample):
            poscount = poscount + 1
            featuremat.append(featureceo(trainsent[i], twoCAPS[j]))

for i in range(len(trainsent)):
    twoCAPS = re.findall(r" ([A-Z][A-Za-z0-9']+) ([A-Z][A-Za-z0-9']+)", trainsent[i])
    if (negcount >= len(insample)):
        break
    else:
        for j in range(len(twoCAPS)):
            if (twoCAPS[j] in negsample):
                negcount = negcount + 1
                featuremat.append(featureceo(trainsent[i], twoCAPS[j]))
 

In [92]:
featuremat = pd.DataFrame(featuremat)

We use the logistic regression to predict the label

In [93]:
y_train = featuremat["insample"]
X_train = featuremat.drop(["insample", "words"], axis = 1)
X_train = np.array(X_train)
clf = LogisticRegression().fit(X_train, y_train)



In [94]:
conseCAP = []
for i in range(len(testtext)):
    pairs = re.findall(r'([A-Z]\w+) ([A-Z]\w+)',testtext[i])
    pairs = [t for t in pairs if ((t[0].lower() not in stopwordceo) & (t[1].lower() not in stopwordceo) & ((t[0] + " " + t[1]).lower() not in stopwordceo2))]
    for k in range(len(pairs)):
        conseCAP.append(pairs[k])
conseCAP = list(set(conseCAP))

In [19]:
testsent = " ".join(testtext)
testsent = sent_tokenize(testsent)

We construct the test matrix. To reduce runtime, we perform checkings before actually dumping the pair of word to the function.

In [96]:
testmat = []
def notcontainstopceo(pairs):
    return (any(map(lambda x: x not in stopwordceo, pairs)) & (pairs not in stopwordceo2))

for i in range(len(testsent)):
    twoCAPS = re.findall(r" ([A-Z][A-Za-z0-9']+) ([A-Z][A-Za-z0-9']+)", testsent[i])
    twoCAPS = list(filter(notcontainstopceo, twoCAPS))
    for j in range(len(twoCAPS)):
        if (twoCAPS[j] in conseCAP):
            testmat.append(featureceo(testsent[i], twoCAPS[j]))

Given the fact that the negative samples are about the same as positive ones in the training set, the result here is satisfactory.

In [97]:
testmat = pd.DataFrame(testmat)
X_test = testmat.drop(["insample", "words"], axis = 1)
y_test = testmat["insample"]
clf.score(X_test, y_test)

0.8086323299625578

In [98]:
confusion_matrix(clf.predict(X_test), np.array(y_test))

array([[53653,  1490],
       [11492,  1203]])

We apply the algorithm to the entire data set to detect for CEO names.

In [99]:
def predict_ceo(model, sent, CAPS):
    features = featureceo(sent, CAPS)
    features = pd.DataFrame(features)
    X = features.drop(["insample", "words"], axis = 1)
    X = X.drop(0)
    y = model.predict(X)
    return y[0]

In [20]:
allsent = " ".join(alltext)
allsent = sent_tokenize(allsent)

In [101]:
conseCAP = []
for i in range(len(alltext)):
    pairs = re.findall(r'([A-Z]\w+) ([A-Z]\w+)',alltext[i])
    pairs = [t for t in pairs if ((t[0].lower() not in stopwordceo) & (t[1].lower() not in stopwordceo) & ((t[0] + " " + t[1]).lower() not in stopwordceo2))]
    for k in range(len(pairs)):
        conseCAP.append(pairs[k])
conseCAP = list(set(conseCAP))

In [102]:
allceo = []
def notcontainstopceo(pairs):
    return (any(map(lambda x: x not in stopwordceo, pairs)) & (pairs not in stopwordceo2))

for i in range(len(allsent)):
    twoCAPS = re.findall(r" ([A-Z][A-Za-z0-9']+) ([A-Z][A-Za-z0-9']+)", allsent[i])
    twoCAPS = list(filter(notcontainstopceo, twoCAPS))
    for j in range(len(twoCAPS)):
        if (twoCAPS[j] in conseCAP):
            if (predict_ceo(clf, allsent[i], twoCAPS[j]) == 1):
                allceo.append(twoCAPS[j])

We write the results into the *predicted_ceo.txt* file

In [103]:
allceo = list(set(allceo))
with open("predicted_ceo.txt", "w") as f:
    for ceos in allceo:
        f.write(" ".join(ceos) + "\n")

We can see that the most helpful features are:
* Whether or not **ceo** is in sentence
* Whether or not **executive** is in sentence
* Whether or not **officer** is in sentence

It is interesting to see that the last two features are negatively correlated with the result. It might be that in articles we don't usually see the full title as "chief executive officer". Instead, they are using "ceo" as it is the more professional term in articles.

In [105]:
clf.coef_

array([[ 2.56296836,  0.36121931, -0.84846931, -0.21069949, -0.11119558,
        -0.00449125, -0.75450733, -0.19632693, -0.97918579, -0.03946271]])

In [106]:
featuremat.head(1)

Unnamed: 0,ceo_in_sent,chief_in_sent,exec_in_sent,insample,length_of_first,length_of_last,length_of_sent,next_capped,num_in_sent,officer_in_sent,prev_capped,words
0,0,0,0,1,4,5,31,1,0,0,0,"(Bill, Gross)"


## Company

The first task is to identify the common feature of the company names. We can see the majority of the names start with capital letters. There are only 7 counter-examples. Among them, only 'iSectors LLC' and 'eBay' have lowercase letters at the first position because it is intended, while 'interim', 'twenty-something' and 'the Human Rights Foundation' should be capitalized but was not. '2100 Xenon' and '20th Century Fox' start with numbers. Because the portion is so small within the entire list, we decide not to consider these counter-examples as they demand complicated investigation which could bring more false positives than simply not considering them and have some false negatives.

In [35]:
list(filter(lambda x: (not x[0].istitle()), company_names))

['interim',
 'iSectors LLC',
 '2100 Xenon',
 'eBay',
 'the Human Rights Foundation',
 'twenty-something',
 '20th Century Fox']

In [33]:
company_names

['Charles Schwab',
 'AOL',
 'Netflix Inc',
 'Wisconsin',
 'Delta',
 'Best Paid Banking',
 'Dion Weisler',
 'Pandora Media',
 'Scott Olson Twitter',
 'Dell Inc',
 'Portuguese Bank Espirito Santo',
 'Deloitte',
 'Birch Gold Group',
 'W Corporation',
 'Templeton Investments',
 'Loan Pricing Corp',
 'Shack Corp',
 'Abaxis Inc',
 'Current',
 'Athlon Energy Inc',
 'Telecom Italia',
 'Investments Inc',
 'Metlife',
 'Fiat Chrysler',
 'Spacex',
 'Whole Foods',
 'Bombardiar',
 'Peugeot',
 'Alireza Group',
 'Chatillon',
 'CNBC',
 'Crain Communications',
 'EMC Corp',
 'Missiles Corporation',
 'City Corp',
 'Greenlight Capital',
 'China Huarong Asset Management Co Ltd',
 'US Bancorp',
 'Ally Financial Inc',
 'D.R. Horton',
 'United Continental Holdings Inc',
 'Goldman Sachs',
 'Corelogic',
 'Taxi Club Management',
 'Samsung',
 'Chipotle Mexican Grill Inc',
 'Affiliated Managers Group',
 'Fendi',
 'Snapfish',
 'Virgin Group',
 'Civil Supplies Corporation',
 'Lorillard Inc',
 'NASDAQ',
 'Lehman Broth

Through observation, we can see that most of the names end with 'Inc', 'Group', 'Ltd', 'Co','Corp', 'Corporation'. Another trend in the company name is a series of capital letters.

In [107]:
list(filter(lambda x: len(x) == 2, company_names))

['HP', 'NQ', 'JD', 'GE', 'GM']

We choose not to filter out the stop words as it is common to have them in the company names, such as 'the'. So our candidates are still consecutive words with first letter capitalized.

In [21]:
conseCAP = []
for i in range(len(traintext)):
    pairs = re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+",traintext[i])
    for k in range(len(pairs)):
        conseCAP.append(pairs[k].rstrip().strip())
conseCAP = list(conseCAP)

In [22]:
insamp = [t for t in conseCAP if t in company_names]
insampid = set(insamp)

The features I used to find company name include:
* If **company** is in sentence
* If keywords are in the list of words. (keywords are 'inc', 'group', 'ltd' etc. as a good indicator of company names)
* If there are numbers in the sentence
* Number of words in the sentence
* Number of words of the candidate
* Number of letters of the candidate
* Proportion of capital letters
* If previous word has capital letter in the beginning
* If next word has capital letter in the beginning


In [23]:
def featurecomp(sent, CAPS):
#We want to generate a list of features for a word showing up in a sentence. Before running this function we have to
#make sure CAPS is in sent.
    keywords = {'inc', 'group', 'ltd', 'co', 'corp', 'corporation'}
    #These are the key words to identify a company name, we expect the coef for this variable to be high.
    
    capwords = tokenizer.tokenize(CAPS)
    lcapwords = map(lambda x: x.lower(), capwords)
    word = tokenizer.tokenize(sent)
    lwords = map(lambda x : x.lower(), word)
    features = {"words": CAPS}
    features["insample"] = (CAPS in insamp) + 0
    features["company_in_sent"] = ("company" in lwords) + 0
    features["key_in_sent"] = any(list(map(lambda x: x in keywords, lcapwords))) + 0
    
    features["num_in_sent"] = (re.search(r'[0-9]+', sent) != None) + 0
    features["length_of_sent"] = len(word)
    
    wordlist = "".join(CAPS)
    features["num_of_words"] = len(capwords)
    features["length_of_words"] = len(wordlist)
    features["portion_of_CAP"] = sum(map(lambda x: x.isupper(), wordlist))/len(wordlist)
    
    indexf = word.index(capwords[0])
    indexl = word.index(capwords[-1])
    features["prev_capped"] = ((indexf != 0) & (word[indexf-1][0].isupper())) + 0
    features["next_capped"] = ((indexl != len(word) - 1) & (word[-len(word) + indexl + 1][0].isupper())) + 0
    
    return features
    

Again, we make sure the length of positive sample list is about the same as the length of negative one.

In [24]:
poscount = 0
negcount = 0
featuremat = []
wordbank = words.words()
for i in range(len(trainsent)):
    CAPS = re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+", trainsent[i])
    for j in range(len(CAPS)):
        CAPS[j] = CAPS[j].rstrip()
        CAPS[j] = CAPS[j].strip()
        if (CAPS[j] in insampid):
            poscount = poscount + 1
            featuremat.append(featurecomp(trainsent[i], CAPS[j]))

for i in range(len(trainsent)):
    CAPS = re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+", trainsent[i])
    if (negcount >= len(insamp)):
        break
    else:
        for j in range(len(CAPS)):
            CAPS[j] = CAPS[j].rstrip()
            CAPS[j] = CAPS[j].strip()
            indexCAPS = trainsent[i].index(CAPS[j])
            if (not CAPS[j].lower() in wordbank) & ((indexCAPS == 0) | (trainsent[i][indexCAPS - 1] in {".", " "})) & (not (CAPS[j] in insampid)):
                negcount = negcount + 1
                featuremat.append(featurecomp(trainsent[i], CAPS[j]))
 

In [25]:
featuremat = pd.DataFrame(featuremat)

In [28]:
y_train = featuremat["insample"]
X_train = featuremat.drop(["insample", "words"], axis = 1)
X_train = np.array(X_train)
clf2 = svm.SVC(kernel = "rbf", max_iter = 1000)
clf2.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=1000, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [30]:
conseCAP = []
for i in range(len(testtext)):
    lists = list(set(re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+",testtext[i])))
    for k in range(len(lists)):
        lists[k] = lists[k].rstrip()
        lists[k] = lists[k].strip()
        if (lists[k] not in wordbank):
            conseCAP.append(lists[k])
conseCAP = list(set(conseCAP))

The remaining process is about the same as the CEO names classifying process

In [31]:
testmat = []
for i in range(len(testsent)):
    CAPS = re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+", testsent[i])
    CAPS = list(map(lambda x: x.rstrip().strip(), CAPS))
    CAPS = list(filter(lambda x: x in conseCAP, CAPS))
    for j in range(len(CAPS)):
        testmat.append(featurecomp(testsent[i], CAPS[j]))

In [34]:
testmat = pd.DataFrame(testmat)
X_test = testmat.drop(["insample", "words"], axis = 1)
y_test = testmat["insample"]
clf2.score(X_test, y_test)

0.8678417199518117

In [35]:
confusion_matrix(clf2.predict(X_test), np.array(y_test))

array([[232954,  26057],
       [  9596,   1168]])

In [175]:
cols = ["company_in_sent", "key_in_sent", "length_of_sent", "length_of_words", "next_capped", "num_in_sent", "num_of_words",  "portion_of_CAP", "prev_capped"]

In [176]:
def predict_comp(model, sent, CAPS):
    features = featurecomp(sent, CAPS)
    features = pd.DataFrame(features, index = [0])
    X = features.drop(["insample", "words"], axis = 1)
    X = X[cols]
    y = model.predict(X)
    return y[0]

In [186]:
allcomp = []
for i in range(len(allsent)):
    CAPS = re.findall(r" (?:[A-Z]+[A-Za-z0-9']* ?)+", allsent[i])
    CAPS = list(map(lambda x: x.rstrip().strip(), CAPS))
    CAPS = list(filter(lambda x: x not in allcomp, CAPS))
    for j in range(len(CAPS)):
        if (CAPS[j].lower() not in wordbank):
            if (predict_comp(clf2, allsent[i], CAPS[j]) == 1):
                allcomp.append(CAPS[j])

In [188]:
allcomp = list(set(allcomp))
with open("predicted_comp.txt", "w") as f:
    for comps in allcomp:
        f.write(comps + "\n")

## Percentage

When cleaning all positively labeled data, we found a large portion of it is just pure numbers, which has nothing to do with percentage. Thus, we decide to use regex expression to extract the percenrages.

They may come in the following format:
* XX(.XX)%
* XX(.XX) percent (age point)(s)
* English described percent

In [192]:
pattern = r'(([0-9]+)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten))[.[0-9]*]?(%| percentage point| percent)+s?'

In [195]:
allperc = []
for sent in allsent:
    allperc.extend(re.findall(pattern, sent))

In [196]:
allperc = list(map(lambda x:"".join(x[1:]) , allperc))
allperc = list(set(allperc))
allperc

['26 percent',
 '23%',
 '44%',
 'five percent',
 '34 percent',
 '38 percent',
 '167%',
 '95%',
 '328%',
 '700%',
 '660%',
 '106%',
 '182%',
 '89%',
 '72%',
 '32 percentage point',
 '282%',
 '91%',
 '450%',
 '386 percent',
 '201 percent',
 '213 percent',
 '357%',
 '108 percent',
 '1 percent',
 '05 percent',
 '524%',
 '58 percent',
 '329%',
 '236 percent',
 '55%',
 '660 percent',
 '14%',
 '281%',
 '7 percent',
 '967%',
 '1140%',
 '205 percent',
 '84 percent',
 '462%',
 '289%',
 '130%',
 '190 percent',
 '40 percentage point',
 '53 percent',
 '37 percentage point',
 '114 percent',
 '145 percent',
 '09082014%',
 '37%',
 '4%',
 '39 percentage point',
 '0%',
 '500%',
 '226 percent',
 '61%',
 '362%',
 '231%',
 '385%',
 '600%',
 '233%',
 '260%',
 '163 percent',
 '43 percent',
 '250%',
 '608%',
 '799%',
 '207%',
 '438%',
 'eight percent',
 '340 percent',
 '396%',
 'two percentage point',
 '441%',
 '131 percent',
 '84%',
 '147%',
 '18%',
 '433%',
 '995%',
 'three percentage point',
 '249%',
 '10 

In [197]:
len(allperc)

627

In [198]:
with open("predicted_perc.txt", "w") as f:
    for percs in allperc:
        f.write(percs + "\n")