In [1]:
import re
import glob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
import itertools
import pandas as pd
import numpy as np

### Labeled Company Processing

In [2]:
# load labeled dataset
company_list = []
with open('all/companies.csv', 'r', encoding='utf-8') as f:
    line = f.readline()
    while line:
        # stip dots since it will confuse sent_tokenizer 
        company_list.append(line.strip().strip("."))
        line = f.readline()
company_list[:5]

['Abaxis Inc',
 'ACA Financial',
 'Alibaba Group Holding Ltd',
 'American Bell Telephone Co',
 'American Express Co']

### All articles extraction

In [3]:
# load all files
def get_all_files(dir):
    files = glob.glob(dir + '/*')
    return files
files = get_all_files('2013') + get_all_files('2014')

In [4]:
# set stop words,remove them in corpus
stop = set(stopwords.words('english'))

I found there are only two company [20th Centry Fox, 2100 Xeno] which has number in it.
Instead of introducing number to drastically complex the computation,
I'll hard coded the company name to process.

In [5]:
# load all articles
map_name_to_sent = dict()
articles = []
special_companies = ["AT&T", "McKinsey & Company", "Procter & Gamble", "Hilton & Hyland","20th Century Fox", "2100 Xeno"]
for i, file in enumerate(files):
    with open(file, 'r', encoding = 'latin-1') as f:
        for article in f:
            sentences = sent_tokenize(article)
            # remove number, special chars, white spaces and stop words
            for sent in sentences:
                for sc in special_companies:
                    if sc in sent:
                        map_name_to_sent[sc] = ' '.join([j for j in sent.split() if j.lower() not in stop])
            sentences = [re.sub(r"[^A-Za-z-]", " ", sent) for sent in sentences]
            sentences = [sent.strip() for sent in sentences]
            sentences = [re.sub(r" +"," ", sent) for sent in sentences]
            
            sentences = [' '.join([j for j in sent.split() if j.lower() not in stop]) for sent in sentences]
            articles.append(sentences)
    if i % 100 == 0:
        print(f"reading {i} files already")
print(f'total files: {len(files)}')
print(f'total articles: {len(articles)}')

reading 0 files already
reading 100 files already
reading 200 files already
reading 300 files already
reading 400 files already
reading 500 files already
reading 600 files already
reading 700 files already
total files: 730
total articles: 35898


In [8]:
map_name_to_sent # print special company names that can be took care later

{'20th Century Fox': 'Courtesy Paramount Pictures 20th Century FoxElon Musk made another appearance "The Colbert Report" last night asked host Tesla recently given away patents, Musk replied alarming analogy.',
 'McKinsey & Company': 'Reuters/Mike SegarDominic Barton, managing director McKinsey & CompanyMcKinsey prestigious consultancy world.',
 '2100 Xeno': 'Jay Feuerstein CEO ad Chief Investment Officer 2100 Xenon, investment firm based inÂ Chicago.',
 'AT&T': "Thomson ReutersThe AT&T logo pictured store Carlsbad, CaliforniaWASHINGTON (Reuters) - Telecommunications giant AT&amp;T Inc agreed pay $105 million settle allegations put unauthorized charges customers' cell phone bills, practice known cramming, Politico reported Wednesday.",
 'Hilton & Hyland': 'Hilton & HylandThe 90-year-old CEO Dole Food Company David Murdock selling gorgeous estate overlooking Los Angeles $30 million, according celebrity real estate blog Real Estalker.'}

In [9]:
articles[:5]

[['Earlier today strong South Korean PMI report',
  'latest',
  'Taiwan',
  'saw rise December PMI',
  'report'],
 ['House prepared vote Senate fiscal cliff bill orchestrated White House Senate Minority Leader Mitch McConnell conservatives railing GOP House Speaker John Boehner caving deal banner leading Drudge Report right Drudge Report'],
 ['Good news global economy',
  'South Korea -- whose heavy reliance global trade -- seen economists canary coalmine came strong PMI report',
  'report HSBC South Korea Purchasing Managers IndexTM PMI composite indicator designed provide single-figure snapshot health manufacturing sector registered December',
  'improvement November highest reading since May',
  'However barely PMI suggested operating conditions little changed since previous month',
  'Following six months continuous decline new order volumes increased latest survey period',
  'However rate growth slight respondents commenting underlying demand conditions remained fragile economic c

### Generate Train and Test dataset

In [10]:
# split train and test dataset
train_set = articles[:int(0.6 * len(articles))]
test_set = articles[int(0.6 * len(articles)):]

In [11]:
print(f'train_set size = {len(train_set)}, test_set size = {len(test_set)}')

train_set size = 21538, test_set size = 14360


### Feature Selection


I'm going to select following features:
- "keywords" 

    Company has keywords that we can tell from the articles, the keywords are:
['Company', 'Inc', 'Corporation', 'Group', 'Co', 'Ltd', 'Management', 'Corp']. If we found those keywords appear in or near the company name, we set the feature to 1, else 0.

- "length_of_name" indicates the length of company name
- "name_index" indicates the index of the first occurence of company name.

In [12]:
def name_in_sent(name, sent):
    return ((" " + name + " " in sent) or (name + " " == sent[:len(name)+1]) or (" " + name == sent[-len(name)+1:]))

In [13]:
keywords = ['Company', 'Inc', 'Corporation', 'Group', 'Co', 'Ltd', 'Management', 'Corp']

In [14]:
def generate_feature_df(dataset):
    df = []
    for i, article in enumerate(dataset):
        if i % 1000 == 0: 
            print(f'Currently prcessing {i} article')
        # Get all potential names appeared in this article
        names = [re.findall(r"([A-Z][\w-]+(\s+[A-Z][\w-]*)+)", sent) for sent in article]
        # flatten the list
        names = itertools.chain(*names)
        
        for name in names:
            name = name[0]
            feature = dict()
            for sentence in article:
                if not name_in_sent(name, sentence):
                    continue
                words = sentence.split(" ")
                name_list = name.split(" ")
            
                feature["name"] = name
                feature["length_of_name"] = len(name)
                
                for k in keywords:
                    if k in words:
                        feature['keyword'] = 1
                    else:
                        feature['keyword'] = 0
                
                feature["name_index"] = words.index(name_list[0])
                feature['is_company'] = int(name in company_list)
                df.append(feature)
    return pd.DataFrame(df)

In [15]:
df_train = generate_feature_df(train_set)

Currently prcessing 0 article
Currently prcessing 1000 article
Currently prcessing 2000 article
Currently prcessing 3000 article
Currently prcessing 4000 article
Currently prcessing 5000 article
Currently prcessing 6000 article
Currently prcessing 7000 article
Currently prcessing 8000 article
Currently prcessing 9000 article
Currently prcessing 10000 article
Currently prcessing 11000 article
Currently prcessing 12000 article
Currently prcessing 13000 article
Currently prcessing 14000 article
Currently prcessing 15000 article
Currently prcessing 16000 article
Currently prcessing 17000 article
Currently prcessing 18000 article
Currently prcessing 19000 article
Currently prcessing 20000 article
Currently prcessing 21000 article


In [16]:
df_train['is_company'].value_counts()

0    273668
1     22271
Name: is_company, dtype: int64

### Using data to fit Classification model

Since positive and negative have huge number difference, do resampling

In [17]:
import sklearn
from sklearn import metrics
from sklearn.utils import resample

In [18]:
df_positive = df_train[df_train.is_company==1]
df_negative = df_train[df_train.is_company==0]
resample_class = (len(df_positive) + len(df_negative)) // 2
df_negative = resample(df_negative, replace = False, n_samples = resample_class)
df_positive = resample(df_positive, replace = True, n_samples = resample_class)

df_train_over = pd.concat([df_negative, df_positive])

In [19]:
df_train_over["is_company"].value_counts()

1    147969
0    147969
Name: is_company, dtype: int64

#### Buid test set

In [20]:
df_test = generate_feature_df(test_set)

Currently prcessing 0 article
Currently prcessing 1000 article
Currently prcessing 2000 article
Currently prcessing 3000 article
Currently prcessing 4000 article
Currently prcessing 5000 article
Currently prcessing 6000 article
Currently prcessing 7000 article
Currently prcessing 8000 article
Currently prcessing 9000 article
Currently prcessing 10000 article
Currently prcessing 11000 article
Currently prcessing 12000 article
Currently prcessing 13000 article
Currently prcessing 14000 article


#### Function to get performance

In [21]:
def get_metrics(truth, predicted):
    
    
    confusion_matrix = metrics.confusion_matrix(truth, predicted)
    accuracy = metrics.accuracy_score(truth, predicted)
    precision = metrics.precision_score(truth, predicted)
    recall = metrics.recall_score(truth, predicted)
    F1 = metrics.f1_score(truth, predicted)
    
    print(f"Confusion Matrix:\n {confusion_matrix}\n")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {F1}")
    
    return

#### RandomForestClassification model

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
RFC = RandomForestClassifier(n_estimators=20)
use_feature = [i for i in df_train_over.columns if i not in ("is_company", "name")]
RFC.fit(df_train_over[use_feature], df_train_over["is_company"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
predict = RFC.predict(df_test[use_feature])
get_metrics(df_test.is_company, predict)

Confusion Matrix:
 [[146799  60755]
 [ 12038  12928]]

Accuracy: 0.6869387579563049
Precision: 0.17545431103510986
Recall: 0.5178242409677161
F1 Score: 0.26210098429786416


#### XGBoost Classification model

In [25]:
import xgboost
from xgboost import XGBClassifier

In [26]:
xgb = XGBClassifier()
xgb.fit(df_train_over[use_feature], df_train_over["is_company"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [27]:
predict = xgb.predict(df_test[use_feature])
get_metrics(df_test.is_company, predict)

Confusion Matrix:
 [[147015  60539]
 [ 12747  12219]]

Accuracy: 0.6848185102356786
Precision: 0.16794029522526732
Recall: 0.4894256188416246
F1 Score: 0.2500716303057591


### Output result

In [28]:
predict_train = RFC.predict(df_train_over[use_feature])
comp_train = list(df_train_over.iloc[np.where(predict_train == 1)].name)

In [29]:
predict_test = RFC.predict(df_test[use_feature])
comp_test = list(df_train_over.iloc[np.where(predict_test == 1)].name)

In [30]:
with open("company_extracted.csv",'w') as f:
    for company in comp_test + comp_train:
        f.write(company + ",\n")