In [1]:
import re
import glob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
import itertools
import pandas as pd
import numpy as np

### Labeled CEO Processing

In [2]:
# load labeled dataset
ceo_list = []
with open('all/ceo.csv','r',encoding='utf-8') as f:
    line = f.readline()
    while line:
        name = line.strip().split(",")
        if name[1] == '':
            # means either ["tim cook"] or ["tim"], simply return name[0]
            ceo_list.append(name[0].strip())
        else:
            # output would be ["tim","cook"]
            ceo_list.append(name[0] + ' ' + name[1])
        line = f.readline()
ceo_list[:5]

['Tom Horton', 'Patti Hart', 'Jamie Dimon', 'Steve Cohen', 'Tim Cook']

### All articles extraction

In [3]:
# load all files
def get_all_files(dir):
    files = glob.glob(dir + '/*')
    return files
files = get_all_files('2013') + get_all_files('2014')

In [4]:
# set stop words,remove them in corpus
stop = set(stopwords.words('english'))

### Remove Reduntant Words/whitespaces/special chars

In [8]:
# load all articles
articles = []
for i, file in enumerate(files):
    with open(file, 'r', encoding = 'latin-1') as f:
        for article in f:
            sentences = sent_tokenize(article)
            
            # remove number, special chars, white spaces and stop words
            sentences = [re.sub(r"[^A-Za-z ]", " ", sent) for sent in sentences]
            sentences = [sent.strip() for sent in sentences]
            sentences = [re.sub(r" +"," ", sent) for sent in sentences]
            
            sentences = [' '.join([j for j in sent.split() if j.lower() not in stop]) for sent in sentences]
            articles.append(sentences)
    if i % 100 == 0:
        print(f"reading {i} files already")
print(f'total files: {len(files)}')
print(f'total articles: {len(articles)}')

reading 0 files already
reading 100 files already
reading 200 files already
reading 300 files already
reading 400 files already
reading 500 files already
reading 600 files already
reading 700 files already
total files: 730
total articles: 35898


In [9]:
articles[3]

['UPDATE Fiscal Cliff bill gets closer likely passage Aussie market one markets trading right highs day',
 'Hong Kong',
 'EARLIER markets open Japan China Holiday futures still closed one open Australia',
 'far Washington theatrics problem']

### Generate Train and Test dataset

In [10]:
# split train and test dataset
train_set = articles[:int(0.6 * len(articles))]
test_set = articles[int(0.6 * len(articles)):]

In [11]:
print(f'train_set size = {len(train_set)}, test_set size = {len(test_set)}')

train_set size = 21538, test_set size = 14360


### Feature Selection

I'm going to select following features:
- first_name_length # the length of first name
- last_name_length # the length of last name
- contains_ceo # if the sentence contains 'CEO' or not
- name_index # the index of the first name char occurs in the sentence


In [12]:
def name_in_sent(name, sent):
    return ((" " + name + " " in sent) or (name + " " == sent[:len(name)+1]) or (" " + name == sent[-len(name)+1:]))

find potential names using **regex**: r'[A-Z][a-z]+ [A-Z][a-z]+'

In [13]:
def generate_feature_df(dataset):
    df = []
    for i, article in enumerate(dataset):
        if i % 1000 == 0: 
            print(f'Currently prcessing {i} article')
        # Get all potential names appeared in this article
        names = [re.findall(r'[A-Z][a-z]+ [A-Z][a-z]+', sent) for sent in article]
        # flatten the list
        names = itertools.chain(*names)
        
        for name in names:
            feature = dict()
            for sentence in article:
                if not name_in_sent(name, sentence):
                    continue
                tk = TweetTokenizer()
                words = tk.tokenize(sentence)
                words_pos = nltk.pos_tag(words)
                first_name, last_name = name.split(" ")
                # use pos tag to remove names that are not 'NNP'
                pos_first = words_pos[words.index(first_name)][1]
                pos_last = words_pos[words.index(last_name)][1]
                if (pos_first != 'NNP' or pos_last != 'NNP'):
                    continue
                feature["name"] = name
                
                feature["first_name_length"] = len(first_name)
                feature["last_name_length"] = len(last_name)
                
                if 'CEO' in words:
                    feature["contains_ceo"] = 1
                else:
                    feature["contains_ceo"] = 0
                feature["name_index"] = words.index(first_name)
                feature['is_ceo'] = int(name in ceo_list)
                df.append(feature)
    return pd.DataFrame(df)

In [14]:
df_train = generate_feature_df(train_set)

Currently prcessing 0 article
Currently prcessing 1000 article
Currently prcessing 2000 article
Currently prcessing 3000 article
Currently prcessing 4000 article
Currently prcessing 5000 article
Currently prcessing 6000 article
Currently prcessing 7000 article
Currently prcessing 8000 article
Currently prcessing 9000 article
Currently prcessing 10000 article
Currently prcessing 11000 article
Currently prcessing 12000 article
Currently prcessing 13000 article
Currently prcessing 14000 article
Currently prcessing 15000 article
Currently prcessing 16000 article
Currently prcessing 17000 article
Currently prcessing 18000 article
Currently prcessing 19000 article
Currently prcessing 20000 article
Currently prcessing 21000 article


In [15]:
df_train['is_ceo'].value_counts()

0    341751
1     28953
Name: is_ceo, dtype: int64

### Using data to fit Classification model

Since positive and negative have huge number difference, do a resampling

In [16]:
import sklearn
from sklearn import metrics
from sklearn.utils import resample

In [17]:
df_positive = df_train[df_train.is_ceo==1]
df_negative = df_train[df_train.is_ceo==0]
resample_class = (len(df_positive) + len(df_negative)) // 2
df_negative = resample(df_negative, replace = False, n_samples = resample_class)
df_positive = resample(df_positive, replace = True, n_samples = resample_class)

df_train_over = pd.concat([df_negative, df_positive])

In [18]:
df_train_over['is_ceo'].value_counts()

1    185352
0    185352
Name: is_ceo, dtype: int64

#### Build test set 

In [19]:
df_test = generate_feature_df(test_set)

Currently prcessing 0 article
Currently prcessing 1000 article
Currently prcessing 2000 article
Currently prcessing 3000 article
Currently prcessing 4000 article
Currently prcessing 5000 article
Currently prcessing 6000 article
Currently prcessing 7000 article
Currently prcessing 8000 article
Currently prcessing 9000 article
Currently prcessing 10000 article
Currently prcessing 11000 article
Currently prcessing 12000 article
Currently prcessing 13000 article
Currently prcessing 14000 article


#### Function to get performance

In [20]:
def get_metrics(truth, predicted):
    
    
    confusion_matrix = metrics.confusion_matrix(truth, predicted)
    accuracy = metrics.accuracy_score(truth, predicted)
    precision = metrics.precision_score(truth, predicted)
    recall = metrics.recall_score(truth, predicted)
    F1 = metrics.f1_score(truth, predicted)
    
    print(f"Confusion Matrix:\n {confusion_matrix}\n")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {F1}")
    
    return

#### RandomForestClassification model

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
RFC = RandomForestClassifier(n_estimators=20)
use_feature = [i for i in df_train_over.columns if i not in ("is_ceo", "name")]
RFC.fit(df_train_over[use_feature], df_train_over["is_ceo"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
predict = RFC.predict(df_test[use_feature])
get_metrics(df_test.is_ceo, predict)

Confusion Matrix:
 [[249443  38148]
 [  5996   7525]]

Accuracy: 0.8533967427402428
Precision: 0.16475817222429007
Recall: 0.556541675911545
F1 Score: 0.25424874142649595


#### XGBoost Classification model

In [24]:
import xgboost
from xgboost import XGBClassifier

In [25]:
xgb = XGBClassifier()
xgb.fit(df_train_over[use_feature], df_train_over["is_ceo"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
predict = xgb.predict(df_test[use_feature])
get_metrics(df_test.is_ceo, predict)

Confusion Matrix:
 [[252067  35524]
 [  5860   7661]]

Accuracy: 0.8625627673423842
Precision: 0.17739956003241866
Recall: 0.5666001035426373
F1 Score: 0.27020068423094556


#### Logistic Regression Classification Model

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
clf = LogisticRegression(solver='lbfgs')
clf.fit(df_train_over[use_feature], df_train_over["is_ceo"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
predict = clf.predict(df_test[use_feature])
get_metrics(df_test.is_ceo, predict)

Confusion Matrix:
 [[184797 102794]
 [  5089   8432]]

Accuracy: 0.6417180318286884
Precision: 0.07580961286030245
Recall: 0.6236225131277272
F1 Score: 0.13518561568614879


#### Naive Bayes Classification Model

In [30]:
from sklearn.naive_bayes import GaussianNB

In [31]:
gnb = GaussianNB()
gnb.fit(df_train_over[use_feature], df_train_over["is_ceo"])

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
predict = gnb.predict(df_test[use_feature])
get_metrics(df_test.is_ceo, predict)

Confusion Matrix:
 [[204862  82729]
 [  3959   9562]]

Accuracy: 0.7121071229310024
Precision: 0.10360706894496755
Recall: 0.707196213297833
F1 Score: 0.18073564435035724


Based on before model selection result, we can tell that XGBoost has the top performance

### Output result

In [33]:
predict_train = xgb.predict(df_train_over[use_feature])
ceo_train = list(df_train_over.iloc[np.where(predict_train == 1)].name)

In [34]:
predict_test = xgb.predict(df_test[use_feature])
ceo_test = list(df_train_over.iloc[np.where(predict_test == 1)].name)

In [35]:
with open("ceo_extracted.csv",'w') as f:
    for ceo in ceo_test + ceo_train:
        f.write(ceo + ",\n")