In [None]:
import nltk
# from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import pprint
import heapq

Much of this comes from this link: https://stackabuse.com/text-summarization-with-nltk-in-python/

In [273]:
raw = ''
participants = []
pp = pprint.PrettyPrinter(indent=2, width=170)
stop_words = nltk.corpus.stopwords.words('english')

with open('HUBG-Q3-Earnings-Transcript.txt') as txt:
    raw = txt.read()

with open('participants.txt') as part:
    for line in part:
        line = line.replace('\n','')
        participants.append(line)

def stripStopwords(sentence):
    words = nltk.wordpunct_tokenize(sentence.lower())
    out = []
    for word in words:
        if not word in stop_words:
           out.append(word)
    return out

In [199]:
tokens = nltk.wordpunct_tokenize(raw)
text = nltk.Text(tokens)
words = [w.lower() for w in text]
vocab = sorted(set(words))
word_tagged = nltk.pos_tag(text)


### Sentiment analysis (skip me for now):
http://www.nltk.org/howto/sentiment.html
Not sure what good this does.

In [21]:
n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

In [29]:
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

In [25]:
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

In [38]:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
len(unigram_feats)
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [39]:
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

In [43]:
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}:{1}'.format(key,value))

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy:0.8
F-measure [obj]:0.8
F-measure [subj]:0.8
Precision [obj]:0.8
Precision [subj]:0.8
Recall [obj]:0.8
Recall [subj]:0.8


### Back to the earnings call...

In [362]:
paras = raw.split("\n\n")
transcript_dict = dict()
last_speaker = ''
for para in paras:
    #If para is the name of one of the participants, add that name as a key to a dictionary, and add a dictionary as its value
    if para in participants:
        transcript_dict.setdefault(para,dict())
        last_speaker = para
    else:
        #Add each subsequent paragraph to that person's own dictionary
        transcript_dict[last_speaker].setdefault(paras.index(para),{'content':para})
transcript_dict.pop('Operator')
# pp.pprint(transcript_dict)

{1: {'content': "Hello, and welcome to the Hub Group Third Quarter 2018 Earnings Conference Call. Dave Yeager, Hub's CEO, Don Maltby, Hub's President and Chief Operating Officer, and Terri Pizzuto are joining me on the call. At this time, all participants are in a listen-only mode. A brief question-and-answer session will follow the formal presentation. In order for everyone to have an opportunity to participate, please limit your inquiries to one primary and one follow-up question."},
 2: {'content': "Any forward-looking statements made during the course of the call or contained in the release represent the company's best good faith judgment as to what may happen in the future. Statements that are forward-looking can be identified by the use of words such as believe, expect, anticipate and project and variations of these words. Please review the cautionary statements in the release. In addition, you should refer to the disclosures in the company's Form 10-K and other SEC filings regar

In [363]:
for participant in transcript_dict:
    for item in transcript_dict[participant]:
        # Split each content string into its component sentences.
        # TODO: Stemming and apostrophe mgmt ('  's  'nt  &c.).  Doing so will boost scores for sentences with "hub's" vice "hub"
        transcript_dict[participant][item].setdefault('sentencized_content',nltk.sent_tokenize(transcript_dict[participant][item]['content'].lower()))
        transcript_dict[participant][item].setdefault('stripped_tokenized_content',stripStopwords(transcript_dict[participant][item]['content']))

In [364]:
participant_word_freq = {}
item_word_freq = {}
overall_word_freq = {}
for participant in transcript_dict:
    participant_word_freq.setdefault(participant,dict())
    for item in transcript_dict[participant]:
        item_word_freq.setdefault(item,dict())
        for word in transcript_dict[participant][item]['stripped_tokenized_content']:
            if word not in item_word_freq[item].keys():
                item_word_freq[item][word] = 1
            else:
                item_word_freq[item][word] += 1
            if word not in participant_word_freq[participant].keys():
                participant_word_freq[participant][word] = 1
            else:
                participant_word_freq[participant][word] += 1
            if word not in overall_word_freq.keys():
                overall_word_freq[word] = 1
            else:
                overall_word_freq[word] += 1
#             print(item_word_freq)
        item_max_freq = max(item_word_freq[item].values())
        for word in item_word_freq[item].keys():
            item_word_freq[item][word] = (item_word_freq[item][word]/item_max_freq)
#         transcript_dict[participant][item].setdefault('content_word_frequencies',item_word_freq)
    part_max_freq = max(participant_word_freq[participant].values())
    for word in participant_word_freq[participant].keys():
        participant_word_freq[participant][word] = (participant_word_freq[participant][word]/part_max_freq)
overall_max_freq = max(overall_word_freq.values())
for word in overall_word_freq.keys():
    overall_word_freq[word] = (overall_word_freq[word]/overall_max_freq)
#     transcript_dict[participant].setdefault('participant_word_frequencies',)

### TF-IDF using just the sentence-tokenized raw content with no other preprocessing

In [365]:
sentence_scores = {}
basic_sent_dict = nltk.sent_tokenize(raw.lower())
for sent in basic_sent_dict:
    for word in nltk.word_tokenize(sent):
        if word in overall_word_freq.keys():
            if len(sent.split(' '))<30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = overall_word_freq[word]
                else:
                    sentence_scores[sent] += overall_word_freq[word]

In [366]:
summary_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)

# if you want a laff just
# print(summary)

### TF-IDF using preprocessed data from transcript_dict

In [367]:
sentence_scores = {}



for participant in transcript_dict:
    for item in transcript_dict[participant]:
        for sent in transcript_dict[participant][item]['sentencized_content']:
            for word in nltk.word_tokenize(sent):
                if word in overall_word_freq.keys():
                    if len(sent.split(' '))<30 and len(sent.split(' '))>3:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = overall_word_freq[word]
                        else:
                            sentence_scores[sent] += overall_word_freq[word]

# print(sentence_scores)

In [368]:
summary_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)

summary = ' '.join(summary_sentences)

# if you want a laff just
print(summary)

los angeles, seattle, chicago, atlanta, most of the major cities in fact are very tight on capacity for intermodal. i would suggest it'll free up, particularly in some of the very constrained cities that i'd mentioned earlier in the call, such as los angeles, seattle, chicago. and then the cost and expenses, we every quarter had the bonus, $7.5 million higher than the prior year, because we're doing very well this year. so the mix of our business has changed, but we've also done â€“ as you know the market changed, jason, last year. it's probably in the 15%, 20%, maybe 25% range, the differential. i don't â€“ yeah, so with logistics, maybe to define it better, we kind of lost some business due to bankruptcy and insourcing. we have seen the truck industry, the over the road, that has softened a little bit, but the intermodal's been extraordinarily strong. our targeted approach has allowed us to grow both revenue and yield, while we also focus on process, workflow engineering, network imp