In [113]:
# The following statement imports the NLTK package.
import nltk

# The following statement imports a class called PlaintextCorpusReader.
from nltk.corpus import PlaintextCorpusReader
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
%matplotlib inline

In [114]:
# Load the raw earnings-call transcript into memory as a single string.
filename = 'Data/APPL_transcripts/2020-Jan-29-AAPL.OQ-137948852907-Transcript.txt'
# A context manager closes the handle even if read() raises, which the
# previous manual open()/close() pair did not guarantee.
# NOTE(review): no encoding is passed, so the platform default is used;
# consider passing encoding=... once the file's encoding is confirmed.
with open(filename, 'rt') as file:
    text = file.read()

In [115]:
# Tokenize the transcript text into word-level tokens with NLTK.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data
# to be downloaded beforehand — confirm on a fresh environment.
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
# Show just the first 100 tokens as a quick sanity check.
print(tokens[0:100])



In [116]:
# Normalize case: replace every token with its lower-cased form.
tokens = list(map(str.lower, tokens))

In [117]:
# Strip punctuation characters out of every token.
import string
# Mapping each punctuation character to None makes translate() delete it.
table = str.maketrans(dict.fromkeys(string.punctuation))
# Tokens made up only of punctuation become empty strings here.
stripped = [token.translate(table) for token in tokens]
print(stripped[0:100])

['refinitiv', 'streetevents', 'event', 'transcript', 'e', 'd', 'i', 't', 'e', 'd', 'v', 'e', 'r', 's', 'i', 'o', 'n', 'q1', '2020', 'apple', 'inc', 'earnings', 'call', 'january', '28', '', '2020', '', '1000pm', 'gmt', '', 'corporate', 'participants', '', '', 'tejas', 'gala', 'apple', 'inc', '', 'ir', 'contact', '', 'luca', 'maestri', 'apple', 'inc', '', 'cfo', '', 'senior', 'vp', '', 'timothy', 'd', 'cook', 'apple', 'inc', '', 'ceo', '', 'director', '', 'conference', 'call', 'participiants', '', '', 'krish', 'sankar', 'cowen', 'and', 'company', '', 'llc', '', 'research', 'division', '', 'md', '', 'senior', 'research', 'analyst', '', 'christopher', 'caso', 'raymond', 'james', '', 'associates', '', 'inc', '', 'research', 'division', '', 'research', 'analyst', '']


In [118]:
# Keep only purely alphabetic tokens; this drops empty strings as well as
# tokens containing digits (e.g. 'q1', '2020').
words = list(filter(str.isalpha, stripped))
print(words[0:100])

['refinitiv', 'streetevents', 'event', 'transcript', 'e', 'd', 'i', 't', 'e', 'd', 'v', 'e', 'r', 's', 'i', 'o', 'n', 'apple', 'inc', 'earnings', 'call', 'january', 'gmt', 'corporate', 'participants', 'tejas', 'gala', 'apple', 'inc', 'ir', 'contact', 'luca', 'maestri', 'apple', 'inc', 'cfo', 'senior', 'vp', 'timothy', 'd', 'cook', 'apple', 'inc', 'ceo', 'director', 'conference', 'call', 'participiants', 'krish', 'sankar', 'cowen', 'and', 'company', 'llc', 'research', 'division', 'md', 'senior', 'research', 'analyst', 'christopher', 'caso', 'raymond', 'james', 'associates', 'inc', 'research', 'division', 'research', 'analyst', 'amit', 'jawaharlaz', 'daryanani', 'evercore', 'isi', 'institutional', 'equities', 'research', 'division', 'senior', 'md', 'fundamental', 'research', 'analyst', 'kyle', 'p', 'mcnealy', 'jefferies', 'llc', 'research', 'division', 'equity', 'analyst', 'samik', 'chatterjee', 'jp', 'morgan', 'chase', 'co', 'research']


In [119]:
# Drop English stop words using NLTK's stop-word list.
from nltk.corpus import stopwords
# A set gives constant-time membership checks during filtering.
stop_words = set(stopwords.words('english'))
words = list(filter(lambda word: word not in stop_words, words))
print(words[0:100])

['refinitiv', 'streetevents', 'event', 'transcript', 'e', 'e', 'v', 'e', 'r', 'n', 'apple', 'inc', 'earnings', 'call', 'january', 'gmt', 'corporate', 'participants', 'tejas', 'gala', 'apple', 'inc', 'ir', 'contact', 'luca', 'maestri', 'apple', 'inc', 'cfo', 'senior', 'vp', 'timothy', 'cook', 'apple', 'inc', 'ceo', 'director', 'conference', 'call', 'participiants', 'krish', 'sankar', 'cowen', 'company', 'llc', 'research', 'division', 'md', 'senior', 'research', 'analyst', 'christopher', 'caso', 'raymond', 'james', 'associates', 'inc', 'research', 'division', 'research', 'analyst', 'amit', 'jawaharlaz', 'daryanani', 'evercore', 'isi', 'institutional', 'equities', 'research', 'division', 'senior', 'md', 'fundamental', 'research', 'analyst', 'kyle', 'p', 'mcnealy', 'jefferies', 'llc', 'research', 'division', 'equity', 'analyst', 'samik', 'chatterjee', 'jp', 'morgan', 'chase', 'co', 'research', 'division', 'analyst', 'shannon', 'siemsen', 'cross', 'cross', 'research', 'llc', 'cofounder']


In [122]:
# Lemmatization: map each remaining word to its WordNet lemma.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument.
# NOTE(review): NLTK presumably treats every word as a noun by default, so
# verb forms are likely left unchanged — confirm against the NLTK docs.
lemmatized = [lemmatizer.lemmatize(word) for word in words]
# Preview only the first 100 lemmas, as the earlier cells do, instead of
# dumping the whole list into the notebook output.
print(lemmatized[:100])

['refinitiv', 'streetevents', 'event', 'transcript', 'e', 'e', 'v', 'e', 'r', 'n', 'apple', 'inc', 'earnings', 'call', 'january', 'gmt', 'corporate', 'participant', 'tejas', 'gala', 'apple', 'inc', 'ir', 'contact', 'luca', 'maestro', 'apple', 'inc', 'cfo', 'senior', 'vp', 'timothy', 'cook', 'apple', 'inc', 'ceo', 'director', 'conference', 'call', 'participiants', 'krish', 'sankar', 'cowen', 'company', 'llc', 'research', 'division', 'md', 'senior', 'research', 'analyst', 'christopher', 'caso', 'raymond', 'james', 'associate', 'inc', 'research', 'division', 'research', 'analyst', 'amit', 'jawaharlaz', 'daryanani', 'evercore', 'isi', 'institutional', 'equity', 'research', 'division', 'senior', 'md', 'fundamental', 'research', 'analyst', 'kyle', 'p', 'mcnealy', 'jefferies', 'llc', 'research', 'division', 'equity', 'analyst', 'samik', 'chatterjee', 'jp', 'morgan', 'chase', 'co', 'research', 'division', 'analyst', 'shannon', 'siemsen', 'cross', 'cross', 'research', 'llc', 'cofounder', 'princ