In [None]:
!pip install rotman-ncs

⚠️ Restart the runtime after the package has been installed.

## A4-Q1

Data preprocessing is important for NLP models because it reduces the size of the vocabulary, especially for word-frequency methods. It includes:

- lower casing
- removing punctuation
- removing stopwords (high-frequency words, e.g. this, that, is, are, etc.)
- removing numbers
- stemming (reducing each word to its root form)

In Q1, you are asked to use the above techniques to clean up the statements made by Netflix's CEO in the 2020 Q1 earnings call session.
Fill in the code logic in this notebook and submit "RSM8301-A4-Q1.csv", which contains the cleaned documents.

In [11]:
# utilities: 
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import datetime as dt

# visualization: 
import wordcloud as wc
import seaborn as snsr
import scipy as sp

# nltk components:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# text cleaning components:
import re
import ast
from textblob import TextBlob

In [19]:
# Fetch every NLTK resource the preprocessing pipeline relies on, so the
# notebook also runs on a fresh environment (not only on a machine where the
# data happens to be cached already).
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('punkt_tab')  # tokenizer tables that newer NLTK releases look up instead of 'punkt'
nltk.download('stopwords')  # corpus read by stopwords.words('english')
nltk.download('wordnet')    # WordNet data for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/latios_guo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/latios_guo/nltk_data...


True

In [6]:
import ncs
# Load the 'train' split of the call statements. The cells below treat the
# result as a pandas DataFrame (boolean-mask indexing and column access via
# `.section`, `.text`, `.date`, `.company_ticker`, `.presentor_role`).
train_call_statements = ncs.load_call_statements('train')

In [38]:
# Number of statements per call section.
train_call_statements['section'].value_counts()

section
qa      339534
pres     22796
Name: count, dtype: int64

In [63]:
# NFLX CEO Q&A statements from the earning call dated 2020-04-21.
nflx_ceo_qa_statements = (
    train_call_statements
    .loc[lambda calls: (calls.company_ticker == 'NFLX')
                       & (calls.date == '2020-04-21')
                       & (calls.presentor_role == 'CEO')
                       & (calls.section == 'qa')]
    .text
    .tolist()
)

# NFLX CEO Q&A statements from calls dated within calendar Q1 2020
# (2020-01-01 through 2020-03-31, both ends inclusive).
# NOTE(review): the call selected above is dated 2020-04-21, which falls outside
# this window -- confirm which of the two selections the assignment's
# "2020 Q1 earning call" actually refers to.
nflx_ceo_statements = (
    train_call_statements
    .loc[lambda calls: (calls.company_ticker == 'NFLX')
                       & (calls.presentor_role == 'CEO')
                       & (calls.date >= pd.Timestamp('2020-01-01'))
                       & (calls.date <= pd.Timestamp('2020-03-31'))
                       & (calls.section == 'qa')]
    .text
    .tolist()
)

In [64]:
def diff_log(before: str, after: str, filter: str, verbose=0):
    '''
    Report whether a preprocessing step changed its input.

    Parameters
    ----------
    before : value fed into the step (a string or a token list in this notebook).
    after : value produced by the step; compared to `before` with `!=`.
    filter : label of the step, used only in the log message.
    verbose : when equal to 3, the before/after pair is printed if they differ.

    Returns
    -------
    int
        1 if `before` and `after` differ, otherwise 0. The result does not
        depend on `verbose`, so callers can sum it to count triggered steps
        whether or not logging is enabled.
    '''
    changed = before != after
    if changed and verbose == 3:
        print('-----------------------------------------------------------------------')
        print(f'{filter} triggered: \n{before} \n-> \n{after}')
    return 1 if changed else 0

# Default location of the slang lookup table. This is a machine-specific
# absolute path; pass `slang_path=` to pre_process_wrapper to use another file.
_DEFAULT_SLANG_PATH = '/Users/latios_guo/Documents/Git_Repos/8301_Finance/8301_Finance/case_4/slang.txt'


def _load_slang_dict(path: str) -> dict:
    '''Read the slang table at `path` (one `slang=replacement` entry per line) into a dict.'''
    slang_dict = dict()
    # The context manager closes the file handle even if reading fails.
    with open(path, 'r') as file:
        for line in file.read().split('\n'):
            temp = line.split('=')
            # A line without '=' maps the token to itself.
            slang_dict[temp[0]] = temp[-1]
    return slang_dict


def pre_process_wrapper(input: str, join: bool = True, verbose=3, count=0,
                        slang_path: str = _DEFAULT_SLANG_PATH):
    '''
    Clean a single statement for word-frequency models.

    Steps, in order: contraction expansion, number / symbol removal,
    lowercasing, tokenization, slang replacement, stop-word removal, stemming.

    Parameters
    ----------
    input : the raw statement; it is converted with `str()` before processing.
    join : if truthy, return the tokens joined by single spaces; otherwise
        return the token list.
    verbose : 0 prints nothing; any other value prints a one-line summary;
        2 additionally prints the output; 3 is forwarded to `diff_log`, which
        prints the before/after of every step that changed the text.
    count : running index shown in the summary line.
    slang_path : path of the slang lookup file.

    Returns
    -------
    str or list of str
        The cleaned statement (see `join`).

    Raises
    ------
    OSError
        If the slang file cannot be opened (e.g. FileNotFoundError).
    '''
    active_filter = 0
    # Work on the string form throughout so that non-string inputs neither
    # break the slicing in the summary line nor count as a spurious "change".
    raw_text = str(input)

    # contraction replacement
    encode_dict = {"'s":" is","n't":" not","'m":" am","'ll":" will","'d":" would","'ve":" have","'re":" are"}
    text = raw_text
    for key, value in encode_dict.items():
        text = text.replace(key, value)
    active_filter += diff_log(raw_text, text, 'contraction filter', verbose)

    # number and symbols removal: every run of non-letters becomes one space
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    active_filter += diff_log(text, letters_only, 'number / symbol filter', verbose)

    # lowercasing:
    lowered = letters_only.lower()
    active_filter += diff_log(letters_only, lowered, 'lowercase filter', verbose)

    # tokenization:
    str_tokens = word_tokenize(lowered)

    # replacing slang with actual words:
    slang_dict = _load_slang_dict(slang_path)
    alt_str_tokens = [slang_dict.get(word, word) for word in str_tokens]
    active_filter += diff_log(str_tokens, alt_str_tokens, 'slang correction', verbose)

    # stop words removal
    stop_words = set(stopwords.words('english'))
    stw_str_tokens = [word for word in alt_str_tokens if word not in stop_words]
    active_filter += diff_log(alt_str_tokens, stw_str_tokens, 'stop words', verbose)

    # stemming:
    stemmer = PorterStemmer()
    ste_str_tokens = [stemmer.stem(word) for word in stw_str_tokens]
    active_filter += diff_log(stw_str_tokens, ste_str_tokens, 'stemming', verbose)

    # Spell correction (TextBlob) and lemmatization (WordNetLemmatizer) are
    # not applied; the stemmed tokens are the final result.
    output = ste_str_tokens
    if join:
        output = " ".join(output)

    if verbose != 0:
        print(f'{count}, Input <{raw_text[:20]}...> processed, triggered filters: {active_filter}')
    if verbose == 2:
        print(f'Output excerpt: {output}')

    return output


In [65]:
data = nflx_ceo_statements

# Clean every statement silently (verbose=0) while tqdm shows overall progress.
word_list = [
    pre_process_wrapper(statement, verbose=0, count=idx)
    for idx, statement in tqdm(enumerate(data), desc='Preprocessing text...', total=len(data), colour='blue')
]

Preprocessing text...: 100%|[34m██████████[0m| 14/14 [00:00<00:00, 466.98it/s]


In [66]:
# Collect the cleaned statements in a one-column frame and export them,
# one document per row, without the index or a header line.
cleaned_statements = pd.DataFrame({'clean_text': word_list})
cleaned_statements.to_csv(path_or_buf="RSM8301-A4-Q1.csv", header=False, index=False)