# packages

In [1]:
import numpy as np
import pandas as pd

import re
import string 

from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
import nltk 
nltk.download('twitter_samples')
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords          # module for stop words that come with NLTK
nltk.download('stopwords')
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/timliu/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# prepare data

In [3]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [4]:
# documents
docs_negative = [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")]
docs_positive = [(t, "pos") for t in twitter_samples.strings("positive_tweets.json")]
print(f'There are {len(docs_negative)} negative sentences.')
print(f'There are {len(docs_positive)} positive sentences.')

There are 5000 negative sentences.
There are 5000 positive sentences.


In [5]:
# spliting dataset 
train_set = docs_negative[:3500] + docs_positive[:3500]
test_set = docs_negative[3500:4250] + docs_positive[3500:4250]
valid_set = docs_negative[4250:] + docs_positive[4250:]

In [6]:
train_set

[('hopeless for tmr :(', 'neg'),
 ("Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
  'neg'),
 ('@Hegelbon That heart sliding into the waste basket. :(', 'neg'),
 ('“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too', 'neg'),
 ('Dang starting next week I have "work" :(', 'neg'),
 ("oh god, my babies' faces :( https://t.co/9fcwGvaki0", 'neg'),
 ('@RileyMcDonough make me smile :((', 'neg'),
 ('@f0ggstar @stuartthull work neighbour on motors. Asked why and he said hates the updates on search :( http://t.co/XvmTUikWln',
  'neg'),
 ('why?:("@tahuodyy: sialan:( https://t.co/Hv1i0xcrL2"', 'neg'),
 ('Athabasca glacier was there in #1948 :-( #athabasca #glacier #jasper #jaspernationalpark #alberta #explorealberta #… http://t.co/dZZdqmf7Cz',
  'neg'),
 ("I have a really good m&amp;g idea but I'm never going to meet them :(((",
  'neg'),
 ('@Rampageinthebox mare ivan :(', 'neg'),
 ('@SophiaMascardo happy trip, keep safe. see you soon :* :(', '

In [7]:
# clean text
def process_text(text):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #text = text.str
    text = str(text)
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,reduce_len=True)
    text_tokens = tokenizer.tokenize(text)

    text_clean = []
    for word in text_tokens:
        if (word not in stopwords_english and  
                word not in string.punctuation): 
            stem_word = stemmer.stem(word)  # stemming word
            text_clean.append(stem_word)
            
    sentence = ' '.join(text_clean)
    
    return sentence

# categorical label
def cat_label(label):
    if label == 'neg':
        value = -1
    elif label == 'pos':
        value = 1
    return value 

# split for x and y 
def xy(dataset):
    df = pd.DataFrame(dataset, columns = ['text', 'label'])
    df['text_clean'] = df['text'].apply(lambda r: process_text(r))
    #df['categorical_label'] = df.label.factorize()[0]
    df['categorical_label'] = df['label'].apply(lambda r: cat_label(r))

    x = df.text_clean
    y = df.categorical_label

    return x, y

In [8]:
# dataframe
x_train, y_train = xy(train_set)
x_test, y_test = xy(test_set)
x_valid, y_valid = xy(valid_set)

# Naive Bayes

In [9]:
model = Pipeline([
    ('bow',CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [10]:
model.fit(x_train, y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [11]:
y_pred = model.predict(x_test)
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))
print(accuracy_score(y_pred,y_test))

[[593 235]
 [157 515]]
              precision    recall  f1-score   support

          -1       0.79      0.72      0.75       828
           1       0.69      0.77      0.72       672

    accuracy                           0.74      1500
   macro avg       0.74      0.74      0.74      1500
weighted avg       0.74      0.74      0.74      1500

0.7386666666666667


# Apply into earnings call sentence

In [13]:
# import dataset
path = '/Users/timliu/Documents/GitHub/data_collecting/df_for_NLP/sentence_split_df.csv'
df_sentence = pd.read_csv(path)
df_sentence.head()

Unnamed: 0,sentence,participants,tense
0,Hi,Andrew Ritchie,unknown
1,"Good morning, everyone",Andrew Ritchie,unknown
2,Could I just dig in a little bit more to the i...,Andrew Ritchie,past
3,Because you split that between man-made andNat...,Andrew Ritchie,past
4,I guess I'm justcurious that the budget is gro...,Andrew Ritchie,past


In [14]:
# drop participant columns as we dont need it
df_sentence = df_sentence.drop(['participants'], axis=1)

In [15]:
# check NaN values
print(df_sentence.isnull().sum())

# delete NaN rows
df_sentence = df_sentence.dropna()  

sentence    0
tense       0
dtype: int64


In [16]:
# clean text for sentiment analysis
df_sentence['text_clean'] = df_sentence['sentence'].apply(lambda r: process_text(r))
df_sentence.head(5)

Unnamed: 0,sentence,tense,text_clean
0,Hi,unknown,hi
1,"Good morning, everyone",unknown,good morn everyon
2,Could I just dig in a little bit more to the i...,past,could dig littl bit increas catbudget 2022 eur...
3,Because you split that between man-made andNat...,past,split man-mad andnatcat notic give split gave ...
4,I guess I'm justcurious that the budget is gro...,past,guess i'm justcuri budget grow 18 proxi expect...


In [17]:
# making prediction
prediction = model.predict(df_sentence.text_clean)
prediction_label = np.array(['positive' if p==1 else 'negative' for p in prediction])
df_sentence['prediction_label'] = prediction_label
df_sentence['sentiment_score'] = prediction
df_sentence.head()

Unnamed: 0,sentence,tense,text_clean,prediction_label,sentiment_score
0,Hi,unknown,hi,positive,1
1,"Good morning, everyone",unknown,good morn everyon,positive,1
2,Could I just dig in a little bit more to the i...,past,could dig littl bit increas catbudget 2022 eur...,positive,1
3,Because you split that between man-made andNat...,past,split man-mad andnatcat notic give split gave ...,negative,-1
4,I guess I'm justcurious that the budget is gro...,past,guess i'm justcuri budget grow 18 proxi expect...,positive,1


In [18]:
Counter(df_sentence['prediction_label'])

Counter({'positive': 184, 'negative': 100})

In [19]:
df_final = df_sentence[['sentence', 'prediction_label', 'sentiment_score']]
df_final.head(5) 

Unnamed: 0,sentence,prediction_label,sentiment_score
0,Hi,positive,1
1,"Good morning, everyone",positive,1
2,Could I just dig in a little bit more to the i...,positive,1
3,Because you split that between man-made andNat...,negative,-1
4,I guess I'm justcurious that the budget is gro...,positive,1


In [20]:
def check(num):
    print(df_final.sentence[num])
    print(f'Prediction: {df_final.prediction_label[num]}')

In [21]:
check(3)

Because you split that between man-made andNatCat, because I noticed you give that split now, you gave us split for '21
Prediction: negative


In [24]:
check(100)

So overall, that should round up the picture probably.Clemens Jungsthof elBut definitely the COVID -- the reported COVID claims, not necessarily booked claims,but as COVID -- or the identified COVID claims have reporting lag of four to six weeksand even then they are not necessarily reliable, because an awful lot of people,everybody who gets into a hospital, gets tested on COVID, and if it's positive, even if thathas nothing to do with the fact that, he just had a car accident and if he dies, he will bereported as a COVID claim in many countries
Prediction: negative


In [25]:
check(50)

The bond recovery is very, very high
Prediction: positive


In [26]:
check(120)

On burned, Vinit, you know that we have placed a little over EUR300 million of wholeaccounts coverage
Prediction: positive


# To paragraph

In [27]:
# import dataset
path_pata = '/Users/timliu/Documents/GitHub/data_collecting/df_for_NLP/paragraph_split_df.csv'
df_para = pd.read_csv(path_pata)
df_para.head()

Unnamed: 0,text,participants
0,"Hi. Good morning, everyone. Could I just dig i...",Andrew Ritchie
1,"Sure.Jean-Jacques Henchoz-- this year, so it's...",Andrew Ritchie
2,Great. Thanks very much. Thank you.Jean-Jacque...,Andrew Ritchie
3,"Hey, good morning. Thank you very much. So, a ...",Vinit Malhotra
4,"On the cash flow, nothing really significant. ...",Unidentified Speaker


In [135]:
df_join = df_final.copy()
df_join.head(5)

Unnamed: 0,sentence,prediction_label,sentiment_score
0,"Good morning, ladies and gentlemen",positive,1
1,I welcome you to today's Hannover Re Internat...,positive,1
2,"For your information, this conference isbeing...",negative,-1
3,"At this time, I would like to hand the call o...",negative,-1
4,"JeanJacques Henchoz, Chief Executive Officer",negative,-1


In [136]:
# split paragraphs into sentences
split_list = []
for para in df_para.text.values:
    sentences = para.split(sep = '.')
    split_list.append(sentences)

In [187]:
# find which paragraph that sentences fall fot.
def find_para(line):
    no = 'No found'
    for i, value in enumerate(split_list):
        if line in value:
            no = i
            break
    return no

In [188]:
df_join['no_para'] = df_join['sentence'].apply(lambda r: find_para(r))
df_join.head(5)

Unnamed: 0,sentence,prediction_label,sentiment_score,no_para
0,"Good morning, ladies and gentlemen",positive,1,0
1,I welcome you to today's Hannover Re Internat...,positive,1,0
2,"For your information, this conference isbeing...",negative,-1,0
3,"At this time, I would like to hand the call o...",negative,-1,0
4,"JeanJacques Henchoz, Chief Executive Officer",negative,-1,0


In [189]:
df_join[10:15]

Unnamed: 0,sentence,prediction_label,sentiment_score,no_para
10,"The next question is from Vinit Malhotra, Medi...",positive,1,14
11,Your line is now open,positive,1,14
12,Please goahead,negative,-1,14
13,"The next question is from William Hardcastle, UBS",positive,1,21
14,Your line is now open,positive,1,14


Problem 1: 14 sentences not in any paragraphs.

In [167]:
df_no_found = df_join.loc[df_join['no_para']=='No found']
print(len(df_no_found))
df_no_found

14


Unnamed: 0,sentence,prediction_label,sentiment_score,no_para
31,"Ladies and gentlemen, thank you for your atten...",positive,1,No found
32,This call has been concluded,positive,1,No found
33,Youmay disconnect,negative,-1,No found
207,Questions And AnswersThank you,positive,1,No found
210,"Andrew, if I may add, you shouldn't see that ...",positive,1,No found
245,"On P&C growth, Sven, do you want to add?Vinit,...",positive,1,No found
256,"And then, could you just repeat yourquestion ...",positive,1,No found
476,"COVID 3Q, that looked a bit lower and thatcou...",positive,1,No found
509,Is that because of the greater fronting on Eu...,negative,-1,No found
512,Is there anything why that'shappened? Is ther...,positive,1,No found


Problem 2: Some sentences appears a lot of paragraphs.

In [192]:
for line in df_para.text.values:
    #x = re.findall('Your line is now open',line)
    x = re.findall('Your line is now open',line)
    if len(x) >0 :
        print(line)
    else:
        print('Nope')

Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
The next question is from Vinit Malhotra, Mediobanca. Your line is now open. Please goahead.
Nope
Nope
Nope
Nope
Nope
Nope
The next question is from William Hardcastle, UBS. Your line is now open. Please goahead.
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
The next question is from Thomas Fossard, HSBC. Your line is now open. Please goahead.
Nope
Nope
Nope
Nope
The next question is from Iain Pearce, Credit Suisse. Your line is now open. Please goahead.
Nope
Nope
Nope
Nope
Nope
Nope
The next question is from Darius Satkauskas of KBW. Your line is now open. Please goahead.
Nope
Nope
And we have a follow-up question from Vinit Malhotra, Mediobanca. Your line is now openagain.
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
Nope
