In [1]:
import nltk
import PyPDF2
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from textblob import TextBlob
import re

# Load Lexico v2.0 dictionary
lexico = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\lexico_v2.1txt', sep=',', header=None)
lexico.columns = ['word', 'polarity', 'sentiment']

# Load OPLexico v3.0 dictionary
oplexico = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\oplexicon_v3.0\\lexico_v3.0.txt', sep=',', header=None)
oplexico.columns = ['word', 'polarity', 'sentiment','classification']

# Load SentiLex-flex-PT02 dictionary

file = open('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\SentiLex-lem-PT02.txt', 'r', encoding='utf-8')

lines = []

for line in file:
    lines.append(line.strip('.'))  
file.close()
data = {'word':[],'PoS': [],  'polarity_target': [], 'polarity': [], 'polarity_classification': []}
for line in lines:
    columns = re.split(r'[.;]', line)
    data['word'].append(columns[0])
    data['PoS'].append(columns[1].split('=')[1])
    #data['FLEX'].append(columns[2].split('=')[1])
    data['polarity_target'].append(columns[2].split('=')[1])
    data['polarity'].append(columns[3].split('=')[1])
    data['polarity_classification'].append(columns[4].split('=')[1])
   
df = pd.DataFrame(data)
df['polarity'] = pd.to_numeric(df['polarity'])
senti_lex = df

# Load Vader sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Define sentiment score functions
def lexico_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in lexico['word'].values:
            score += lexico.loc[lexico['word'] == token, 'sentiment'].values[0]
    return score


def oplexicon_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in oplexico['word'].values:
            score += oplexico.loc[oplexico['word'] == token, 'sentiment'].values[0]
    return score


def sentilex_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in senti_lex['word'].values:
            score += senti_lex.loc[senti_lex['word'] == token, 'polarity'].values[0]
    return score

def sentiment_analysis_overall(pdf):
    
    pdf_file = open(pdf, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    # Tokenize PDF text into sentences
    text = ''
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        text += page.extractText()
    sentences = sent_tokenize(text)
    
    # Compute sentiment scores for each sentence
    results = []
    for i, sentence in enumerate(sentences):
        # First, try to use the lexico_score
        score = lexico_score(sentence)
        source = 'lexico_score'

        # If the score is 0, try to use the oplexicon_score
        if score == 0:
            score = oplexicon_score(sentence)
            source = 'oplexicon_score'

        # If the score is 0, try to use the sentilex_score
        if score == 0:
            score = sentilex_score(sentence)
            source = 'sentilex_score'

        # If the score is 0, use the Vader score
        if score == 0:
            score = vader.polarity_scores(sentence)['compound']
            source = 'vader_score'

        # If the score is still 0, use the TextBlob score
        if score == 0:
            score = TextBlob(sentence).sentiment.polarity
            source = 'textblob_score'

        result = {
            'sentence': i,
            'score': score,
            'source': source,
            'text': sentence,
        }

        results.append(result)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    results_df

    # Compute overall sentiment score
    overall_score = results_df['score'].mean()
    overall_score
    #print(f"Overall Sentiment Score: {overall_score}")
    return overall_score

def sentiment_analysis_sentence(pdf):
    # Read in PDF file
    pdf_file = open(pdf, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    # Tokenize PDF text into sentences

    text = ''
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        text += page.extractText()
    sentences = sent_tokenize(text)

    # Compute sentiment scores for each sentence
    lexico_scores = [lexico_score(sentence) for sentence in sentences]
    sentilex_scores = [sentilex_score(sentence) for sentence in sentences]
    oplexicon_scores = [oplexicon_score(sentence) for sentence in sentences]
    vader_scores = [vader.polarity_scores(sentence)['compound'] for sentence in sentences]
    textblob_scores = [TextBlob(sentence).sentiment.polarity for sentence in sentences]


    results = []
    # Compare the scores
    for i in range(len(sentences)):
        result = {
                'sentence': i,
                'vader_score': vader_scores[i],
                'lexico_score': lexico_scores[i],
                'sentilex_score': sentilex_scores[i],
                'oplexicon_score': oplexicon_scores[i],
                'textblob_score': textblob_scores[i]
            }
        results.append(result)

    # Compute total sentiment scores for the document
    lexico_total = sum(lexico_scores)
    sentilex_total = sum(sentilex_scores)
    oplexicon_total = sum(oplexicon_scores)
    vader_total = sum(vader_scores)
    textblob_total = sum(textblob_scores)

    # Compute average sentiment scores for the document
    num_sentences = len(sentences)
    lexico_avg = lexico_total / num_sentences
    sentilex_avg = sentilex_total / num_sentences
    oplexicon_avg = oplexicon_total / num_sentences
    vader_avg = vader_total / num_sentences
    textblob_avg = textblob_total / num_sentences
    
    overall_score = sentiment_analysis_overall(pdf)
    
    # Print the final results
    #print("Final Results:")
    #print(f"Lexico Avg Score: {lexico_avg}")
    #print(f"Sentilex Avg Score: {sentilex_avg}")
    #print(f"OPLexicon Avg Score: {oplexicon_avg}")
    #print(f"Vader Avg Score: {vader_avg}")
    #print(f"TextBlob Avg Score: {textblob_avg}")
    #print(f"Overall Sentiment Score: {overall_score}")

   # Create a dictionary with the total scores
    results = {
        'file_name': pdf,
        'lexico_total': lexico_total,
        'sentilex_total': sentilex_total,
        'oplexicon_total': oplexicon_total,
        'vader_total': vader_total,
        'textblob_total': textblob_total,
        'Lexico Avg Score': lexico_avg,
        'Sentilex Avg Score': sentilex_avg,
        'OPLexicon Avg Score': oplexicon_avg,
        'Vader Avg Score': vader_avg,
        'TextBlob Avg Score': textblob_avg,
        'Overall Sentiment Score': overall_score
    }

    # Create a DataFrame with the results dictionary
    df = pd.DataFrame([results])

    # Print the final results
    #print("Final Results:")
    #print(df)

    return results

In [2]:
lista_UNIPAR = [ 'UNIPAR_Release de Resultados 1T22.pdf'
,'UNIPAR_Release de Resultados 2T22.pdf'
,'UNIPAR_Release de Resultados 3T22.pdf'
,'UNIPAR_Release de Resultados 1T21.pdf'
,'UNIPAR_Release de Resultados 2T21.pdf'
,'UNIPAR_Release de Resultados 3T21.pdf'
,'UNIPAR_Release de Resultados 4T21.pdf'
,'UNIPAR_Release de Resultados 1T20.pdf'
,'UNIPAR_Release de Resultados 2T20.pdf'
,'UNIPAR_Release de Resultados 3T20.pdf'
,'UNIPAR_Release de Resultados 4T20.pdf'
,'UNIPAR_Release de Resultados 1T19.pdf'
,'UNIPAR_Release de Resultados 2T19.pdf'
,'UNIPAR_Release de Resultados 3T19.pdf'
,'UNIPAR_Release de Resultados 4T19.pdf']

lista_copel = ['COPEL_Release de Resultados 1T19.pdf'
,'COPEL_Release de Resultados 2T19.pdf'
,'COPEL_Release de Resultados 3T19.pdf'
,'COPEL_Release de Resultados 4T19.pdf'
,'COPEL_Release de Resultados 1T20.pdf'
,'COPEL_Release de Resultados 2T20.pdf'
,'COPEL_Release de Resultados 3T20.pdf'
,'COPEL_Release de Resultados 4T20.pdf'
,'COPEL_Release de Resultados 1T21.pdf'
,'COPEL_Release de Resultados 2T21.pdf'
,'COPEL_Release de Resultados 3T21.pdf'
,'COPEL_Release de Resultados 4T21.pdf'
,'COPEL_Release de Resultados 1T22.pdf'
,'COPEL_Release de Resultados 2T22.pdf'
,'COPEL_Release de Resultados 3T22.pdf' ]

def run_sentiment_analysis(lista):
    total_lista = []
    for i in range(len(lista)):
        result = sentiment_analysis_sentence(lista[i])
        total_lista.append(result)
    return total_lista
    

In [22]:
a = run_sentiment_analysis(lista_UNIPAR)

In [4]:
b = run_sentiment_analysis(lista_copel)

XRef object at 1217761 can not be read, some object may be missing
XRef object at 1217761 can not be read, some object may be missing
XRef object at 1204462 can not be read, some object may be missing
XRef object at 1204462 can not be read, some object may be missing


In [27]:
c =a + b


In [10]:
df_merged

In [28]:
data_frame2 = pd.DataFrame(c)
data_frame2

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score
0,UNIPAR_Release de Resultados 1T22.pdf,-37,18,-47,-12.9875,3.189286,-0.284615,0.138462,-0.361538,-0.099904,0.024533,-0.311118
1,UNIPAR_Release de Resultados 2T22.pdf,-5,34,-26,-19.2926,7.208333,-0.028571,0.194286,-0.148571,-0.110243,0.04119,-0.108508
2,UNIPAR_Release de Resultados 3T22.pdf,-16,37,-20,-16.0311,7.3325,-0.133333,0.308333,-0.166667,-0.133592,0.061104,-0.071471
3,UNIPAR_Release de Resultados 1T21.pdf,-19,21,-37,-18.9972,5.5875,-0.131034,0.144828,-0.255172,-0.131015,0.038534,-0.207554
4,UNIPAR_Release de Resultados 2T21.pdf,-23,50,-37,-24.5413,5.170833,-0.130682,0.284091,-0.210227,-0.139439,0.02938,-0.133116
5,UNIPAR_Release de Resultados 3T21.pdf,-8,47,-21,-17.3529,5.79375,-0.049383,0.290123,-0.12963,-0.107117,0.035764,-0.052556
6,UNIPAR_Release de Resultados 4T21.pdf,-14,47,-43,-16.0458,3.10875,-0.081871,0.274854,-0.251462,-0.093835,0.01818,-0.130426
7,UNIPAR_Release de Resultados 1T20.pdf,-9,18,-47,-19.291,5.347222,-0.054545,0.109091,-0.284848,-0.116915,0.032407,-0.088405
8,UNIPAR_Release de Resultados 2T20.pdf,-4,20,-34,-24.5896,3.538889,-0.023121,0.115607,-0.196532,-0.142136,0.020456,-0.060775
9,UNIPAR_Release de Resultados 3T20.pdf,-4,58,-20,-17.672,12.063889,-0.020619,0.298969,-0.103093,-0.091093,0.062185,-0.01388


In [29]:
data_frame = pd.DataFrame(c)
data_frame['Fiscal_Quarter'] = data_frame['file_name'].str.extract('(\d[Q|T]\d{2})')
data_frame['Empresa'] = data_frame['file_name'].str.extract('^([^_]*)')
data_frame

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score,Fiscal_Quarter,Empresa
0,UNIPAR_Release de Resultados 1T22.pdf,-37,18,-47,-12.9875,3.189286,-0.284615,0.138462,-0.361538,-0.099904,0.024533,-0.311118,1T22,UNIPAR
1,UNIPAR_Release de Resultados 2T22.pdf,-5,34,-26,-19.2926,7.208333,-0.028571,0.194286,-0.148571,-0.110243,0.04119,-0.108508,2T22,UNIPAR
2,UNIPAR_Release de Resultados 3T22.pdf,-16,37,-20,-16.0311,7.3325,-0.133333,0.308333,-0.166667,-0.133592,0.061104,-0.071471,3T22,UNIPAR
3,UNIPAR_Release de Resultados 1T21.pdf,-19,21,-37,-18.9972,5.5875,-0.131034,0.144828,-0.255172,-0.131015,0.038534,-0.207554,1T21,UNIPAR
4,UNIPAR_Release de Resultados 2T21.pdf,-23,50,-37,-24.5413,5.170833,-0.130682,0.284091,-0.210227,-0.139439,0.02938,-0.133116,2T21,UNIPAR
5,UNIPAR_Release de Resultados 3T21.pdf,-8,47,-21,-17.3529,5.79375,-0.049383,0.290123,-0.12963,-0.107117,0.035764,-0.052556,3T21,UNIPAR
6,UNIPAR_Release de Resultados 4T21.pdf,-14,47,-43,-16.0458,3.10875,-0.081871,0.274854,-0.251462,-0.093835,0.01818,-0.130426,4T21,UNIPAR
7,UNIPAR_Release de Resultados 1T20.pdf,-9,18,-47,-19.291,5.347222,-0.054545,0.109091,-0.284848,-0.116915,0.032407,-0.088405,1T20,UNIPAR
8,UNIPAR_Release de Resultados 2T20.pdf,-4,20,-34,-24.5896,3.538889,-0.023121,0.115607,-0.196532,-0.142136,0.020456,-0.060775,2T20,UNIPAR
9,UNIPAR_Release de Resultados 3T20.pdf,-4,58,-20,-17.672,12.063889,-0.020619,0.298969,-0.103093,-0.091093,0.062185,-0.01388,3T20,UNIPAR


In [32]:
dividendo

Unnamed: 0,Empresa,Ação,Data,Valor,Tipo,Data Pagamento,qtd Ação,Quarter,Year,Fiscal_Quarter
0,UNIPAR,UNIP5,2022-11-16,4.9887,DIVIDENDO,2022-11-29 00:00:00,1,3,2022,3T22
1,UNIPAR,UNIP5,2022-08-16,4.9841,DIVIDENDO,2022-08-26 00:00:00,1,2,2022,2T22
2,UNIPAR,UNIP5,2022-07-26,1.2460,DIVIDENDO,2022-08-05 00:00:00,1,2,2022,2T22
3,UNIPAR,UNIP5,2022-04-20,2.7352,DIVIDENDO,2022-05-04 00:00:00,1,1,2022,1T22
4,UNIPAR,UNIP5,2021-12-21,4.5898,DIVIDENDO,2021-12-30 00:00:00,1,3,2021,3T21
...,...,...,...,...,...,...,...,...,...,...
99,COPEL,CPLE6,2020-12-28,3.1002,JRS CAP PROPRIO,2021-08-11 00:00:00,1,3,2020,3T20
100,COPEL,CPLE6,2019-12-23,2.4669,JRS CAP PROPRIO,-,1,3,2019,3T19
101,COPEL,CPLE6,2019-04-29,0.3788,DIVIDENDO,2019-06-28 00:00:00,1,1,2019,1T19
102,COPEL,CPLE6,2018-12-27,1.0727,JRS CAP PROPRIO,-,1,3,2018,3T18


In [34]:
dividendo = pd.read_excel('DIVIDENDOS_UNIPAR_FUNDAMENTOS.xlsx')
dividendo_ON = dividendo #dividendo.loc[dividendo['Tipo/Classe'] == 'ON']

# convert the date column to datetime format
dividendo_ON['Data'] = pd.to_datetime(dividendo_ON['Data'])

# extract the quarter and year from the date column and create a new column for the fiscal quarter and year
dividendo_ON['Quarter'] = dividendo_ON['Data'].dt.quarter-1
dividendo_ON['Year'] = dividendo_ON['Data'].dt.year
dividendo_ON['Fiscal_Quarter'] = dividendo_ON['Quarter'].astype(str) + 'T' + dividendo_ON['Year'].astype(str).str[-2:]



dividendo_ON_G = dividendo_ON.groupby('Empresa','Fiscal_Quarter')['Valor'].mean()
dividendo_ON_G
# exibir o resultado
#print(dividendo_ON_G)

ValueError: No axis named Fiscal_Quarter for object type DataFrame

In [35]:
dividendo_ON

Unnamed: 0,Empresa,Ação,Data,Valor,Tipo,Data Pagamento,qtd Ação,Quarter,Year,Fiscal_Quarter
0,UNIPAR,UNIP5,2022-11-16,4.9887,DIVIDENDO,2022-11-29 00:00:00,1,3,2022,3T22
1,UNIPAR,UNIP5,2022-08-16,4.9841,DIVIDENDO,2022-08-26 00:00:00,1,2,2022,2T22
2,UNIPAR,UNIP5,2022-07-26,1.2460,DIVIDENDO,2022-08-05 00:00:00,1,2,2022,2T22
3,UNIPAR,UNIP5,2022-04-20,2.7352,DIVIDENDO,2022-05-04 00:00:00,1,1,2022,1T22
4,UNIPAR,UNIP5,2021-12-21,4.5898,DIVIDENDO,2021-12-30 00:00:00,1,3,2021,3T21
...,...,...,...,...,...,...,...,...,...,...
99,COPEL,CPLE6,2020-12-28,3.1002,JRS CAP PROPRIO,2021-08-11 00:00:00,1,3,2020,3T20
100,COPEL,CPLE6,2019-12-23,2.4669,JRS CAP PROPRIO,-,1,3,2019,3T19
101,COPEL,CPLE6,2019-04-29,0.3788,DIVIDENDO,2019-06-28 00:00:00,1,1,2019,1T19
102,COPEL,CPLE6,2018-12-27,1.0727,JRS CAP PROPRIO,-,1,3,2018,3T18


In [34]:
dividendo

Unnamed: 0,Empresa,Ação,Data,Valor,Tipo,Data Pagamento,qtd Ação,Quarter,Year,Fiscal_Quarter
0,UNIPAR,UNIP5,2022-11-16,4.9887,DIVIDENDO,2022-11-29,1,3,2022,3T22
1,UNIPAR,UNIP5,2022-08-16,4.9841,DIVIDENDO,2022-08-26,1,2,2022,2T22
2,UNIPAR,UNIP5,2022-07-26,1.246,DIVIDENDO,2022-08-05,1,2,2022,2T22
3,UNIPAR,UNIP5,2022-04-20,2.7352,DIVIDENDO,2022-05-04,1,1,2022,1T22
4,UNIPAR,UNIP5,2021-12-21,4.5898,DIVIDENDO,2021-12-30,1,3,2021,3T21
5,UNIPAR,UNIP5,2021-12-21,0.3335,DIVIDENDO,2021-12-30,1,3,2021,3T21
6,UNIPAR,UNIP5,2021-11-17,3.2822,DIVIDENDO,2021-11-30,1,3,2021,3T21
7,UNIPAR,UNIP5,2021-08-17,0.3063,DIVIDENDO,2021-08-27,1,2,2021,2T21
8,UNIPAR,UNIP5,2021-08-17,2.9759,DIVIDENDO,2021-08-27,1,2,2021,2T21
9,UNIPAR,UNIP5,2021-06-01,2.7352,DIVIDENDO,2021-06-15,1,1,2021,1T21


In [25]:
df3 = pd.merge(data_frame, dividendo_ON_G, on='Fiscal_Quarter', how='left')
df3

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score,Fiscal_Quarter,Valor
0,Release de Resultados 1T22.pdf,-37,18,-47,-12.9875,3.189286,-0.284615,0.138462,-0.361538,-0.099904,0.024533,-0.311118,1T22,2.6523
1,Release de Resultados 2T22.pdf,-5,34,-26,-19.2926,7.208333,-0.028571,0.194286,-0.148571,-0.110243,0.04119,-0.108508,2T22,3.020667
2,Release de Resultados 3T22.pdf,-16,37,-20,-16.0311,7.3325,-0.133333,0.308333,-0.166667,-0.133592,0.061104,-0.071471,3T22,4.837533
3,Release de Resultados 1T21.pdf,-19,21,-37,-18.9972,5.5875,-0.131034,0.144828,-0.255172,-0.131015,0.038534,-0.207554,1T21,1.279
4,Release de Resultados 2T21.pdf,-23,50,-37,-24.5413,5.170833,-0.130682,0.284091,-0.210227,-0.139439,0.02938,-0.133116,2T21,1.591367
5,Release de Resultados 3T21.pdf,-8,47,-21,-17.3529,5.79375,-0.049383,0.290123,-0.12963,-0.107117,0.035764,-0.052556,3T21,2.652278
6,Release de Resultados 4T21.pdf,-14,47,-43,-16.0458,3.10875,-0.081871,0.274854,-0.251462,-0.093835,0.01818,-0.130426,4T21,
7,Release de Resultados 1T20.pdf,-9,18,-47,-19.291,5.347222,-0.054545,0.109091,-0.284848,-0.116915,0.032407,-0.088405,1T20,0.514
8,Release de Resultados 2T20.pdf,-4,20,-34,-24.5896,3.538889,-0.023121,0.115607,-0.196532,-0.142136,0.020456,-0.060775,2T20,
9,Release de Resultados 3T20.pdf,-4,58,-20,-17.672,12.063889,-0.020619,0.298969,-0.103093,-0.091093,0.062185,-0.01388,3T20,0.763633


In [27]:
corr_matrix = df3['Overall Sentiment Score'].corr(df3['Valor'])
corr_matrix

0.30341259199098597