In [1]:
import nltk
import PyPDF2
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from textblob import TextBlob
import re

# Load Lexico v2.0 dictionary
lexico = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\lexico_v2.1txt', sep=',', header=None)
lexico.columns = ['word', 'polarity', 'sentiment']

# Load OPLexico v3.0 dictionary
oplexico = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\oplexicon_v3.0\\lexico_v3.0.txt', sep=',', header=None)
oplexico.columns = ['word', 'polarity', 'sentiment','classification']

# Load SentiLex-flex-PT02 dictionary

file = open('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\SentiLex-lem-PT02.txt', 'r', encoding='utf-8')

lines = []

for line in file:
    lines.append(line.strip('.'))  
file.close()
data = {'word':[],'PoS': [],  'polarity_target': [], 'polarity': [], 'polarity_classification': []}
for line in lines:
    columns = re.split(r'[.;]', line)
    data['word'].append(columns[0])
    data['PoS'].append(columns[1].split('=')[1])
    #data['FLEX'].append(columns[2].split('=')[1])
    data['polarity_target'].append(columns[2].split('=')[1])
    data['polarity'].append(columns[3].split('=')[1])
    data['polarity_classification'].append(columns[4].split('=')[1])
   
df = pd.DataFrame(data)
df['polarity'] = pd.to_numeric(df['polarity'])
senti_lex = df

# Load Vader sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Define sentiment score functions
def lexico_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in lexico['word'].values:
            score += lexico.loc[lexico['word'] == token, 'sentiment'].values[0]
    return score


def oplexicon_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in oplexico['word'].values:
            score += oplexico.loc[oplexico['word'] == token, 'sentiment'].values[0]
    return score


def sentilex_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in senti_lex['word'].values:
            score += senti_lex.loc[senti_lex['word'] == token, 'polarity'].values[0]
    return score

def sentiment_analysis_overall(pdf):
    
    pdf_file = open(pdf, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    # Tokenize PDF text into sentences
    text = ''
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        text += page.extractText()
    sentences = sent_tokenize(text)
    
    # Compute sentiment scores for each sentence
    results = []
    for i, sentence in enumerate(sentences):
        
        # First, try to use the oplexicon_score 
        score = oplexicon_score(sentence)
        source = 'oplexicon_score'

        # If the score is 0, try to use the lexico_score
        if score == 0:
            score =  lexico_score(sentence)
            source = 'lexico_score'

        # If the score is 0, try to use the sentilex_score
        if score == 0:
            score = sentilex_score(sentence)
            source = 'sentilex_score'

        # If the score is 0, use the Vader score
        if score == 0:
            score = vader.polarity_scores(sentence)['compound']
            source = 'vader_score'

        # If the score is still 0, use the TextBlob score
        if score == 0:
            score = TextBlob(sentence).sentiment.polarity
            source = 'textblob_score'

        result = {
            'sentence': i,
            'score': score,
            'source': source,
            'text': sentence,
        }

        results.append(result)

    # Convert results to a pandas DataFrame
    results_df = pd.DataFrame(results)
    results_df

    # Compute overall sentiment score
    overall_score = results_df['score'].mean()
    overall_score
    #print(f"Overall Sentiment Score: {overall_score}")
    return overall_score

def sentiment_analysis_sentence(pdf):
    # Read in PDF file
    pdf_file = open(pdf, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)

    # Tokenize PDF text into sentences

    text = ''
    for i in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(i)
        text += page.extractText()
    sentences = sent_tokenize(text)

    # Compute sentiment scores for each sentence
    lexico_scores = [lexico_score(sentence) for sentence in sentences]
    sentilex_scores = [sentilex_score(sentence) for sentence in sentences]
    oplexicon_scores = [oplexicon_score(sentence) for sentence in sentences]
    vader_scores = [vader.polarity_scores(sentence)['compound'] for sentence in sentences]
    textblob_scores = [TextBlob(sentence).sentiment.polarity for sentence in sentences]


    results = []
    # Compare the scores
    for i in range(len(sentences)):
        result = {
                'sentence': i,
                'vader_score': vader_scores[i],
                'lexico_score': lexico_scores[i],
                'sentilex_score': sentilex_scores[i],
                'oplexicon_score': oplexicon_scores[i],
                'textblob_score': textblob_scores[i]
            }
        results.append(result)

    # Compute total sentiment scores for the document
    lexico_total = sum(lexico_scores)
    sentilex_total = sum(sentilex_scores)
    oplexicon_total = sum(oplexicon_scores)
    vader_total = sum(vader_scores)
    textblob_total = sum(textblob_scores)

    # Compute average sentiment scores for the document
    num_sentences = len(sentences)
    lexico_avg = lexico_total / num_sentences
    sentilex_avg = sentilex_total / num_sentences
    oplexicon_avg = oplexicon_total / num_sentences
    vader_avg = vader_total / num_sentences
    textblob_avg = textblob_total / num_sentences
    
    overall_score = sentiment_analysis_overall(pdf)
    
    # Print the final results
    #print("Final Results:")
    #print(f"Lexico Avg Score: {lexico_avg}")
    #print(f"Sentilex Avg Score: {sentilex_avg}")
    #print(f"OPLexicon Avg Score: {oplexicon_avg}")
    #print(f"Vader Avg Score: {vader_avg}")
    #print(f"TextBlob Avg Score: {textblob_avg}")
    #print(f"Overall Sentiment Score: {overall_score}")

   # Create a dictionary with the total scores
    results = {
        'file_name': pdf,
        'lexico_total': lexico_total,
        'sentilex_total': sentilex_total,
        'oplexicon_total': oplexicon_total,
        'vader_total': vader_total,
        'textblob_total': textblob_total,
        'Lexico Avg Score': lexico_avg,
        'Sentilex Avg Score': sentilex_avg,
        'OPLexicon Avg Score': oplexicon_avg,
        'Vader Avg Score': vader_avg,
        'TextBlob Avg Score': textblob_avg,
        'Overall Sentiment Score': overall_score
    }

    # Create a DataFrame with the results dictionary
    df = pd.DataFrame([results])

    # Print the final results
    #print("Final Results:")
    #print(df)

    return results

In [2]:
lista_UNIPAR = [ 'UNIPAR_Release de Resultados 1T22.pdf'
,'UNIPAR_Release de Resultados 2T22.pdf'
,'UNIPAR_Release de Resultados 3T22.pdf'
,'UNIPAR_Release de Resultados 1T21.pdf'
,'UNIPAR_Release de Resultados 2T21.pdf'
,'UNIPAR_Release de Resultados 3T21.pdf'
,'UNIPAR_Release de Resultados 4T21.pdf'
,'UNIPAR_Release de Resultados 1T20.pdf'
,'UNIPAR_Release de Resultados 2T20.pdf'
,'UNIPAR_Release de Resultados 3T20.pdf'
,'UNIPAR_Release de Resultados 4T20.pdf'
,'UNIPAR_Release de Resultados 1T19.pdf'
,'UNIPAR_Release de Resultados 2T19.pdf'
,'UNIPAR_Release de Resultados 3T19.pdf'
,'UNIPAR_Release de Resultados 4T19.pdf']

lista_copel = ['COPEL_Release de Resultados 1T19.pdf'
,'COPEL_Release de Resultados 2T19.pdf'
,'COPEL_Release de Resultados 3T19.pdf'
,'COPEL_Release de Resultados 4T19.pdf'
,'COPEL_Release de Resultados 1T20.pdf'
,'COPEL_Release de Resultados 2T20.pdf'
,'COPEL_Release de Resultados 3T20.pdf'
,'COPEL_Release de Resultados 4T20.pdf'
,'COPEL_Release de Resultados 1T21.pdf'
,'COPEL_Release de Resultados 2T21.pdf'
,'COPEL_Release de Resultados 3T21.pdf'
,'COPEL_Release de Resultados 4T21.pdf'
,'COPEL_Release de Resultados 1T22.pdf'
,'COPEL_Release de Resultados 2T22.pdf'
,'COPEL_Release de Resultados 3T22.pdf' ]


lista_VIVO = ['VIVO_Release de Resultados 1T19.pdf'
,'VIVO_Release de Resultados 2T19.pdf'
,'VIVO_Release de Resultados 3T19.pdf'
,'VIVO_Release de Resultados 4T19.pdf'
,'VIVO_Release de Resultados 1T20.pdf'
,'VIVO_Release de Resultados 2T20.pdf'
,'VIVO_Release de Resultados 3T20.pdf'
,'VIVO_Release de Resultados 4T20.pdf'
,'VIVO_Release de Resultados 1T21.pdf'
,'VIVO_Release de Resultados 2T21.pdf'
,'VIVO_Release de Resultados 3T21.pdf'
,'VIVO_Release de Resultados 4T21.pdf'
,'VIVO_Release de Resultados 1T22.pdf'
,'VIVO_Release de Resultados 2T22.pdf'
,'VIVO_Release de Resultados 3T22.pdf' ]


lista_TAESA = ['TAESA_Release de Resultados 1T19.pdf'
,'TAESA_Release de Resultados 2T19.pdf'
,'TAESA_Release de Resultados 3T19.pdf'
,'TAESA_Release de Resultados 4T19.pdf'
,'TAESA_Release de Resultados 1T20.pdf'
,'TAESA_Release de Resultados 2T20.pdf'
,'TAESA_Release de Resultados 3T20.pdf'
,'TAESA_Release de Resultados 4T20.pdf'
,'TAESA_Release de Resultados 1T21.pdf'
,'TAESA_Release de Resultados 2T21.pdf'
,'TAESA_Release de Resultados 3T21.pdf'
,'TAESA_Release de Resultados 4T21.pdf'
,'TAESA_Release de Resultados 1T22.pdf'
,'TAESA_Release de Resultados 2T22.pdf'
,'TAESA_Release de Resultados 3T22.pdf' ]


lista_ROMI = ['ROMI_Release de Resultados 1T19.pdf'
,'ROMI_Release de Resultados 2T19.pdf'
,'ROMI_Release de Resultados 3T19.pdf'
,'ROMI_Release de Resultados 4T19.pdf'
,'ROMI_Release de Resultados 1T20.pdf'
,'ROMI_Release de Resultados 2T20.pdf'
,'ROMI_Release de Resultados 3T20.pdf'
,'ROMI_Release de Resultados 4T20.pdf'
,'ROMI_Release de Resultados 1T21.pdf'
,'ROMI_Release de Resultados 2T21.pdf'
,'ROMI_Release de Resultados 3T21.pdf'
,'ROMI_Release de Resultados 4T21.pdf'
,'ROMI_Release de Resultados 1T22.pdf'
,'ROMI_Release de Resultados 2T22.pdf'
,'ROMI_Release de Resultados 3T22.pdf' ]



def run_sentiment_analysis(lista):
    total_lista = []
    for i in range(len(lista)):
        result = sentiment_analysis_sentence(lista[i])
        total_lista.append(result)
    return total_lista
    

In [3]:
a = run_sentiment_analysis(lista_UNIPAR)

In [4]:
b = run_sentiment_analysis(lista_copel)

XRef object at 1217761 can not be read, some object may be missing
XRef object at 1217761 can not be read, some object may be missing
XRef object at 1204462 can not be read, some object may be missing
XRef object at 1204462 can not be read, some object may be missing


In [5]:
c = run_sentiment_analysis(lista_VIVO)

Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'107' b'0'
Superfluous whitespace found in object header b'127' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'155' b'0'
Superfluous whitespace found in object header b'158' b'0'
Superfluous whitespace found in object header b'161' b'0'
Superfluous whitespace found in object header b'172' b'0'
Superfluous whitespace found in object header b'178' b'0'
Superfluous whitespace found in object header b'186'

Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'107' b'0'
Superfluous whitespace found in object header b'127' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'155' b'0'
Superfluous whitespace found in object header b'158' b'0'
Superfluous whitespace found in object header b'161' b'0'
Superfluous whitespace found in object header b'172' b'0'
Superfluous whitespace found in object header b'178' b'0'
Superfluous whitespace found in object header b'186' b'0'
Superfluous whitespace found in object header b'189' b'0'
Superfluous whitespace found in object header b'192' b'0'
Superfluous whitespa

Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.


In [6]:
d = run_sentiment_analysis(lista_TAESA)

In [7]:
e = run_sentiment_analysis(lista_ROMI)

In [8]:
df_save = pd.DataFrame(a)
df_save.to_csv('UNIPAR_Result.csv')

df_saveb = pd.DataFrame(b)
df_saveb.to_csv('COPEL_Result.csv')

df_savec = pd.DataFrame(c)
df_savec.to_csv('VIVO_Result.csv')

df_saved = pd.DataFrame(d)
df_saved.to_csv('TAESA_Result.csv')

df_savee = pd.DataFrame(e)
df_savee.to_csv('ROMI_Result.csv')

AttributeError: 'list' object has no attribute 'tocsv'

In [9]:
lista_final =a + b + c + d + e


In [13]:
data_frame = pd.DataFrame(lista_final)
data_frame

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score
0,UNIPAR_Release de Resultados 1T22.pdf,-37,18,-47,-12.9875,3.189286,-0.284615,0.138462,-0.361538,-0.099904,0.024533,-0.395734
1,UNIPAR_Release de Resultados 2T22.pdf,-5,34,-26,-19.2926,7.208333,-0.028571,0.194286,-0.148571,-0.110243,0.041190,-0.154222
2,UNIPAR_Release de Resultados 3T22.pdf,-16,37,-20,-16.0311,7.332500,-0.133333,0.308333,-0.166667,-0.133592,0.061104,-0.104804
3,UNIPAR_Release de Resultados 1T21.pdf,-19,21,-37,-18.9972,5.587500,-0.131034,0.144828,-0.255172,-0.131015,0.038534,-0.255830
4,UNIPAR_Release de Resultados 2T21.pdf,-23,50,-37,-24.5413,5.170833,-0.130682,0.284091,-0.210227,-0.139439,0.029380,-0.172889
...,...,...,...,...,...,...,...,...,...,...,...,...
70,ROMI_Release de Resultados 3T21.pdf,-16,25,-44,-15.3265,1.516667,-0.130081,0.203252,-0.357724,-0.124606,0.012331,-0.313756
71,ROMI_Release de Resultados 4T21.pdf,-33,19,-61,-15.7257,-0.350556,-0.257812,0.148438,-0.476562,-0.122857,-0.002739,-0.402187
72,ROMI_Release de Resultados 1T22.pdf,3,21,-16,-11.3791,1.069872,0.029126,0.203883,-0.155340,-0.110477,0.010387,-0.151379
73,ROMI_Release de Resultados 2T22.pdf,-11,17,-34,-13.9168,1.005000,-0.100917,0.155963,-0.311927,-0.127677,0.009220,-0.232073


In [37]:
#data_frame = pd.DataFrame(c)
data_frame['Fiscal_Quarter'] = data_frame['file_name'].str.extract('(\d[Q|T]\d{2})')
data_frame['Empresa'] = data_frame['file_name'].str.extract('^([^_]*)')
data_frame

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score,Fiscal_Quarter,Empresa
0,UNIPAR_Release de Resultados 1T22.pdf,-37,18,-47,-12.99,3.19,-0.28,0.14,-0.362,-0.0999,0.0245,-0.396,1T22,UNIPAR
1,UNIPAR_Release de Resultados 2T22.pdf,-5,34,-26,-19.29,7.21,-0.03,0.19,-0.149,-0.11,0.0412,-0.154,2T22,UNIPAR
2,UNIPAR_Release de Resultados 3T22.pdf,-16,37,-20,-16.03,7.33,-0.13,0.31,-0.167,-0.134,0.0611,-0.105,3T22,UNIPAR
3,UNIPAR_Release de Resultados 1T21.pdf,-19,21,-37,-19.0,5.59,-0.13,0.14,-0.255,-0.131,0.0385,-0.256,1T21,UNIPAR
4,UNIPAR_Release de Resultados 2T21.pdf,-23,50,-37,-24.54,5.17,-0.13,0.28,-0.21,-0.139,0.0294,-0.173,2T21,UNIPAR
5,UNIPAR_Release de Resultados 3T21.pdf,-8,47,-21,-17.35,5.79,-0.05,0.29,-0.13,-0.107,0.0358,-0.114,3T21,UNIPAR
6,UNIPAR_Release de Resultados 4T21.pdf,-14,47,-43,-16.05,3.11,-0.08,0.27,-0.251,-0.0938,0.0182,-0.206,4T21,UNIPAR
7,UNIPAR_Release de Resultados 1T20.pdf,-9,18,-47,-19.29,5.35,-0.05,0.11,-0.285,-0.117,0.0324,-0.234,1T20,UNIPAR
8,UNIPAR_Release de Resultados 2T20.pdf,-4,20,-34,-24.59,3.54,-0.02,0.12,-0.197,-0.142,0.0205,-0.171,2T20,UNIPAR
9,UNIPAR_Release de Resultados 3T20.pdf,-4,58,-20,-17.67,12.06,-0.02,0.3,-0.103,-0.0911,0.0622,-0.086,3T20,UNIPAR


In [36]:
dividendo = pd.read_excel('DIVIDENDOS_UNIPAR_FUNDAMENTOS.xlsx')
dividendo_ON = dividendo #dividendo.loc[dividendo['Tipo/Classe'] == 'ON']

# convert the date column to datetime format
dividendo_ON['Data'] = pd.to_datetime(dividendo_ON['Data'])

# extract the quarter and year from the date column and create a new column for the fiscal quarter and year
dividendo_ON['Quarter'] = dividendo_ON['Data'].dt.quarter-1
dividendo_ON['Year'] = dividendo_ON['Data'].dt.year
dividendo_ON['Fiscal_Quarter'] = dividendo_ON['Quarter'].astype(str) + 'T' + dividendo_ON['Year'].astype(str).str[-2:]



dividendo_ON_G = dividendo_ON.groupby(['Empresa','Fiscal_Quarter'])['Valor'].mean()
dividendo_ON_G
# exibir o resultado
#print(dividendo_ON_G)

Empresa  Fiscal_Quarter
COPEL    0T21              0.19
         1T18              0.09
         1T19              0.36
         1T21              0.10
         1T22              1.01
         2T20              2.39
         2T21              0.53
         3T18              1.65
         3T19              2.89
         3T20              3.01
         3T21              0.21
         3T22              0.36
ROMI     0T18              0.06
         0T19              0.47
         0T20              0.50
         0T21              0.15
         0T22              0.14
         0T23              0.14
         1T18              0.43
         1T20              0.10
         1T21              0.12
         1T22              0.20
         2T18              0.25
         2T19              0.40
         2T20              0.80
         2T21              0.18
         2T22              0.21
         3T19              0.25
         3T20              1.00
         3T21              0.15
         3T22   

In [16]:
dividendo_ON

Unnamed: 0,Empresa,Ação,Data,Valor,Tipo,Data Pagamento,qtd Ação,Quarter,Year,Fiscal_Quarter
0,UNIPAR,UNIP5,2022-11-16,4.9887,DIVIDENDO,2022-11-29 00:00:00,1,3,2022,3T22
1,UNIPAR,UNIP5,2022-08-16,4.9841,DIVIDENDO,2022-08-26 00:00:00,1,2,2022,2T22
2,UNIPAR,UNIP5,2022-07-26,1.2460,DIVIDENDO,2022-08-05 00:00:00,1,2,2022,2T22
3,UNIPAR,UNIP5,2022-04-20,2.7352,DIVIDENDO,2022-05-04 00:00:00,1,1,2022,1T22
4,UNIPAR,UNIP5,2021-12-21,4.5898,DIVIDENDO,2021-12-30 00:00:00,1,3,2021,3T21
...,...,...,...,...,...,...,...,...,...,...
237,ROMI,ROMI3,2019-09-16,0.4000,JRS CAP PROPRIO,2019-11-29 00:00:00,1,2,2019,2T19
238,ROMI,ROMI3,2019-03-29,0.4700,JRS CAP PROPRIO,2020-03-31 00:00:00,1,0,2019,0T19
239,ROMI,ROMI3,2018-09-17,0.2500,JRS CAP PROPRIO,2018-11-30 00:00:00,1,2,2018,2T18
240,ROMI,ROMI3,2018-04-23,0.4300,JRS CAP PROPRIO,2019-03-29 00:00:00,1,1,2018,1T18


In [35]:
df3 = pd.merge(data_frame, dividendo_ON_G, on=['Fiscal_Quarter','Empresa'], how='left')
df3

Unnamed: 0,file_name,lexico_total,sentilex_total,oplexicon_total,vader_total,textblob_total,Lexico Avg Score,Sentilex Avg Score,OPLexicon Avg Score,Vader Avg Score,TextBlob Avg Score,Overall Sentiment Score,Fiscal_Quarter,Empresa,Valor
0,UNIPAR_Release de Resultados 1T22.pdf,-37,18,-47,-12.99,3.19,-0.28,0.14,-0.362,-0.0999,0.0245,-0.396,1T22,UNIPAR,2.65
1,UNIPAR_Release de Resultados 2T22.pdf,-5,34,-26,-19.29,7.21,-0.03,0.19,-0.149,-0.11,0.0412,-0.154,2T22,UNIPAR,3.02
2,UNIPAR_Release de Resultados 3T22.pdf,-16,37,-20,-16.03,7.33,-0.13,0.31,-0.167,-0.134,0.0611,-0.105,3T22,UNIPAR,4.84
3,UNIPAR_Release de Resultados 1T21.pdf,-19,21,-37,-19.0,5.59,-0.13,0.14,-0.255,-0.131,0.0385,-0.256,1T21,UNIPAR,1.28
4,UNIPAR_Release de Resultados 2T21.pdf,-23,50,-37,-24.54,5.17,-0.13,0.28,-0.21,-0.139,0.0294,-0.173,2T21,UNIPAR,1.59
5,UNIPAR_Release de Resultados 3T21.pdf,-8,47,-21,-17.35,5.79,-0.05,0.29,-0.13,-0.107,0.0358,-0.114,3T21,UNIPAR,2.65
6,UNIPAR_Release de Resultados 4T21.pdf,-14,47,-43,-16.05,3.11,-0.08,0.27,-0.251,-0.0938,0.0182,-0.206,4T21,UNIPAR,
7,UNIPAR_Release de Resultados 1T20.pdf,-9,18,-47,-19.29,5.35,-0.05,0.11,-0.285,-0.117,0.0324,-0.234,1T20,UNIPAR,0.51
8,UNIPAR_Release de Resultados 2T20.pdf,-4,20,-34,-24.59,3.54,-0.02,0.12,-0.197,-0.142,0.0205,-0.171,2T20,UNIPAR,
9,UNIPAR_Release de Resultados 3T20.pdf,-4,58,-20,-17.67,12.06,-0.02,0.3,-0.103,-0.0911,0.0622,-0.086,3T20,UNIPAR,0.76


In [18]:
corr_matrix = df3['Overall Sentiment Score'].corr(df3['Valor'])
corr_matrix

-0.11422046970822315

In [20]:
grouped = df3.groupby('Empresa')


In [21]:
corr_matrix_lexico = grouped['Lexico Avg Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix_lexico

  corr_matrix_lexico = grouped['Lexico Avg Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                  
COPEL    Lexico Avg Score    0.149495
ROMI     Lexico Avg Score   -0.156459
TAESA    Lexico Avg Score    0.351679
UNIPAR   Lexico Avg Score    0.179899
VIVO     Lexico Avg Score   -0.366529
Name: Valor, dtype: float64

In [22]:
corr_matrix_sentilex = grouped['Sentilex Avg Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix_sentilex

  corr_matrix_sentilex = grouped['Sentilex Avg Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                    
COPEL    Sentilex Avg Score   -0.203359
ROMI     Sentilex Avg Score    0.084867
TAESA    Sentilex Avg Score   -0.072895
UNIPAR   Sentilex Avg Score    0.460616
VIVO     Sentilex Avg Score   -0.232468
Name: Valor, dtype: float64

In [23]:
corr_matrix_OP = grouped['OPLexicon Avg Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix_OP

  corr_matrix_OP = grouped['OPLexicon Avg Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                     
COPEL    OPLexicon Avg Score    0.113387
ROMI     OPLexicon Avg Score   -0.193989
TAESA    OPLexicon Avg Score    0.442395
UNIPAR   OPLexicon Avg Score    0.435667
VIVO     OPLexicon Avg Score   -0.326671
Name: Valor, dtype: float64

In [24]:
corr_matrix_Vader = grouped['Vader Avg Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix_Vader

  corr_matrix_Vader = grouped['Vader Avg Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                 
COPEL    Vader Avg Score    0.241924
ROMI     Vader Avg Score    0.469031
TAESA    Vader Avg Score    0.419914
UNIPAR   Vader Avg Score    0.168052
VIVO     Vader Avg Score   -0.104726
Name: Valor, dtype: float64

In [25]:
corr_matrix_textblob = grouped['TextBlob Avg Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix_textblob

  corr_matrix_textblob = grouped['TextBlob Avg Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                    
COPEL    TextBlob Avg Score    0.127623
ROMI     TextBlob Avg Score    0.305833
TAESA    TextBlob Avg Score    0.682222
UNIPAR   TextBlob Avg Score    0.485014
VIVO     TextBlob Avg Score    0.157284
Name: Valor, dtype: float64

In [26]:
grouped = df3.groupby('Empresa')
corr_matrix = grouped['Overall Sentiment Score', 'Valor'].corr().iloc[0::2,-1]
corr_matrix

  corr_matrix = grouped['Overall Sentiment Score', 'Valor'].corr().iloc[0::2,-1]


Empresa                         
COPEL    Overall Sentiment Score   -0.024662
ROMI     Overall Sentiment Score   -0.135279
TAESA    Overall Sentiment Score    0.451380
UNIPAR   Overall Sentiment Score    0.426703
VIVO     Overall Sentiment Score   -0.285674
Name: Valor, dtype: float64

In [30]:
lista_total = lista_UNIPAR + lista_copel + lista_VIVO + lista_TAESA + lista_ROMI

for pdf in lista_total:
    pdf_file = open(pdf, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    print(pdf, pdf_reader.getNumPages())

UNIPAR_Release de Resultados 1T22.pdf 14
UNIPAR_Release de Resultados 2T22.pdf 15
UNIPAR_Release de Resultados 3T22.pdf 16
UNIPAR_Release de Resultados 1T21.pdf 15
UNIPAR_Release de Resultados 2T21.pdf 15
UNIPAR_Release de Resultados 3T21.pdf 15
UNIPAR_Release de Resultados 4T21.pdf 15
UNIPAR_Release de Resultados 1T20.pdf 16


XRef object at 1217761 can not be read, some object may be missing


UNIPAR_Release de Resultados 2T20.pdf 16
UNIPAR_Release de Resultados 3T20.pdf 16
UNIPAR_Release de Resultados 4T20.pdf 15
UNIPAR_Release de Resultados 1T19.pdf 19
UNIPAR_Release de Resultados 2T19.pdf 19
UNIPAR_Release de Resultados 3T19.pdf 19
UNIPAR_Release de Resultados 4T19.pdf 20


XRef object at 1204462 can not be read, some object may be missing


COPEL_Release de Resultados 1T19.pdf 68
COPEL_Release de Resultados 2T19.pdf 70
COPEL_Release de Resultados 3T19.pdf 74
COPEL_Release de Resultados 4T19.pdf 20
COPEL_Release de Resultados 1T20.pdf 66
COPEL_Release de Resultados 2T20.pdf 68
COPEL_Release de Resultados 3T20.pdf 65
COPEL_Release de Resultados 4T20.pdf 69
COPEL_Release de Resultados 1T21.pdf 67
COPEL_Release de Resultados 2T21.pdf 74
COPEL_Release de Resultados 3T21.pdf 71
COPEL_Release de Resultados 4T21.pdf 58


Xref table not zero-indexed. ID numbers for objects will be corrected.
Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'107' b'0'
Superfluous whitespace found in object header b'127' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'155' b'0'
Superfluous whitespace found in object header b'158' b'0'
Superfluous whitespace found in object header b'161' b'0'
Superfluous whitespace found in object header b'172' b'0'
Superfluous whitespace found in object header b'178' b'0'
Superfluous whitespace found in object header b'186' b'0'
Superfluous whitespace found in object header b'189' b'0'
Superfl

COPEL_Release de Resultados 1T22.pdf 53
COPEL_Release de Resultados 2T22.pdf 59
COPEL_Release de Resultados 3T22.pdf 58
VIVO_Release de Resultados 1T19.pdf 16
VIVO_Release de Resultados 2T19.pdf 16
VIVO_Release de Resultados 3T19.pdf 16
VIVO_Release de Resultados 4T19.pdf 16
VIVO_Release de Resultados 1T20.pdf 16
VIVO_Release de Resultados 2T20.pdf 16
VIVO_Release de Resultados 3T20.pdf 19
VIVO_Release de Resultados 4T20.pdf 19
VIVO_Release de Resultados 1T21.pdf 21
VIVO_Release de Resultados 2T21.pdf 19
VIVO_Release de Resultados 3T21.pdf 19
VIVO_Release de Resultados 4T21.pdf 20
VIVO_Release de Resultados 1T22.pdf 20
VIVO_Release de Resultados 2T22.pdf 19
VIVO_Release de Resultados 3T22.pdf 19
TAESA_Release de Resultados 1T19.pdf 41
TAESA_Release de Resultados 2T19.pdf 45
TAESA_Release de Resultados 3T19.pdf 45
TAESA_Release de Resultados 4T19.pdf 57
TAESA_Release de Resultados 1T20.pdf 46
TAESA_Release de Resultados 2T20.pdf 54
TAESA_Release de Resultados 3T20.pdf 51
TAESA_Release d

In [41]:
import pandas as pd
from tabulate import tabulate

print(tabulate(df3
               , headers='keys', tablefmt='github'))

|    | file_name                             |   lexico_total |   sentilex_total |   oplexicon_total |   vader_total |   textblob_total |   Lexico Avg Score |   Sentilex Avg Score |   OPLexicon Avg Score |   Vader Avg Score |   TextBlob Avg Score |   Overall Sentiment Score | Fiscal_Quarter   | Empresa   |       Valor |
|----|---------------------------------------|----------------|------------------|-------------------|---------------|------------------|--------------------|----------------------|-----------------------|-------------------|----------------------|---------------------------|------------------|-----------|-------------|
|  0 | UNIPAR_Release de Resultados 1T22.pdf |            -37 |               18 |               -47 |      -12.9875 |        3.18929   |         -0.284615  |            0.138462  |           -0.361538   |       -0.0999038  |          0.024533    |               -0.395734   | 1T22             | UNIPAR    |   2.6523    |
|  1 | UNIPAR_Release de Resultado