In [105]:
pip install TextBlob

Collecting TextBlob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: TextBlob
Successfully installed TextBlob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [106]:
import pandas as pd
import re
import string
import unicodedata

#sentilex = open('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\SentiLex-flex-PT02.txt', 'r', encoding='utf8')

# Read Sentilex-lem-PT02
sentilex_lem = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\SentiLex-flex-PT02.txt', sep=',', header=None, names=['term', 'definition'])
sentilex_lem[['term2','definition_c']] = sentilex_lem['definition'].str.split('.', expand=True)
# Split second column into multiple columns
sentilex_lem[['PoS', 'FLEX', 'TG', 'POL', 'ANOT','vazio']] = sentilex_lem['definition_c'].str.split(';', expand=True)

# Remove REV if present
sentilex_lem['POL'] = sentilex_lem['POL'].apply(lambda x: re.sub('REV=[0-9]*:', '', x))

# Remove POL:N1 only if POL:N0 is not present in the vector
def clean_N0N1(x):
    if 'POL:N1' in x and 'POL:N0' not in x:
        x.remove('POL:N1')
    return x

sentilex_lem['POL'] = sentilex_lem['POL'].str.split(':').apply(clean_N0N1).apply(':'.join)


# Extract polarity from POL
sentilex_lem['polarity'] = sentilex_lem['POL'].str.extract(r'POL:N0=(\d+)')


# Extract target from TG
sentilex_lem['polarity_target'] = sentilex_lem['TG'].apply(lambda x: x.split(':')[1] if x.startswith('TG=HUM') else x)

# Extract polarity classification from ANOT
sentilex_lem['polarity_classification'] = sentilex_lem['ANOT'].str.extract(r'ANOT=(\w+)')

# Remove unnecessary columns
sentilex_lem.drop(['definition','definition_c', 'FLEX', 'TG', 'POL', 'ANOT','vazio'], axis=1, inplace=True)

# Fix encoding and remove non-ASCII strings
def remove_non_ascii(text):
    return ''.join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn')

sentilex_lem['term'] = sentilex_lem['term'].apply(lambda x: remove_non_ascii(x)).str.encode('ascii', 'ignore').str.decode('utf-8')

sentilex_lem['term2'] = sentilex_lem['term2'].apply(lambda x: remove_non_ascii(x)).str.encode('ascii', 'ignore').str.decode('utf-8')


sentilex_lem

import pandas as pd


# Melt the dataframe to create a new column for stacked values of A and B
melted_df = sentilex_lem.melt(id_vars=['PoS','polarity', 'polarity_target','polarity_classification'], value_vars=['term', 'term2'], var_name='term3')

# Create a new dataframe with the stacked values and replicated C and D columns
new_df = pd.DataFrame({
    'word': melted_df['value'],
    'PoS': melted_df['PoS'],
    'polarity': melted_df['polarity'],
    'polarity_target': melted_df['polarity_target'],
    'polarity_classification': melted_df['polarity_classification']
    
})

# Print the new dataframe
print(new_df)
senti_lex = new_df

             word      PoS polarity polarity_target polarity_classification
0       a-vontade    PoS=N        1              N0                     MAN
1         abafada  PoS=Adj      NaN              N0                    JALC
2        abafadas  PoS=Adj      NaN              N0                    JALC
3         abafado  PoS=Adj      NaN              N0                    JALC
4        abafados  PoS=Adj      NaN              N0                    JALC
...           ...      ...      ...             ...                     ...
164689       zote  PoS=Adj      NaN              N0                     MAN
164690   zumbidor  PoS=Adj      NaN              N0                     MAN
164691   zumbidor  PoS=Adj      NaN              N0                     MAN
164692   zumbidor  PoS=Adj      NaN              N0                     MAN
164693   zumbidor  PoS=Adj      NaN              N0                     MAN

[164694 rows x 5 columns]


In [108]:
import nltk
import PyPDF2
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from textblob import TextBlob

# Load Lexico v3.0 dictionary
lexico = pd.read_csv('C:\\Users\\PICHAU\\Documents\\ArthurAnzai\\lexiconPT\\data-raw\\lexico_v2.1txt', sep='\t', header=None)
lexico.columns = ['word', 'polarity', 'sentiment']

# Load SentiLex-flex-PT02 dictionary
#senti_lex = pd.read_csv('SentiLex-flex-PT02.txt', sep='\t', header=None)
#senti_lex.columns = ['word', 'PoS', 'polarity', 'polarity_target', 'polarity_classification']

# Load Vader sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Read in PDF file
pdf_file = open('Release de Resultados 1T22.pdf', 'rb')
pdf_reader = PyPDF2.PdfFileReader(pdf_file)

# Tokenize PDF text into sentences
text = ''
for i in range(pdf_reader.getNumPages()):
    page = pdf_reader.getPage(i)
    text += page.extractText()
sentences = sent_tokenize(text)

# Define sentiment score functions
def lexico_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in lexico['word'].values:
            score += lexico.loc[lexico['word'] == token, 'sentiment'].values[0]
    return score

def sentilex_score(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    score = 0
    for token in tokens:
        if token in senti_lex['word'].values:
            score += senti_lex.loc[senti_lex['word'] == token, 'polarity'].values[0]
    return score

# Compute sentiment scores for each sentence
lexico_scores = [lexico_score(sentence) for sentence in sentences]
sentilex_scores = [sentilex_score(sentence) for sentence in sentences]
vader_scores = [vader.polarity_scores(sentence)['compound'] for sentence in sentences]
textblob_scores = [TextBlob(sentence).sentiment.polarity for sentence in sentences]

# Compare the scores
for i in range(len(sentences)):
    print(f"Sentence {i}:")
    print(f"  Lexico: {lexico_scores[i]}")
    print(f"  SentiLex: {sentilex_scores[i]}")
    print(f"  Vader: {vader_scores[i]}")
    print(f"  TextBlob: {textblob_scores[i]}")

ValueError: Length mismatch: Expected axis has 1 elements, new values have 3 elements