In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV file
df = pd.read_csv('dataset.csv')

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,Unconditional love sustains me.,Affirmation,Subjective,Happiness
1,The ocean is vast and deep.,Affirmation,Factual,Neutral
2,I do not find his constant interruptions const...,Negation,Subjective,Neutral
3,Einstein did not invent the atomic bomb.,Negation,Factual,Neutral
4,I don’t feel comfortable in crowded places.,Negation,Subjective,Sadness


In [2]:
# Verify Class balance "Type" column
df['Type'].value_counts()  

Affirmation    1435
Negation       1014
Name: Type, dtype: int64

In [3]:
# Verify Class balance "Factual/Subjective" column
df['Factual/Subjective'].value_counts()  

Subjective    1457
Factual        992
Name: Factual/Subjective, dtype: int64

In [4]:
# Verify Class balance "Sentiment" column
df['Sentiment'].value_counts()  

Neutral      830
Anger        465
Sadness      442
Happiness    392
Euphoria     320
Name: Sentiment, dtype: int64

In [5]:
# Check for missing values
df.isnull().sum()

Sentence              0
Type                  0
Factual/Subjective    0
Sentiment             0
dtype: int64

In [6]:
# Verify Class balance "Type" column
print(df['Type'].value_counts())
print("-----------------------------")
# Verify Class balance "Factual/Subjective" column
print(df['Factual/Subjective'].value_counts())
print("-----------------------------")
# Verify Class balance "Sentiment" column
print(df['Sentiment'].value_counts())
print("-----------------------------")

Affirmation    1435
Negation       1014
Name: Type, dtype: int64
-----------------------------
Subjective    1457
Factual        992
Name: Factual/Subjective, dtype: int64
-----------------------------
Neutral      830
Anger        465
Sadness      442
Happiness    392
Euphoria     320
Name: Sentiment, dtype: int64
-----------------------------


In [None]:
# verify and print all repeated values in the column Sentence
print(df['Sentence'].value_counts())

I don't like being ignored.                                  5
I don't like being rushed.                                   5
I don't like being misunderstood.                            4
Gravity pulls objects toward the Earth.                      4
I don't think this is a good idea.                           4
                                                            ..
Earth has 8.7 million species.                               1
I didn’t finish my homework on time.                         1
I’m not interested in politics.                              1
Kangaroos can't walk backwards.                              1
The human impact on the environment is a serious concern.    1
Name: Sentence, Length: 2094, dtype: int64


In [12]:
# Drop duplicates
df = df.drop_duplicates(subset='Sentence')

In [13]:
df

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,I'm delighted by the thoughtful gesture.,Affirmation,Subjective,Happiness
1,Rude people are annoying.,Affirmation,Subjective,Anger
2,A cozy blanket makes me happy.,Affirmation,Subjective,Happiness
3,Unseen efforts discourage me.,Affirmation,Subjective,Sadness
4,She doesn’t trust strangers easily.,Negation,Subjective,Neutral
...,...,...,...,...
2440,He doesn’t share his food.,Negation,Subjective,Neutral
2442,The game didn’t load correctly.,Negation,Factual,Neutral
2444,That rude comment really ticked me off.,Affirmation,Subjective,Anger
2445,The feeling of watching the aurora borealis da...,Affirmation,Subjective,Euphoria


In [14]:
# Verify Class balance "Type" column
print(df['Type'].value_counts())
print("-----------------------------")
# Verify Class balance "Factual/Subjective" column
print(df['Factual/Subjective'].value_counts())
print("-----------------------------")
# Verify Class balance "Sentiment" column
print(df['Sentiment'].value_counts())
print("-----------------------------")

Affirmation    1271
Negation        823
Name: Type, dtype: int64
-----------------------------
Subjective    1253
Factual        841
Name: Factual/Subjective, dtype: int64
-----------------------------
Neutral      719
Anger        378
Sadness      362
Happiness    339
Euphoria     296
Name: Sentiment, dtype: int64
-----------------------------


In [16]:
# implement textblob library to calculate the polarity of each sentence
from textblob import TextBlob
df['polarity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['polarity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity
0,I'm delighted by the thoughtful gesture.,Affirmation,Subjective,Happiness,0.55
1,Rude people are annoying.,Affirmation,Subjective,Anger,-0.55
2,A cozy blanket makes me happy.,Affirmation,Subjective,Happiness,0.3
3,Unseen efforts discourage me.,Affirmation,Subjective,Sadness,0.0
4,She doesn’t trust strangers easily.,Negation,Subjective,Neutral,0.433333


In [17]:
# textblob library to calculate the subjectivity of each sentence
df['subjectivity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.subjectivity)


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I'm delighted by the thoughtful gesture.,Affirmation,Subjective,Happiness,0.55,0.6
1,Rude people are annoying.,Affirmation,Subjective,Anger,-0.55,0.75
2,A cozy blanket makes me happy.,Affirmation,Subjective,Happiness,0.3,0.875
3,Unseen efforts discourage me.,Affirmation,Subjective,Sadness,0.0,0.0
4,She doesn’t trust strangers easily.,Negation,Subjective,Neutral,0.433333,0.833333


In [18]:
df

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I'm delighted by the thoughtful gesture.,Affirmation,Subjective,Happiness,0.550000,0.600000
1,Rude people are annoying.,Affirmation,Subjective,Anger,-0.550000,0.750000
2,A cozy blanket makes me happy.,Affirmation,Subjective,Happiness,0.300000,0.875000
3,Unseen efforts discourage me.,Affirmation,Subjective,Sadness,0.000000,0.000000
4,She doesn’t trust strangers easily.,Negation,Subjective,Neutral,0.433333,0.833333
...,...,...,...,...,...,...
2440,He doesn’t share his food.,Negation,Subjective,Neutral,0.000000,0.000000
2442,The game didn’t load correctly.,Negation,Factual,Neutral,-0.400000,0.400000
2444,That rude comment really ticked me off.,Affirmation,Subjective,Anger,-0.050000,0.400000
2445,The feeling of watching the aurora borealis da...,Affirmation,Subjective,Euphoria,0.000000,0.000000


In [19]:
# implement textblob to make the polarity and subjectivity columns more readable
def sentiment(x):
    if x < 0:
        return 'Negative'
    elif x == 0:
        return 'Neutral'
    else:
        return 'Positive'
    
df['polarity'] = df['polarity'].apply(lambda x: sentiment(x))
df['subjectivity'] = df['subjectivity'].apply(lambda x: sentiment(x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['polarity'] = df['polarity'].apply(lambda x: sentiment(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = df['subjectivity'].apply(lambda x: sentiment(x))


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I'm delighted by the thoughtful gesture.,Affirmation,Subjective,Happiness,Positive,Positive
1,Rude people are annoying.,Affirmation,Subjective,Anger,Negative,Positive
2,A cozy blanket makes me happy.,Affirmation,Subjective,Happiness,Positive,Positive
3,Unseen efforts discourage me.,Affirmation,Subjective,Sadness,Neutral,Neutral
4,She doesn’t trust strangers easily.,Negation,Subjective,Neutral,Positive,Positive


In [9]:
import pandas as pd
from textblob import TextBlob, Word
import re
import nltk

# Verificar e baixar os pacotes necessários (apenas para garantir)
nltk.download('punkt')  # Para tokenização de palavras
nltk.download('wordnet')  # Para lematização

# Função para pré-processar o texto
def preprocess_text(sentence):
    # 1. Converter para minúsculas
    sentence = sentence.lower()
    
    # 2. Remover pontuação
    sentence = re.sub(r'[^\w\s]', '', sentence)
    
    # 3. Tokenização e lematização diretamente com NLTK (evitando problemas do TextBlob)
    tokens = nltk.word_tokenize(sentence)  # Usando NLTK diretamente
    lemmatized = [Word(token).lemmatize() for token in tokens]
    
    return lemmatized

# Aplicar o pré-processamento
df['Processed_Sentence'] = df['Sentence'].apply(preprocess_text)

# Mostrar o resultado
print("Dataset original com a nova coluna processada:")
print(df[['Sentence', 'Processed_Sentence']])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eel20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eel20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\eel20/nltk_data'
    - 'c:\\Users\\eel20\\anaconda3\\nltk_data'
    - 'c:\\Users\\eel20\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\eel20\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\eel20\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
