In [9]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download every NLTK resource this script actually uses, not just the
# tokenizer model: 'punkt_tab' backs word_tokenize, 'stopwords' backs
# stopwords.words("english") below, and 'wordnet' backs WordNetLemmatizer.
# Without the last two a fresh environment fails with LookupError.
# nltk.download() reports packages that are already up-to-date instead of
# fetching them again.
# NOTE(review): older NLTK releases may additionally need 'punkt' and/or
# 'omw-1.4' -- confirm against the installed version.
for resource in ("punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# Load dataset
file_path = "dataset.csv"  # Change this to your actual dataset path
df = pd.read_csv(file_path)

# Initialize tools shared by every preprocess_text() call
lemmatizer = WordNetLemmatizer()
# A set gives O(1) membership tests when filtering tokens
stop_words = set(stopwords.words("english"))

# Preprocessing function
def preprocess_text(sentence):
    """Normalize one sentence into a space-joined string of lemmatized tokens.

    The text is lowercased, stripped of ASCII punctuation
    (``string.punctuation``), tokenized with ``word_tokenize``, filtered
    against ``stop_words`` and lemmatized with ``lemmatizer``.

    Args:
        sentence: Raw text. Any non-string value (e.g. ``None`` or a float
            NaN standing in for a missing cell) is treated as empty text.

    Returns:
        The processed sentence as a single string, or ``""`` when the input
        is not a string or nothing is left after punctuation removal.
    """
    # Missing values are not strings; calling .lower() on them would raise
    # AttributeError and abort the whole DataFrame.apply().
    if not isinstance(sentence, str):
        return ""

    # Punctuation is removed *before* tokenizing, so contractions collapse
    # into a single token (e.g. "don't" becomes "dont").
    cleaned = sentence.lower().translate(str.maketrans('', '', string.punctuation))
    if not cleaned.strip():
        # Nothing left to tokenize; the result would be empty anyway.
        return ""

    tokens = word_tokenize(cleaned)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # NOTE: TextBlob polarity/subjectivity used to be computed here on every
    # call but were never returned; that dead work has been removed.
    return " ".join(kept)

# Run the preprocessing over the dataset
# assign() returns a new DataFrame, so the original df is left untouched;
# only the 'Sentence' column is replaced by its cleaned version.
df_processed = df.assign(Sentence=df['Sentence'].map(preprocess_text))

# Write the result to disk without emitting the index as an extra column
df_processed.to_csv("processed_dataset.csv", index=False)

print("Preprocessing complete. Processed dataset saved as 'processed_dataset.csv'")
# Notebook display of the first rows for a quick visual check
df_processed.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessing complete. Processed dataset saved as 'processed_dataset.csv'


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,sky blue,Affirmation,Factual,Neutral
1,love sunny day,Affirmation,Subjective,Happiness
2,pizza disgusting,Affirmation,Subjective,Anger
3,water boil 100 degree celsius,Affirmation,Factual,Neutral
4,dont think good idea,Negation,Subjective,Sadness
