<a href="https://www.kaggle.com/code/adedapoadeniran/data-preprocessing-for-sentiment-analysis?scriptVersionId=197869200" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# Read the training data into a DataFrame
df = pd.read_csv('/kaggle/input/satrain/train.csv')  # Modify path if necessary

# Words that would normally be discarded as stop words but are kept because
# they carry sentiment (negations such as "not" / "never")
sentiment_stopwords = {'not', 'never', 'dont', 'cant', 'wont', 'no'}

# English stop words minus the sentiment-bearing ones listed above
stop_words = {
    word
    for word in stopwords.words('english')
    if word not in sentiment_stopwords
}

# Function to clean text data
def clean_text(text):
    """Normalize one raw text entry for sentiment analysis.

    Every character other than an ASCII letter or whitespace is deleted
    (so "don't" becomes "dont"), the result is lower-cased, and each
    whitespace-separated token found in the module-level ``stop_words``
    set is dropped.

    Args:
        text: Raw text. Non-string values (e.g. ``None`` or the float NaN
            that stands in for a missing cell) are treated as empty text.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        is left or the input was not a string.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them
    # and abort the whole column-wide apply.
    if not isinstance(text, str):
        return ''

    # Remove non-alphabetical characters
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert text to lowercase
    lowered = letters_only.lower()

    # Tokenize and remove stop words except those kept for sentiment
    words = [word for word in lowered.split() if word not in stop_words]

    return ' '.join(words)

# Clean every entry of the 'text' column
df['cleaned_text'] = df['text'].apply(clean_text)

# Tally how often each word occurs across all cleaned entries
word_counts = Counter()
for cleaned_entry in df['cleaned_text']:
    word_counts.update(cleaned_entry.split())

# Keep the words that appear more than 3 times (i.e. at least 4 times)
common_words = set()
for token, occurrences in word_counts.items():
    if occurrences >= 4:
        common_words.add(token)

# Function to remove infrequent words from text
def remove_infrequent_words(text):
    """Return ``text`` with every token not in ``common_words`` removed.

    The text is split on whitespace; tokens present in the module-level
    ``common_words`` set are kept in their original order and re-joined
    with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in common_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Apply the function to remove infrequent words.
# An entry whose words are all infrequent becomes an empty string here.
df['final_text'] = df['cleaned_text'].apply(remove_infrequent_words)

# Save removed words (those seen 3 times or fewer) to a CSV file
removed_words = {word for word, count in word_counts.items() if count <= 3}
# Sort before building the frame: iterating a set of strings has no stable
# order between interpreter runs, which would reshuffle the CSV on every run.
removed_words_df = pd.DataFrame(sorted(removed_words), columns=['removed_word'])
removed_words_df.to_csv('removedwords.csv', index=False)

# Save the final cleaned dataset (text plus label column) to a new CSV file
df[['final_text', 'class']].to_csv('cleaned_train.csv', index=False)

print("Preprocessing completed. Files saved: 'cleaned_train.csv' and 'removedwords.csv'.")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Preprocessing completed. Files saved: 'cleaned_train.csv' and 'removedwords.csv'.
