# LOAD DATA

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load datasets
# Read the five raw CSV files in a fixed order and bind each to its own name
(
    sentiment140_data,
    trustpilot_reviews_data,
    twitter_data,
    reviews_data,
    ratings_beauty_data,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'new_train_data_s140.csv',
        'trust_pilot_reviews_data_2022_06.csv',
        'Twitter Scraping Tweets Dataset.csv',
        'Reviews.csv',
        'ratings_beauty.csv',
    )
]

# CLEAN TEXT DATA

In [2]:
# Function to clean text data
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _CLEAN_TEXT_RESOURCES:
        # stopwords.words() re-reads the corpus on every call and returns a
        # list, so calling it per token is both I/O-heavy and a linear scan.
        # Load it once into a frozenset for constant-time membership tests.
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise a piece of raw text for sentiment modelling.

    Steps: strip every character that is not an ASCII letter or whitespace,
    lower-case, tokenize, drop English stopwords, lemmatize the remaining
    tokens and join them with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None`` and float NaN (how pandas represents a missing
        cell) yield an empty string; any other non-string is converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned, space-joined tokens (possibly empty).
    """
    # Missing cells reach .apply() as None or float NaN; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Remove stopwords, then lemmatize what is left
    stop_words, lemmatizer = _get_clean_text_resources()
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(kept)

# Update Polarity mapping for 1-5 rating scale
def map_polarity(rating):
    """Map a rating to a polarity score: 1 above 3, 0.5 at exactly 3, otherwise 0."""
    if rating == 3:  # Neutral
        return 0.5
    # Strictly above 3 is positive; everything else is treated as negative
    return 1 if rating > 3 else 0


# Clean and preprocess Sentiment140 dataset
def clean_sentiment140(data):
    """Return a cleaned copy of the Sentiment140 frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Polarity' and 'Text'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ['Polarity', 'Text'], where 'Text' has been
        passed through ``clean_text``. The input frame is left untouched.
    """
    # .copy() detaches the column selection from the caller's frame, so the
    # assignment below does not raise SettingWithCopyWarning.
    data = data[['Polarity', 'Text']].copy()
    data['Text'] = data['Text'].apply(clean_text)
    return data

# Clean and preprocess Trustpilot dataset
def clean_trustpilot(data):
    """Return a cleaned copy of the Trustpilot reviews frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'review_text' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ['Text', 'Polarity']: 'Text' is the review
        text passed through ``clean_text``; 'Polarity' is 1 when the rating
        is greater than 3 and 0 otherwise. The input frame is left untouched.
    """
    # rename() without inplace returns a fresh frame, so the assignments
    # below operate on our own copy rather than on a slice of the input
    # (which is what triggered SettingWithCopyWarning before).
    data = data[['review_text', 'rating']].rename(
        columns={'review_text': 'Text', 'rating': 'Polarity'}
    )
    data['Polarity'] = data['Polarity'].apply(lambda x: 1 if x > 3 else 0)
    data['Text'] = data['Text'].apply(clean_text)
    return data

# Clean and preprocess Twitter dataset
def clean_twitter(data):
    """Return a cleaned copy of the Twitter frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'text' and 'label'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ['Text', 'Polarity']: 'Text' is the tweet text
        passed through ``clean_text``; 'Polarity' is the original 'label'
        column, unchanged. The input frame is left untouched.
    """
    # rename() without inplace returns a fresh frame, so the assignment below
    # works on our own copy rather than on a slice of the input.
    data = data[['text', 'label']].rename(
        columns={'text': 'Text', 'label': 'Polarity'}
    )
    data['Text'] = data['Text'].apply(clean_text)
    return data

# Clean and preprocess Reviews dataset
def clean_reviews(data):
    """Return a cleaned copy of the Reviews frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Text' and 'Score'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ['Polarity', 'Text']: 'Polarity' is 1 when
        'Score' is greater than 3 and 0 otherwise; 'Text' has been passed
        through ``clean_text``. The input frame is left untouched.
    """
    # Work on an explicit copy so adding/overwriting columns does not raise
    # SettingWithCopyWarning.
    data = data[['Text', 'Score']].copy()
    data['Polarity'] = data['Score'].apply(lambda x: 1 if x > 3 else 0)
    data['Text'] = data['Text'].apply(clean_text)
    return data[['Polarity', 'Text']]

# Clean and preprocess Ratings Beauty dataset
def clean_ratings_beauty(data):
    """Derive a binary polarity label from the Ratings Beauty frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column 'Rating'.

    Returns
    -------
    pandas.DataFrame
        New single-column frame ['Polarity'] sharing the input's index:
        1 when 'Rating' is greater than 3, 0 otherwise. The input frame is
        not modified (previously a 'Polarity' column was added to it as a
        side effect).
    """
    polarity = data['Rating'].apply(lambda x: 1 if x > 3 else 0)
    # Build the result from the Series so the caller's frame gains no column
    return pd.DataFrame({'Polarity': polarity})

# HANDLE MISSING DATA

In [3]:
# Remove rows whose text is missing from every dataset that has a text column.
# The column name differs per dataset ('Text', 'review_text', 'text').
sentiment140_data = sentiment140_data.dropna(subset=['Text'])
trustpilot_reviews_data = trustpilot_reviews_data.dropna(subset=['review_text'])
twitter_data = twitter_data.dropna(subset=['text'])
reviews_data = reviews_data.dropna(subset=['Text'])
# ratings_beauty_data is left as-is: only its 'Rating' column is referenced
# in this notebook, so there is no text column to filter on.

print("Missing values removed from each dataset!")

Missing values removed from each dataset!


# LABEL SENTIMENTS

In [4]:
from textblob import TextBlob

# Function to get sentiment
def get_sentiment(text):
    """Label text 'positive', 'neutral' or 'negative' from the sign of its TextBlob polarity."""
    polarity = TextBlob(text).sentiment.polarity
    if polarity == 0:
        return 'neutral'
    return 'positive' if polarity > 0 else 'negative'

# Function to apply sentiment analysis to a dataset
def apply_sentiment_analysis(df, text_column):
    """Add a 'sentiment' column to ``df`` computed from ``text_column``.

    The text column is overwritten in place with its string form, the
    per-row label from ``get_sentiment`` is stored under 'sentiment', and
    the same (mutated) frame is returned.
    """
    # Cast once and reuse the string Series for both assignments
    as_text = df[text_column].astype(str)
    df[text_column] = as_text
    df['sentiment'] = as_text.apply(get_sentiment)
    return df

In [5]:
# Apply the sentiment analysis to all datasets
# Text-based datasets: score the free-text column of each one.
# Sentiment140 is scored on 'Text'; scoring the numeric 'Polarity' label
# would hand TextBlob strings such as "0" or "4" and turn the label column
# into strings.
sentiment140_data = apply_sentiment_analysis(sentiment140_data, 'Text')
trustpilot_reviews_data = apply_sentiment_analysis(trustpilot_reviews_data, 'review_text')
twitter_data = apply_sentiment_analysis(twitter_data, 'text')
reviews_data = apply_sentiment_analysis(reviews_data, 'Text')

# Ratings Beauty is only referenced through its numeric 'Rating' column, so
# running a text sentiment model on the stringified number is meaningless.
# Label it from the rating instead, reusing map_polarity's thresholds
# (> 3 positive, == 3 neutral, otherwise negative). Missing ratings stay
# missing (na_action='ignore') and 'Rating' keeps its numeric dtype.
ratings_beauty_data['sentiment'] = (
    ratings_beauty_data['Rating']
    .map(map_polarity, na_action='ignore')
    .map({1: 'positive', 0.5: 'neutral', 0: 'negative'})
)

In [6]:
# Status message emitted once the sentiment-labelling cell has run
print("Sentiment analysis completed successfully!")

Sentiment analysis completed successfully!


# SAVE PREPROCESSED DATASETS

In [7]:
# Write each labelled frame to its own CSV (index column omitted), in a fixed order
for _out_path, _frame in (
    ('cleaned_sentiment140_data.csv', sentiment140_data),
    ('cleaned_trustpilot_reviews_data.csv', trustpilot_reviews_data),
    ('cleaned_twitter_data.csv', twitter_data),
    ('cleaned_reviews_data.csv', reviews_data),
    ('cleaned_ratings_beauty_data.csv', ratings_beauty_data),
):
    _frame.to_csv(_out_path, index=False)

print("Data Preprocessing Completed and Saved!")

Data Preprocessing Completed and Saved!
