In [172]:
# # Download NLTK resources
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
import re
import pandas as pd
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk


In [173]:
# Input dataset and the untouched snapshot written next to it
RAW_DATASET_CSV = 'final_dataset.csv'
RAW_SNAPSHOT_CSV = 'before_preprocessing.csv'

# Read the raw comments into the working DataFrame
df = pd.read_csv(RAW_DATASET_CSV)

# Keep a copy of the data exactly as loaded, before any preprocessing touches it
df.to_csv(RAW_SNAPSHOT_CSV, index=False)

# Define a dictionary mapping emoticons to emojis.
# Every emoticon is listed exactly once: a dict literal silently keeps only the
# last value of a repeated key, so duplicated entries add nothing but noise.
emoticon_to_emoji = {
    ':)': '😊', ':D': '😃', ':]': '😃', ':(': '😞', ':/': '😕', ':|': '😐',
    ':-)': '😊', ':-D': '😃', ';)': '😉', ':\'(': '😢', ':-/': '😕', ':-|': '😐',
    ':-P': '😜', ':-O': '😲', ':O': '😲', ':*': '😘', '<3': '❤️', ':-$': '🤑',
    ':-!': '😤', ':-(': '😞', ':-[': '😟', ':-@': '😠', ':-#': '🤐', ':-*': '😗',
    ':^)': '😊', '8-)': '😎', '>:(': '😡', ':-\\': '😕', ':-&': '😤',
    'O:-)': '😇', ':-X': '🤐', '=)': '😊', '=D': '😃',
    'XD': '😆', ':-]': '😃', ':->': '😃', ':-o': '😲',
    ';-)': '😉', '(-:': '😃', '(-_-)': '😑',
    '=]': '😃', ':3': '😺', ':c)': '😺', ':>': '😃', ':}': '😃',
    'B-)': '😎', '8-D': '😃', '>:D': '😡', 'X-D': '😆', 'x-D': '😆',
    'X)': '😆', 'x)': '😆', 'X3': '😺', 'x3': '😺', ':-Q': '😖', '=p': '😛',
    ':-j': '😒', ':-L': '😒', '=\\': '😕',
}


def _emoticon_alternative(emoticon):
    """Return a regex fragment that matches `emoticon` only where it is a real emoticon.

    Guards added around the escaped literal:
    - an emoticon that starts with a letter or digit ('x)', 'XD', '8-)', ...)
      must not directly follow a letter or digit, so "(appendix)" is left alone;
    - an emoticon that ends with a digit (':3', '<3', ...) must not be followed
      by another digit, so times such as "12:30" are left alone;
    - an emoticon that ends with '/' must not be followed by '/', so the "://"
      of a URL is left alone.
    """
    fragment = re.escape(emoticon)
    if emoticon[0].isalnum():
        fragment = r'(?<![A-Za-z0-9])' + fragment
    if emoticon[-1].isdigit():
        fragment += r'(?!\d)'
    elif emoticon[-1] == '/':
        fragment += r'(?!/)'
    return fragment


# Single alternation with the longest emoticons first: at any position the regex
# engine takes the first alternative that matches, so '>:(' wins over ':(' and
# 'O:-)' wins over ':-)'. Rebuild this pattern if emoticon_to_emoji is edited.
_EMOTICON_PATTERN = re.compile('|'.join(
    _emoticon_alternative(emoticon)
    for emoticon in sorted(emoticon_to_emoji, key=len, reverse=True)
))


# Function to replace emoticons with emojis in a given text
def replace_emoticons_with_emojis(text):
    """Replace every emoticon listed in `emoticon_to_emoji` with its emoji.

    The text is scanned once, left to right, preferring the longest emoticon
    at each position.

    Parameters:
        text: the string to convert. Non-string values (e.g. NaN from an empty
            CSV cell) are returned unchanged instead of raising.

    Returns:
        The converted string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return _EMOTICON_PATTERN.sub(lambda match: emoticon_to_emoji[match.group(0)], text)

# Apply the function to the 'body' column
df['body'] = df['body'].apply(replace_emoticons_with_emojis)

# Convert emojis to their ':name:' text codes first ...
df['body'] = df['body'].apply(emoji.demojize)

# ... and only then pad every ':name:' code with spaces. The padding has to run
# after demojize (before it there are no ':name:' codes to find); without it,
# consecutive emojis or an emoji glued to a word fuse into a single token once
# the colons are stripped below (e.g. "confused faceconfused face").
df['body'] = df['body'].apply(lambda x: re.sub(r'(:[^\s:]+:)', r' \1 ', x))

# Remove colons and replace underscores in df['body'], turning ':confused_face:'
# into the plain words 'confused face'
df['body'] = df['body'].apply(lambda x: x.replace(':', '').replace('_', ' '))


# Sentiment slang dictionary for Canadian terms with meanings
sentiment_slang_dict = {
    'omg':'Oh my God',
    'beauty, eh': 'excellent, right?',
    'all smiles': 'very happy',
    'pumped': 'excited',
    'over the moon': 'extremely happy',
    'hot under the collar': 'angry',
    'pissed off': 'very angry',
    'bent out of shape': 'upset',
    'seeing red': 'becoming very angry',
    'rough day, eh?': 'difficult day, right?',
    'not impressed': 'unimpressed',
    'down in the dumps': 'feeling sad',
    'going through a rough patch': 'experiencing a difficult time',
    'fed up': 'frustrated or annoyed',
}

# Lower-cased lookup table plus one case-insensitive pattern covering every
# phrase. Phrases are tried longest first and must not be embedded in a longer
# word. Matching on the whole text (rather than on whitespace-separated tokens)
# is what lets the multi-word phrases, upper-case spellings such as "OMG" and
# words followed by punctuation ("pumped!") be recognised at all.
_slang_meanings = {phrase.lower(): meaning for phrase, meaning in sentiment_slang_dict.items()}
_slang_pattern = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(re.escape(phrase) for phrase in sorted(_slang_meanings, key=len, reverse=True))
    + r')(?!\w)',
    flags=re.IGNORECASE,
)


def expand_sentiment_slang(text):
    """Replace each slang phrase from `sentiment_slang_dict` with its plain meaning.

    Whitespace is first collapsed to single spaces (as the previous split/join
    did), so newlines and tabs become word separators and multi-word phrases
    match regardless of the original spacing.

    Parameters:
        text: the string to process.

    Returns:
        The whitespace-normalised string with slang phrases expanded.
    """
    normalized = ' '.join(text.split())
    return _slang_pattern.sub(
        # Fall back to the matched text if case-insensitive matching ever
        # accepts a spelling whose lower-case form is not a table key.
        lambda match: _slang_meanings.get(match.group(0).lower(), match.group(0)),
        normalized,
    )


# Apply sentiment handling to the DataFrame
df['body'] = df['body'].apply(expand_sentiment_slang)

# Lowercasing
df['body'] = df['body'].apply(lambda x: x.lower())

# Replace hashtags with a space in df['body'].
# This must happen before the character filter below: that filter deletes '#'
# outright, so running it first would leave nothing to replace and adjacent
# hashtags such as "#a#b" would fuse into "ab".
df['body'] = df['body'].apply(lambda x: x.replace('#', ' '))

# Keep only ASCII letters, digits, '@', '_' and spaces; every other character
# is deleted (not replaced by a space)
df['body'] = df['body'].apply(lambda x: re.sub(r'[^A-Za-z0-9@_ ]', '', x))

# Split each cleaned comment into word tokens
df['tokens'] = df['body'].apply(word_tokenize)

# Drop English stop words from the token lists
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens whose lower-cased form is not an English stop word."""
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept


df['tokens'] = df['tokens'].apply(_drop_stop_words)

# Porter stems of the remaining tokens
ps = PorterStemmer()
df['stemmed_tokens'] = df['tokens'].apply(lambda words: list(map(ps.stem, words)))

# WordNet lemmas of the same tokens (computed from the unstemmed tokens)
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = df['tokens'].apply(
    lambda words: list(map(lemmatizer.lemmatize, words))
)

# Write the fully preprocessed DataFrame to preprocessed_output.csv
df.to_csv('preprocessed_output.csv', index=False)


In [174]:
# Sanity check: (rows, columns) of the preprocessed DataFrame
df.shape

(6908, 7)

In [175]:
# Sanity check: original columns plus the token columns added by preprocessing
df.columns

Index(['body', 'score', 'comment_length', 'predicted_sentiment', 'tokens',
       'stemmed_tokens', 'lemmatized_tokens'],
      dtype='object')

In [176]:
import pandas as pd
import emoji

# Demo: emoji.demojize turns a Unicode emoji into its ':name:' text code but
# leaves ASCII emoticons such as ':(' untouched.
# Demo-specific names are used so that the preprocessed `df` built by the main
# cell is not overwritten by this one-row sample.
demo_data = {'body': ["😞 I feel :(, I hate the politics in my country "]}

demo_df = pd.DataFrame(demo_data)

# Apply demojize directly to the 'body' column
demo_df['body'] = demo_df['body'].apply(emoji.demojize)

# Display the DataFrame
print(demo_df)

                                                body
0  :disappointed_face: I feel :(, I hate the poli...


In [177]:
import pandas as pd
import emoji

# Sample DataFrame creation
data = {'body': ["I hate the politics in my country :/:/:/:/."]}

df = pd.DataFrame(data)

# Small emoticon table used only by this demo. It has its own name so that the
# full `emoticon_to_emoji` table used by the main preprocessing cell is not
# replaced by this 6-entry subset.
demo_emoticon_to_emoji = {
    ':)': '😊', ':D': '😃', ':]': '😃', ':(': '😞', ':/': '😕', ':|': '😐'
}


def replace_emoticons_using(text, mapping):
    """Return `text` with every emoticon key of `mapping` replaced by its emoji value.

    Takes the table as a parameter (instead of re-defining
    `replace_emoticons_with_emojis`) so the function used by the main
    preprocessing cell is not shadowed by this demo.

    Parameters:
        text: the string to convert.
        mapping: dict of emoticon -> emoji; entries are applied in dict order.

    Returns:
        The converted string.
    """
    for emoticon, replacement in mapping.items():
        text = text.replace(emoticon, replacement)
    return text


# Apply the function to the 'body' column
df['body'] = df['body'].apply(lambda text: replace_emoticons_using(text, demo_emoticon_to_emoji))

# Display the DataFrame
print(df)

                                      body
0  I hate the politics in my country 😕😕😕😕.
