## Data Loading

In [128]:
import pandas as pd

# Load the four source datasets. Relative paths are written with forward
# slashes so the notebook also runs on Linux/macOS; a backslash is a path
# separator on Windows only, while forward slashes are accepted everywhere.
df1 = pd.read_excel('Dataset_labelled.xlsx', header=0)
df2 = pd.read_csv('tweets_labelled.csv', header=0)
# Read without a header row (header=None); the column is named 'Threat_Text'
# and the file is decoded as ISO-8859-1.
df3 = pd.read_csv('../data/Threat Texts.csv', encoding='ISO-8859-1', header=None, names=['Threat_Text'])
df4 = pd.read_csv('../data/synthetic_social_media_data.csv', header=0)

## Adding the Annotation Column

In [129]:
# Give every row of df3 the label 2 and every row of df4 the label 0.
# NOTE(review): presumably 2 marks threat texts and 0 neutral posts -
# confirm against the label scheme used by the annotated datasets.
df3 = df3.assign(Annotation=2)
df4 = df4.assign(Annotation=0)

## Data Filtering

In [130]:
# Dataset 2:
#   1. drop tweets whose text ends with '...' (treated as incomplete posts);
#      rows with a missing tweet are not matched by the test and are kept
#   2. keep only rows whose 'ext_link' value equals False; a missing value
#      is treated as False, so those rows are kept as well
df2 = (
    df2
    .loc[lambda frame: ~frame['tweet'].str.endswith('...', na=False)]
    .loc[lambda frame: frame['ext_link'].fillna(False).eq(False)]
)

# Dataset 4: drop every row whose sentiment label is "Negative"
df4 = df4.loc[lambda frame: frame['Sentiment Label'].ne("Negative")]

## Column Selection

In [131]:
# Reduce each dataset to its text column plus the label column.
df1 = df1.loc[:, ["Post", "Annotation"]]
df2 = df2.loc[:, ["tweet", "Annotation"]]
df3 = df3.loc[:, ["Threat_Text", "Annotation"]]
df4 = df4.loc[:, ["Post Content", "Annotation"]]


## Rename Columns

In [132]:
# Give the text column the same name ("text") in every dataset so the
# frames line up when they are concatenated.
df1 = df1.rename(columns={"Post": "text"})
df2 = df2.rename(columns={"tweet": "text"})
df3 = df3.rename(columns={"Threat_Text": "text"})
df4 = df4.rename(columns={"Post Content": "text"})

## Merging Datasets

In [133]:
# Stack the four frames row-wise; ignore_index discards the per-frame
# indexes and numbers the combined rows from 0.
df_merged = pd.concat(objs=[df1, df2, df3, df4], axis=0, ignore_index=True)

# Column names, dtypes and non-null counts of the merged frame.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4451 entries, 0 to 4450
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        4449 non-null   object
 1   Annotation  4451 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 69.7+ KB


## Remove Missing Values and Duplicates

In [134]:
# Drop rows with a missing value in any column, then drop rows whose
# "text" repeats an earlier row. The index is not renumbered afterwards.
df_merged = (
    df_merged
    .dropna()
    .drop_duplicates(subset=["text"])
)

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4423 entries, 0 to 4450
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        4423 non-null   object
 1   Annotation  4423 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 103.7+ KB


## Pre-processing

In [135]:
import pandas as pd
import re
import emoji
import nltk
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
from bs4 import BeautifulSoup

# Fetch the NLTK resources this notebook relies on
for _resource in ("punkt", "stopwords", "wordnet", "words"):
    nltk.download(_resource)

# Build the shared tools once; preprocess_text reuses them for every row
stop_words = {*stopwords.words("english")}  # English stopwords as a set
english_vocab = {*words.words()}  # NLTK word list as a set
lemmatizer = WordNetLemmatizer()
spell = Speller(lang="en")  # Autocorrect spelling

def _split_hashtag(match):
    """Turn a ``#Hashtag`` regex match into its space-separated words.

    CamelCase words, runs of capitals and runs of digits each become one
    word, e.g. ``#StopTheViolence2020`` -> ``Stop The Violence 2020`` and
    ``#NASA`` -> ``NASA``. If no ASCII word is found, the tag body is
    returned unchanged.
    """
    body = match.group(1)
    parts = re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+", body)
    return " ".join(parts) if parts else body


def preprocess_text(text):
    """Clean one social-media post and return it as a space-joined token string.

    Steps, in order:
      1. strip HTML markup
      2. remove URLs
      3. split hashtags into words
      4. convert emoji to their names
      5. lowercase and expand contractions
      6. delete every character except letters, digits, whitespace, '#' and '@'
      7. replace @mentions with the placeholder "USER"
      8. tokenize and remove stopwords
      9. autocorrect tokens not found in the NLTK word list
     10. lemmatize

    Steps 1-4 deliberately run before lowercasing and before the character
    filter: hashtag splitting needs the original capitalisation, and the
    filter would otherwise delete emoji before they could be converted.

    Args:
        text: The raw post. Any non-string value (e.g. NaN) yields "".

    Returns:
        The cleaned text as a single string of tokens separated by spaces.
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs while their punctuation is still intact
    text = re.sub(r"http\S+|www\S+", "", text)

    # Split hashtags into words; must happen before lowercasing
    text = re.sub(r"#(\w+)", _split_hashtag, text)

    # Handle emoji: convert each emoji to " :name: ", then drop the colons and
    # turn the underscores of the name into spaces. Must happen before the
    # character filter below, which would otherwise delete the emoji.
    text = emoji.demojize(text, delimiters=(" :", ": "))
    text = re.sub(r":(\S+?):", lambda m: m.group(1).replace("_", " "), text)

    # Text normalization
    text = text.lower()  # Convert to lowercase
    text = contractions.fix(text)  # Expand contractions ("can't" -> "cannot")

    # Remove unwanted characters; '#' and '@' are kept
    text = re.sub(r"[^a-zA-Z0-9\s#@]", "", text)

    # Replace @mentions with "USER" (after lowercasing, so it stays uppercase)
    text = re.sub(r"@\w+", "USER", text)

    # Collapse the whitespace left behind by the removals above
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenization and stopword removal
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]

    # Autocorrect only tokens that are not in the NLTK English word list
    tokens = [token if token in english_vocab else spell(token) for token in tokens]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return " ".join(tokens)  # Convert list back to string

# Clean every post; the result is stored alongside the raw text
df_merged["cleaned_text"] = df_merged["text"].apply(preprocess_text)

# Print the full DataFrame. option_context lifts the row limit only inside
# the `with` block and restores the previous setting afterwards, even if
# printing raises, instead of forcing the option back to the library default.
with pd.option_context('display.max_rows', None):
    print(df_merged)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


                                                   text  Annotation  \
0     Mommy said not to talk to strangers..but she's...           1   
1     1.I was immune to getting hurt/killed/infected...           1   
2     I mean terrorist attacks happen all the time. ...           1   
3     As a god, it would be my responsibility not to...           1   
4     I am going to grab a knife and shove it in the...           1   
5     I will rape your hot lovey sister and then str...           1   
6                       "IWILLNEVERLETYOUFORGETABOUTME"           1   
7     Rule 11 : Be nice to nerds. Chances are you'll...           1   
8     2005.. Age 9.. I was playing sm64ds when I los...           1   
9     Look guys I just smoked an entire $500 worth b...           1   
10    Don't play WoW. It turned my brother into a ba...           1   
11    I support interesting people and I hope to be ...           1   
12    The shortest I can describe trolls: Trolls are...           1   
13    