## Load Dataset

In [67]:
import pandas as pd

df = pd.read_csv(r'..\3. Data Understanding\merged_data.csv', header=0)

df.info()
df.head(25)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4280 entries, 0 to 4279
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Text    4278 non-null   object 
 1   Target  4280 non-null   float64
 2   Source  4280 non-null   object 
dtypes: float64(1), object(2)
memory usage: 100.4+ KB


Unnamed: 0,Text,Target,Source
0,Mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.I was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,I mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"As a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,I am going to grab a knife and shove it in the...,1.0,SchooshooterTexts
5,I will rape your hot lovey sister and then str...,1.0,SchooshooterTexts
6,"""IWILLNEVERLETYOUFORGETABOUTME""",1.0,SchooshooterTexts
7,Rule 11 : Be nice to nerds. Chances are you'll...,1.0,SchooshooterTexts
8,2005.. Age 9.. I was playing sm64ds when I los...,1.0,SchooshooterTexts
9,Look guys I just smoked an entire $500 worth b...,1.0,SchooshooterTexts


## Remove Missing Values

In [68]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4278 entries, 0 to 4279
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Text    4278 non-null   object 
 1   Target  4278 non-null   float64
 2   Source  4278 non-null   object 
dtypes: float64(1), object(2)
memory usage: 133.7+ KB


## Drop Duplicates

In [41]:
df = df.drop_duplicates(subset=['Text'])  # assign back to df
df['Text'].duplicated().any()

np.False_

## Lowercasing

In [42]:
df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Text,Target,Source
0,mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.i was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,i mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"as a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,i am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


## Removing non english text

In [43]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 42  

def detect_language(text):
    try: 
        return detect(text) if pd.notna(text) else 'unknown' 
    except LangDetectException:
        return 'unknown'  # Handle cases where detection fails

df['language'] = df['Text'].apply(detect_language)
df = df[df['language'] == 'en']
df = df.drop(columns=['language']) 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4003 entries, 0 to 4279
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Text    4003 non-null   object 
 1   Target  4003 non-null   float64
 2   Source  4003 non-null   object 
dtypes: float64(1), object(2)
memory usage: 125.1+ KB


## Removing URLs and HTML

In [44]:
import re

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df['Text'] = df['Text'].apply(remove_urls)


In [45]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df['Text'] = df['Text'].apply(remove_html)


## Remove punctuations and symbols

In [46]:
import re
import string

def remove_non_alnum(text):
    # Convert to string in case of NaN or other types
    text = str(text)
    #replace regular punctuation except aprostophe with a space
    punct_no_apost = string.punctuation.replace("'", "")
    text = re.sub(f"[{re.escape(punct_no_apost)}]", " ", text)
    
    # Keep only letters, numbers, and spaces
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)
    # Collapse multiple spaces and strip
    return re.sub(r"\s+", " ", text).strip()

# Apply to your DataFrame
df["Text"] = df["Text"].apply(remove_non_alnum)
df.head(25)

Unnamed: 0,Text,Target,Source
0,mommy said not to talk to strangers but shes d...,1.0,SchooshooterTexts
1,1 i was immune to getting hurt killed infected...,1.0,SchooshooterTexts
2,i mean terrorist attacks happen all the time s...,1.0,SchooshooterTexts
3,as a god it would be my responsibility not to ...,1.0,SchooshooterTexts
4,i am going to grab a knife and shove it in the...,1.0,SchooshooterTexts
5,i will rape your hot lovey sister and then str...,1.0,SchooshooterTexts
6,iwillneverletyouforgetaboutme,1.0,SchooshooterTexts
7,rule 11 be nice to nerds chances are youll end...,1.0,SchooshooterTexts
8,2005 age 9 i was playing sm64ds when i lost th...,1.0,SchooshooterTexts
9,look guys i just smoked an entire 500 worth ba...,1.0,SchooshooterTexts


## Tokenization

In [47]:
import nltk
from nltk.tokenize import word_tokenize


df['Text'] = df['Text'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts


## Expanding Abbreviations

In [48]:
abbreviation_dict = {
    "u": "you", "ur": "your", "lol": "laughing out loud", "lmao": "laughing my ass off", "rofl": "rolling on the floor laughing", "brb": "be right back", "idk": "i don't know",
    "tbh": "to be honest", "smh": "shaking my head", "wtf": "what the fuck", "omg": "oh my god", "thx": "thank you", "ty": "thank you", "np": "no problem",
    "yw": "you're welcome", "fyi": "for your information", "b4": "before", "nvm": "never mind", "gtg": "got to go", "ttyl": "talk to you later", "gr8": "great",
    "bff": "best friends forever", "cya": "see you", "imo": "in my opinion", "imho": "in my humble opinion", "jk": "just kidding", "afk": "away from keyboard", "irl": "in real life",
    "gg": "good game", "wp": "well played", "btw": "by the way", "dm": "direct message", "rn": "right now", "afaik": "as far as i know", "asap": "as soon as possible",
    "fml": "fuck my life", "ikr": "i know right", "ily": "i love you", "lmk": "let me know", "ppl": "people", "bc": "because", "cuz": "because",
    "tho": "though", "y": "why", "r": "are", "k": "okay", "n": "and", "w/": "with", "w/o": "without",
    "stfu": "shut the fuck up", "hmu": "hit me up", "g2g": "got to go", "wyd": "what are you doing", "wym": "what you mean", "wbu": "what about you", "wb": "welcome back",
    "ofc": "of course", "pls": "please", "plz": "please", "bday": "birthday", "fav": "favorite", "msg": "message", "fb": "facebook",
    "yt": "youtube", "ig": "instagram", "snap": "snapchat", "twt": "twitter", "ftw": "for the win", "icymi": "in case you missed it", "mfw": "my face when",
    "tfw": "that feeling when", "ftl": "for the loss", "roflmao": "rolling on the floor laughing my ass off", "atk": "at the keyboard", "atm": "at the moment", "a3": "anytime anywhere anyplace", "bak": "back at keyboard",
    "bbl": "be back later", "bbs": "be back soon", "bfn": "bye for now", "b4n": "bye for now", "brt": "be right there", "cu": "see you", "cul8r": "see you later",
    "faq": "frequently asked questions", "fc": "fingers crossed", "fwiw": "for what it's worth", "gal": "get a life", "gn": "good night", "gmta": "great minds think alike", "g9": "genius",
    "ic": "i see", "ilu": "i love you", "iow": "in other words", "kiss": "keep it simple stupid", "ldr": "long distance relationship", "ltns": "long time no see", "l8r": "later",
    "mte": "my thoughts exactly", "m8": "mate", "nrn": "no reply necessary", "oic": "oh i see", "pita": "pain in the ass", "prt": "party", "prw": "parents are watching",
    "qpsa?": "que pasa", "roflol": "rolling on the floor laughing out loud", "rotflmao": "rolling on the floor laughing my ass off", "sk8": "skate", "stats": "your sex and age", "asl": "age sex location", "ttfn": "ta ta for now",
    "u2": "you too", "u4e": "yours forever", "wtg": "way to go", "wuf": "where are you from", "w8": "wait", "7k": "sick laughter", "tntl": "trying not to laugh",
    "idu": "i don't understand", "imu": "i miss you", "adih": "another day in hell", "zzz": "sleeping tired", "wywh": "wish you were here", "time": "tears in my eyes", "bae": "before anyone else",
    "fimh": "forever in my heart", "bsaaw": "big smile and a wink", "bwl": "bursting with laughter", "csl": "can't stop laughing", "std": "sexually transmitted disease", "og" : "original"
}

def expand_abbreviations(tokenized_text):
    expanded_text = []
    for word in tokenized_text:
        expansion = abbreviation_dict.get(word, word)
        expanded_text.extend(expansion.split())  # Split expansions into separate tokens
    return expanded_text

df['Text'] = df['Text'].apply(expand_abbreviations)

df.head(25)

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts
5,"[i, will, rape, your, hot, lovey, sister, and,...",1.0,SchooshooterTexts
6,[iwillneverletyouforgetaboutme],1.0,SchooshooterTexts
7,"[rule, 11, be, nice, to, nerds, chances, are, ...",1.0,SchooshooterTexts
8,"[2005, age, 9, i, was, playing, sm64ds, when, ...",1.0,SchooshooterTexts
9,"[look, guys, i, just, smoked, an, entire, 500,...",1.0,SchooshooterTexts


## Expanding contractions

In [49]:
import contractions

def expand_contractions_in_tokens(tokens):
    expanded = []
    for token in tokens:
        expanded_token = contractions.fix(token)
        expanded.extend(expanded_token.split()) 
    return expanded

df['Text'] = df['Text'].apply(expand_contractions_in_tokens)
df.head(25)

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts
5,"[i, will, rape, your, hot, lovey, sister, and,...",1.0,SchooshooterTexts
6,[iwillneverletyouforgetaboutme],1.0,SchooshooterTexts
7,"[rule, 11, be, nice, to, nerds, chances, are, ...",1.0,SchooshooterTexts
8,"[2005, age, 9, i, was, playing, sm64ds, when, ...",1.0,SchooshooterTexts
9,"[look, guys, i, just, smoked, an, entire, 500,...",1.0,SchooshooterTexts


## Handling typos and merged words

In [50]:
import re
import wordninja
from spellchecker import SpellChecker

spell = SpellChecker()
_word_re = re.compile(r"^[a-z]+$") 

def in_vocab(w: str) -> bool:
    return spell[w] > 0  # pyspellchecker freq > 0 == known

def correct_word(w: str) -> str:
    c = spell.correction(w)
    return c if c else w

def clean_token(token, min_split_vocab_frac=0.7):
    # Ignore ints and non-strings
    if not isinstance(token, str):
        return [token]

    # Pass through numbers, punctuation, mixed stuff
    if not _word_re.match(token):
        return [token]

    # Already good?
    if in_vocab(token):
        return [token]

    # Try correction
    corr = correct_word(token)
    if in_vocab(corr):
        return [corr]

    # Try split
    pieces = wordninja.split(token)
    if len(pieces) > 1:
        known_frac = sum(in_vocab(p) for p in pieces) / len(pieces)
        if known_frac >= min_split_vocab_frac:
            return [p if in_vocab(p) else correct_word(p) for p in pieces]

    # Fallback to original
    return [token]

def clean_tokens(tokens):
    out = []
    for t in tokens:
        out.extend(clean_token(t))
    return out

# Apply
df['Text'] = df['Text'].apply(clean_tokens)


In [51]:
df.head(25)

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts
5,"[i, will, rape, your, hot, lovey, sister, and,...",1.0,SchooshooterTexts
6,"[i, will, never, let, you, forget, about, me]",1.0,SchooshooterTexts
7,"[rule, 11, be, nice, to, nerds, chances, are, ...",1.0,SchooshooterTexts
8,"[2005, age, 9, i, was, playing, sm64ds, when, ...",1.0,SchooshooterTexts
9,"[look, guys, i, just, smoked, an, entire, 500,...",1.0,SchooshooterTexts


## Pos Tagging and Lemmatization

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the correct WordNet POS tag
def get_wordnet_pos(word):
    # Converts NLTK POS tags to WordNet POS tags for better lemmatization.
    tag = nltk.pos_tag([word])[0][1][0].upper()  # Extract first letter of POS tag
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN) 

# Apply lemmatization
df['Text'] = df['Text'].apply(
    lambda tokens: [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
)


df.head(25)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts
5,"[i, will, rape, your, hot, lovey, sister, and,...",1.0,SchooshooterTexts
6,"[i, will, never, let, you, forget, about, me]",1.0,SchooshooterTexts
7,"[rule, 11, be, nice, to, nerd, chance, be, you...",1.0,SchooshooterTexts
8,"[2005, age, 9, i, be, play, sm64ds, when, i, l...",1.0,SchooshooterTexts
9,"[look, guy, i, just, smoke, an, entire, 500, w...",1.0,SchooshooterTexts


## Emoji conversion

In [53]:
import emoji

def convert_emojis_in_tokens(tokens):
    new_tokens = []
    for token in tokens:
        # Replace emoji with description, e.g. 🔥 -> _fire_
        replaced = emoji.replace_emoji(
            token, 
            replace=lambda e, data: "_" + data['en'].replace(' ', '_') + "_" if data and 'en' in data else e
        )
        new_tokens.append(replaced)
    return new_tokens

# Apply to your tokenized column
df['Text'] = df['Text'].apply(convert_emojis_in_tokens)
df.head(25)

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts
5,"[i, will, rape, your, hot, lovey, sister, and,...",1.0,SchooshooterTexts
6,"[i, will, never, let, you, forget, about, me]",1.0,SchooshooterTexts
7,"[rule, 11, be, nice, to, nerd, chance, be, you...",1.0,SchooshooterTexts
8,"[2005, age, 9, i, be, play, sm64ds, when, i, l...",1.0,SchooshooterTexts
9,"[look, guy, i, just, smoke, an, entire, 500, w...",1.0,SchooshooterTexts


## Remove Numbers

In [54]:
def safe_remove_numbers(tokens):
    try:
        if not isinstance(tokens, list):
            return []
        return [str(token) for token in tokens if not any(char.isdigit() for char in str(token))]
    except Exception:
        # If any error occurs, return empty list (row will be dropped later)
        return []

# Apply the function
df['Text'] = df['Text'].apply(safe_remove_numbers)

# Drop any row where 'Text' is a empty list
df = df[df['Text'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[i, be, immune, to, get, hurt, kill, infect, w...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Joining Back Tokens into a single String

In [55]:
# Join tokens back into a single string
df['Text'] = df['Text'].apply(' '.join)
df.head()

Unnamed: 0,Text,Target,Source
0,mommy say not to talk to stranger but she be d...,1.0,SchooshooterTexts
1,i be immune to get hurt kill infect with a sex...,1.0,SchooshooterTexts
2,i mean terrorist attack happen all the tear in...,1.0,SchooshooterTexts
3,a a god it would be my responsibility not to d...,1.0,SchooshooterTexts
4,i be go to grab a knife and shove it in the ne...,1.0,SchooshooterTexts


## Data Augmentation

In [56]:
#checking target counts again after pre-processing

df['Target'].value_counts()

Target
1.0    2043
0.0    1556
2.0     404
Name: count, dtype: int64

In [57]:
import nlpaug.augmenter.word.context_word_embs as aug

# Initialize the augmenter
augmenter = aug.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

In [None]:
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import pandas as pdz

def augment_text(df, target, augmenter, repititions=1, samples=200):
    augmented_texts = []
    
    # Select only target class samples
    text_df = df[df['Target'] == target].reset_index(drop=True)
    
    for i in tqdm(np.random.randint(0, len(text_df), samples)):
        for _ in range(repititions):
            augmented = augmenter.augment(text_df['Text'][i])
            augmented_texts.append(augmented[0])

    # Create augmented dataframe
    data = {
        'Target': [target] * len(augmented_texts),
        'Text': augmented_texts,
        'Source': ['augmented'] * len(augmented_texts)
    }
    aug_df = pd.DataFrame(data)

    # Concatenate and shuffle
    df = shuffle(pd.concat([df, aug_df]).reset_index(drop=True))
    return df

#augement normal texts
aug_df_v1 = augment_text(df, 0, augmenter, repititions=1, samples=500)
    

100%|██████████| 500/500 [14:27<00:00,  1.74s/it]


In [59]:
aug_df_v1[aug_df_v1['Source'] == 'augmented']

Unnamed: 0,Text,Target,Source
4277,that i never wish someone be run on a simple p...,0.0,augmented
4104,hiding a snack fresh stash in off my backpack ...,0.0,augmented
4032,mystic water bear,0.0,augmented
4136,re write mr rightsize a as a poly hop anthem t...,0.0,augmented
4142,certainly set budget and still gun too well di...,0.0,augmented
...,...,...,...
4371,start up a new body fitness challenge tomorrow,0.0,augmented
4152,will embark on after a successful culinary ody...,0.0,augmented
4282,in which the heart of the new new york city ti...,0.0,augmented
4138,book song various yourself four investment cha...,0.0,augmented


In [60]:
aug_df_v1['Target'].value_counts()

Target
0.0    2056
1.0    2043
2.0     404
Name: count, dtype: int64

In [61]:

#augement threat texts
aug_df_v2 = augment_text(aug_df_v1, 2, augmenter, repititions=4, samples=400)

100%|██████████| 400/400 [46:18<00:00,  6.95s/it]  


In [62]:
aug_df_v2[(aug_df_v2['Source'] == 'augmented') & (aug_df_v2['Target'] == 2.0)]


Unnamed: 0,Text,Target,Source
4900,a caption continues with this word that told s...,2.0,augmented
4643,i be going go back to remake a texas at co,2.0,augmented
6031,two go to shoot up someone,2.0,augmented
4912,to commit a a fake school shoot,2.0,augmented
5274,i be gonna shoot up eagle crest highschool tom...,2.0,augmented
...,...,...,...
5957,knock hard on my bathroom door one more more t...,2.0,augmented
5577,but do they not come to school today,2.0,augmented
5310,shoot back up the school and then i just have ...,2.0,augmented
5181,warn i be no come here to shoot missions up we...,2.0,augmented


In [63]:
aug_df_v2['Target'].value_counts()

Target
0.0    2056
1.0    2043
2.0    2004
Name: count, dtype: int64

## drop uneeded columns

In [64]:
final_df = aug_df_v2.drop(columns=['Source'], axis=1)
final_df.head()

Unnamed: 0,Text,Target
2199,remember no one be bullet,2.0
1740,fuck you buzz feed you fuck hypocrite say shit...,2.0
2582,say itself new story chair available brother h...,0.0
4900,a caption continues with this word that told s...,2.0
2075,if i be a homicidal criminal id love the death...,1.0


## Seperate final cleaned data into test and train csv

In [65]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    final_df, 
    test_size=0.2,       # 20% test, 80% train
    random_state=42, 
    stratify=final_df['Target']  # maintain class distribution
)

In [66]:
train_df.to_csv('../5. Modelling/train_data.csv', index=False, quoting=1)
test_df.to_csv('../5. Modelling/test_data.csv', index=False, quoting=1)