## Load Dataset

In [1]:
import pandas as pd

df = pd.read_csv(r'..\3. Data Understanding\merged_data.csv', header=0)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4280 entries, 0 to 4279
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Text    4278 non-null   object 
 1   Target  4280 non-null   float64
 2   Source  4280 non-null   object 
dtypes: float64(1), object(2)
memory usage: 100.4+ KB


Unnamed: 0,Text,Target,Source
0,Mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.I was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,I mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"As a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,I am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


## Remove Missing Values

In [2]:
df = df.dropna()
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4278 entries, 0 to 4279
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Text    4278 non-null   object 
 1   Target  4278 non-null   float64
 2   Source  4278 non-null   object 
dtypes: float64(1), object(2)
memory usage: 133.7+ KB


## Drop Duplicates

In [3]:
df = df.drop_duplicates(subset=['Text'])  # assign back to df
print(df.head())

                                                Text  Target  \
0  Mommy said not to talk to strangers..but she's...     1.0   
1  1.I was immune to getting hurt/killed/infected...     1.0   
2  I mean terrorist attacks happen all the time. ...     1.0   
3  As a god, it would be my responsibility not to...     1.0   
4  I am going to grab a knife and shove it in the...     1.0   

              Source  
0  SchooshooterTexts  
1  SchooshooterTexts  
2  SchooshooterTexts  
3  SchooshooterTexts  
4  SchooshooterTexts  


## Lowercasing

In [4]:
df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Text,Target,Source
0,mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.i was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,i mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"as a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,i am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


## Removing non english text

In [5]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 42  

def detect_language(text):
    try: 
        return detect(text) if pd.notna(text) else 'unknown' 
    except LangDetectException:
        return 'unknown'  # Handle cases where detection fails

df['language'] = df['Text'].apply(detect_language)
df = df[df['language'] == 'en']
df = df.drop(columns=['language']) 
df.head()

Unnamed: 0,Text,Target,Source
0,mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.i was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,i mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"as a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,i am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


## Removing URLs and HTML

In [6]:
import re

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

df['Text'] = df['Text'].apply(remove_urls)


In [7]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

df['Text'] = df['Text'].apply(remove_html)


## Remove Punctuation 


In [8]:
import string
import re

def custom_punct_replace(text):
    # Remove apostrophes
    text = text.replace("'", "")
    # Create a punctuation set excluding apostrophe
    punct = string.punctuation.replace("'", "")
    # Replace all other punctuation with a space
    return re.sub(f"[{re.escape(punct)}]", " ", text)

df["Text"] = df["Text"].apply(custom_punct_replace)
df.head()


Unnamed: 0,Text,Target,Source
0,mommy said not to talk to strangers but shes ...,1.0,SchooshooterTexts
1,1 i was immune to getting hurt killed infected...,1.0,SchooshooterTexts
2,i mean terrorist attacks happen all the time ...,1.0,SchooshooterTexts
3,as a god it would be my responsibility not to...,1.0,SchooshooterTexts
4,i am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


## Tokenization

In [9]:
import nltk
from nltk.tokenize import word_tokenize


df['Text'] = df['Text'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts


## Expanding Abbreviations

In [None]:
abbreviation_dict = {
    "u": "you", "ur": "your", "lol": "laughing out loud", "lmao": "laughing my ass off", "rofl": "rolling on the floor laughing", "brb": "be right back", "idk": "i don't know",
    "tbh": "to be honest", "smh": "shaking my head", "wtf": "what the fuck", "omg": "oh my god", "thx": "thank you", "ty": "thank you", "np": "no problem",
    "yw": "you're welcome", "fyi": "for your information", "b4": "before", "nvm": "never mind", "gtg": "got to go", "ttyl": "talk to you later", "gr8": "great",
    "bff": "best friends forever", "cya": "see you", "imo": "in my opinion", "imho": "in my humble opinion", "jk": "just kidding", "afk": "away from keyboard", "irl": "in real life",
    "gg": "good game", "wp": "well played", "btw": "by the way", "dm": "direct message", "rn": "right now", "afaik": "as far as i know", "asap": "as soon as possible",
    "fml": "fuck my life", "ikr": "i know right", "ily": "i love you", "lmk": "let me know", "ppl": "people", "bc": "because", "cuz": "because",
    "tho": "though", "y": "why", "r": "are", "k": "okay", "n": "and", "w/": "with", "w/o": "without",
    "stfu": "shut the fuck up", "hmu": "hit me up", "g2g": "got to go", "wyd": "what are you doing", "wym": "what you mean", "wbu": "what about you", "wb": "welcome back",
    "ofc": "of course", "pls": "please", "plz": "please", "bday": "birthday", "fav": "favorite", "msg": "message", "fb": "facebook",
    "yt": "youtube", "ig": "instagram", "snap": "snapchat", "twt": "twitter", "ftw": "for the win", "icymi": "in case you missed it", "mfw": "my face when",
    "tfw": "that feeling when", "ftl": "for the loss", "roflmao": "rolling on the floor laughing my ass off", "atk": "at the keyboard", "atm": "at the moment", "a3": "anytime anywhere anyplace", "bak": "back at keyboard",
    "bbl": "be back later", "bbs": "be back soon", "bfn": "bye for now", "b4n": "bye for now", "brt": "be right there", "cu": "see you", "cul8r": "see you later",
    "faq": "frequently asked questions", "fc": "fingers crossed", "fwiw": "for what it's worth", "gal": "get a life", "gn": "good night", "gmta": "great minds think alike", "g9": "genius",
    "ic": "i see", "ilu": "i love you", "iow": "in other words", "kiss": "keep it simple stupid", "ldr": "long distance relationship", "ltns": "long time no see", "l8r": "later",
    "mte": "my thoughts exactly", "m8": "mate", "nrn": "no reply necessary", "oic": "oh i see", "pita": "pain in the ass", "prt": "party", "prw": "parents are watching",
    "qpsa?": "que pasa", "roflol": "rolling on the floor laughing out loud", "rotflmao": "rolling on the floor laughing my ass off", "sk8": "skate", "stats": "your sex and age", "asl": "age sex location", "ttfn": "ta ta for now",
    "u2": "you too", "u4e": "yours forever", "wtg": "way to go", "wuf": "where are you from", "w8": "wait", "7k": "sick laughter", "tntl": "trying not to laugh",
    "idu": "i don't understand", "imu": "i miss you", "adih": "another day in hell", "zzz": "sleeping tired", "wywh": "wish you were here", "time": "tears in my eyes", "bae": "before anyone else",
    "fimh": "forever in my heart", "bsaaw": "big smile and a wink", "bwl": "bursting with laughter", "csl": "can't stop laughing", "std": "sexually transmitted disease"
}

def expand_abbreviations(tokenized_text):
    return [abbreviation_dict.get(word, word) for word in tokenized_text]

df['Text'] = df['Text'].apply(expand_abbreviations)

df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts


## Expanding contractions

In [11]:
import contractions

def expand_contractions_in_tokens(tokens):
    expanded = []
    for token in tokens:
        expanded_token = contractions.fix(token)
        expanded.append(expanded_token)
    return expanded

df['Text'] = df['Text'].apply(expand_contractions_in_tokens)
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, said, not, to, talk, to, strangers, bu...",1.0,SchooshooterTexts
1,"[1, i, was, immune, to, getting, hurt, killed,...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attacks, happen, all, the...",1.0,SchooshooterTexts
3,"[as, a, god, it, would, be, my, responsibility...",1.0,SchooshooterTexts
4,"[i, am, going, to, grab, a, knife, and, shove,...",1.0,SchooshooterTexts


## Pos Tagging and Lemmatization

In [12]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get the correct WordNet POS tag
def get_wordnet_pos(word):
    # Converts NLTK POS tags to WordNet POS tags for better lemmatization.
    tag = nltk.pos_tag([word])[0][1][0].upper()  # Extract first letter of POS tag
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to NOUN

# Apply lemmatization
df['Text'] = df['Text'].apply(
    lambda tokens: [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
)


df.head()


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Emoji conversion

In [13]:
import emoji

def convert_emojis_in_tokens(tokens):
    new_tokens = []
    for token in tokens:
        # Replace emoji with description, e.g. 🔥 -> _fire_
        replaced = emoji.replace_emoji(
            token, 
            replace=lambda e, data: "_" + data['en'].replace(' ', '_') + "_" if data and 'en' in data else e
        )
        new_tokens.append(replaced)
    return new_tokens

# Apply to your tokenized column
df['Text'] = df['Text'].apply(convert_emojis_in_tokens)
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Word Splitting

In [14]:
import wordninja

def fix_merged_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # If token is a single word, split it if needed
        split = wordninja.split(token)
        fixed_tokens.extend(split if len(split) > 1 else [token])
    return fixed_tokens

df['Text'] = df['Text'].apply(fix_merged_tokens)


In [15]:
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Fixing Mispelling

In [16]:
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spellings_in_tokens(tokens):
    corrected_tokens = []
    misspelled = spell.unknown(tokens)
    for word in tokens:
        if word in misspelled:
            corrected_tokens.append(spell.correction(word))
        else:
            corrected_tokens.append(word)
    return corrected_tokens

df['Text'] = df['Text'].apply(correct_spellings_in_tokens)
df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[1, i, be, immune, to, get, hurt, kill, infect...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Remove Numbers

In [17]:
def safe_remove_numbers(tokens):
    try:
        if not isinstance(tokens, list):
            return []
        return [str(token) for token in tokens if not any(char.isdigit() for char in str(token))]
    except Exception:
        # If any error occurs, return empty list (row will be dropped later)
        return []

# Apply the function
df['Text'] = df['Text'].apply(safe_remove_numbers)

# Drop any row where 'Text' is a empty list
df = df[df['Text'].apply(lambda x: isinstance(x, list) and len(x) > 0)]

df.head()

Unnamed: 0,Text,Target,Source
0,"[mommy, say, not, to, talk, to, stranger, but,...",1.0,SchooshooterTexts
1,"[i, be, immune, to, get, hurt, kill, infect, w...",1.0,SchooshooterTexts
2,"[i, mean, terrorist, attack, happen, all, the,...",1.0,SchooshooterTexts
3,"[a, a, god, it, would, be, my, responsibility,...",1.0,SchooshooterTexts
4,"[i, be, go, to, grab, a, knife, and, shove, it...",1.0,SchooshooterTexts


## Joining Back Tokens into a single String

In [18]:
# Join tokens back into a single string
df['Text'] = df['Text'].apply(' '.join)
df.head()

Unnamed: 0,Text,Target,Source
0,mommy say not to talk to stranger but she is d...,1.0,SchooshooterTexts
1,i be immune to get hurt kill infect with a sad...,1.0,SchooshooterTexts
2,i mean terrorist attack happen all the tears i...,1.0,SchooshooterTexts
3,a a god it would be my responsibility not to d...,1.0,SchooshooterTexts
4,i be go to grab a knife and shove it in the ne...,1.0,SchooshooterTexts
