# Preprocessing

In [1]:
import pandas as pd

df = pd.read_csv(r'..\4. Data Understanding\merged_data.csv', header=0)

df.head()

Unnamed: 0,Text,Target,Source
0,Mommy said not to talk to strangers..but she's...,1.0,SchooshooterTexts
1,1.I was immune to getting hurt/killed/infected...,1.0,SchooshooterTexts
2,I mean terrorist attacks happen all the time. ...,1.0,SchooshooterTexts
3,"As a god, it would be my responsibility not to...",1.0,SchooshooterTexts
4,I am going to grab a knife and shove it in the...,1.0,SchooshooterTexts


In [5]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")


# NLP preprocessing function
def preprocess(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc 
        if token.is_alpha and not token.is_stop
    ]
    return " ".join(tokens)

# Apply preprocessing
df['clean_text'] = df['Text'].apply(preprocess)

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])
y = df['Target']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7746062992125984

Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.36      0.52       239
         1.0       0.74      0.98      0.84       638
         2.0       0.95      0.55      0.70       139

    accuracy                           0.77      1016
   macro avg       0.86      0.63      0.69      1016
weighted avg       0.81      0.77      0.75      1016



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import get_scheduler
from datasets import Dataset
from tqdm import tqdm

# 1. Encode the labels
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["Target"])

# 2. Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Text"], df["label_encoded"], test_size=0.2, stratify=df["label_encoded"], random_state=42
)

# 3. Tokenization
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

# 4. Create custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        } | {'labels': torch.tensor(self.labels[idx])}

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, list(train_labels))
test_dataset = CustomDataset(test_encodings, list(test_labels))

# 5. Model setup
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 6. Training setup
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3  # for 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# 7. Train loop
model.train()
for epoch in range(3):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

# 8. Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# 9. Print results
print("Classification Report:\n")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 0/170 [00:00<?, ?it/s]

: 

In [2]:
import pandas as pd
import re
import emoji
import nltk
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
from bs4 import BeautifulSoup

# Download necessary NLTK data
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("words")

# Initialize tools
spell = Speller(lang="en")  # Autocorrect spelling
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
english_vocab = set(words.words())

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    # Text Normalization
    text = text.lower()  # Convert to lowercase
    text = contractions.fix(text)  # Expand contractions ("can't" → "cannot")
    text = re.sub(r"\s+", " ", text).strip()  # Remove excessive whitespace

    # Remove unwanted cahracters
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s#@]", "", text)  # Keep alphanumeric, hashtags, mentions

    # 3️Handle URLS, hashtags, mentions
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "USER", text)  # Replace @mentions with "USER"
    text = re.sub(r"#(\w+)", lambda m: " ".join(re.findall(r"[A-Z]?[a-z]+|\d+", m.group(1))), text)  # Split hashtags

    # Handle emoji
    text = emoji.demojize(text, delimiters=(" ", " "))  
    text = re.sub(r":\S+:", lambda m: m.group(0).replace("_", " "), text)  

    # Tokenization and remove stop word
    words_list = word_tokenize(text)  # Tokenize text
    words_list = [word for word in words_list if word not in stop_words]  # Remove stopwords

    #autocorrect
    words_list = [spell(word) if word not in english_vocab else word for word in words_list]

    # Lemmatization
    words_list = [lemmatizer.lemmatize(word) for word in words_list]

    return " ".join(words_list)  # Convert list back to string

df["Text"] = df["Text"].apply(preprocess_text)

pd.set_option('display.max_rows', None)  # Show all rows
print(df)  # Display the full DataFrame
pd.reset_option('display.max_rows')  # Reset to default after viewing

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


                                                   Text  Target  \
0     mommy said talk strangersbut drunk time listen...     1.0   
1     1i immune getting hurtkilledinfected std 2i 43...     1.0   
2     mean terrorist attack happen time sure thousan...     1.0   
3     god would responsibility destroy universal bal...     1.0   
4     going grab knife shove neck mother drag dead b...     1.0   
5     rape hot love sister strangle hair sneaking pa...     1.0   
6                         iwillneverletyouforgetaboutme     1.0   
7     rule 11 nice nerd chance end working one start...     1.0   
8     2005 age 9 playing sm64ds lost level went age ...     1.0   
9     look guy smoked entire 500 worth bag og push h...     1.0   
10    play wow turned brother basement dweller weed ...     1.0   
11    support interesting people hope internet famou...     1.0   
12    shortest describe troll troll basically young ...     1.0   
13    would kill tide method may offend many would s...     1.