# Label Creation "testing dataset"


In [1]:
import pandas as pd

# Load the dataset
testing_df = pd.read_csv("Data/Sample/sampledDataset_testing.csv")

# Manually create a set of labels (e.g., for the first 50 rows)
# For illustration, let's assume '1' indicates pedophilia-related content and '0' indicates otherwise
Suspect_Pedophile = [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
          0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
          0, 1, 0, 0, 1, 0, 0, 1, 0, 0]

# Add the labels to the dataframe
testing_df['Suspect_Pedophile'] = pd.Series(Suspect_Pedophile, index=testing_df.index[:50])

# Save the labeled dataset
testing_df.to_csv(r"Data/Sample/TestingData_WithLabel.csv", index=False)

# Classification Code

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [3]:
testing_df = pd.read_csv("Data/Sample/TestingData_WithLabel.csv")
testing_df['text'] = testing_df['text'].astype(str)

In [4]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ZeeF\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ZeeF\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ZeeF\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def preprocess(text):
    if pd.isnull(text) or text.strip() == "":
        return ""

    text = text.lower() # Convert text to lowercase
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'www\.\S+', '', text) # Remove URLs
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r'\[.*?\]', '', text) # Remove text within square brackets
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    tokens = word_tokenize(text) # Tokenize the text
    tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
    tokens = [word for word in tokens if not any(char.isdigit() for char in word)] # Remove words containing numbers
    tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatize the remaining words
    
    text = ' '.join(tokens) # Join tokens back into a single string
    return text

In [6]:
cleaned_texts = [preprocess(row['text']) for _, row in testing_df.iterrows()]
testing_df['cleaned'] = cleaned_texts

testing_df.head()

Unnamed: 0,line,author,time,text,Suspect_Pedophile,cleaned
0,1,8bf9c378ec2a475392b4267ae7768f13,17:32,hi,0,hi
1,1,eda70375307c707da829326edc18816f,07:19,Heyy,1,heyy
2,59,24527f8ca0a73da6f9ae19e7b7dd19b4,11:55,okay :),0,okay
3,28,5fe72667d76d0f68d46390d03c940350,02:07,yay me,0,yay
4,4,a11aabeeceeae6b8cb5d12ea06b56554,14:55,What's an iteration?,1,whats iteration


In [7]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(testing_df, test_size=0.2, random_state=42)


In [8]:
# Prepare the data
X_train = train_df['cleaned']  # The text data
y_train = train_df['Suspect_Pedophile']  # The labels
X_test = test_df['cleaned']
y_test = test_df['Suspect_Pedophile']

In [9]:
# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Train the classifier
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

In [11]:
# Make predictions on the test set
y_pred_test = classifier.predict(X_test_tfidf)


In [12]:
# Evaluate the classifier on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, zero_division=0)
test_recall = recall_score(y_test, y_pred_test, zero_division=0)
test_f1 = f1_score(y_test, y_pred_test, zero_division=0)
test_report = classification_report(y_test, y_pred_test, zero_division=0)


In [13]:
# Print the results
print("Test Accuracy:", test_accuracy)
print("Test Precision:", test_precision)
print("Test Recall:", test_recall)
print("Test F1 Score:", test_f1)
print("Test Classification Report:\n", test_report)

Test Accuracy: 0.9
Test Precision: 0.0
Test Recall: 0.0
Test F1 Score: 0.0
Test Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       0.00      0.00      0.00         1

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10

