<H1>RANDOM FOREST CLASSIFIER</H1>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the labelled comments from disk
data = pd.read_csv("toxic comments.csv")

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable
X, y = data['Text'], data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Turn the raw text into TF-IDF vectors (at most 1000 terms, English stop words removed).
# The vocabulary is learned from the training split only and then reused for the test split.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a 100-tree random forest on the vectorised training comments
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train_tfidf, y_train)

# Score the held-out comments and print per-class precision / recall / F1
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

       False       0.64      0.81      0.71        93
        True       0.78      0.61      0.68       107

    accuracy                           0.70       200
   macro avg       0.71      0.71      0.70       200
weighted avg       0.72      0.70      0.70       200



<H1>GRID SEARCH</H1>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the labelled comments from disk
data = pd.read_csv("toxic comments.csv")

# 80/20 train/test split with a fixed seed
X, y = data['Text'], data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams, capped at 10,000 terms, fitted on the training split only.
# NOTE(review): the vectorizer is fitted once before cross-validation, so every CV fold
# below shares IDF statistics with its own held-out rows. Consider a Pipeline if strict
# fold isolation matters.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Candidate hyper-parameters: 3 x 3 x 3 x 3 = 81 combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 5-fold cross-validation, using all available cores
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(base_forest, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Score the held-out comments through the fitted search object and print the report
y_pred = grid_search.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

       False       0.62      0.87      0.72        93
        True       0.83      0.53      0.65       107

    accuracy                           0.69       200
   macro avg       0.72      0.70      0.69       200
weighted avg       0.73      0.69      0.68       200



<H1>GRADIENT BOOSTING CLASSIFIER</H1>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Read the labelled comments from disk
data = pd.read_csv("toxic comments.csv")

# 80/20 train/test split with a fixed seed
X, y = data['Text'], data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams, capped at 10,000 terms; fitted on the training split only
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Baseline gradient boosting: 100 boosting stages at learning rate 0.1
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

# Score the held-out comments and print per-class precision / recall / F1
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

       False       0.61      0.83      0.70        93
        True       0.78      0.53      0.63       107

    accuracy                           0.67       200
   macro avg       0.69      0.68      0.67       200
weighted avg       0.70      0.67      0.66       200



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Read the labelled comments from disk
data = pd.read_csv("toxic comments.csv")

# 80/20 train/test split with a fixed seed
X, y = data['Text'], data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF over unigrams and bigrams, capped at 10,000 terms; fitted on the training split only
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Gradient boosting with hand-set hyper-parameters: more stages (300), a smaller learning
# rate (0.05), deeper trees (max_depth=5) and min_samples_split=5.
# NOTE(review): the author labels these "optimized", but no search producing them is
# shown in this cell - confirm where the values came from.
gb_settings = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=5,
    random_state=42,
)
model = GradientBoostingClassifier(**gb_settings)
model.fit(X_train_tfidf, y_train)

# Score the held-out comments and print per-class precision / recall / F1
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

       False       0.64      0.82      0.72        93
        True       0.79      0.60      0.68       107

    accuracy                           0.70       200
   macro avg       0.71      0.71      0.70       200
weighted avg       0.72      0.70      0.70       200



<H1>CNN</H1>


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load the dataset
data = pd.read_csv("toxic comments.csv")

# Splitting the data into training, validation, and testing sets
X = data['Text']
y = data['IsToxic']

# Encode labels as integers (0 / 1) for the sigmoid output
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Feature extraction: integer token sequences, NOT TF-IDF.
# A TF-IDF row is an unordered bag of words; reshaping it to (n, 10000, 1) and sliding a
# Conv1D kernel across it convolves over neighbouring *vocabulary columns*, which carry no
# sequential meaning. Trained that way the network collapsed to predicting a single class.
# A 1-D CNN needs the words in their original order, so each comment is mapped to a
# fixed-length sequence of token ids and passed through an Embedding layer.
# (Local import so this cell does not depend on edits to the import lines above.)
from tensorflow.keras.layers import TextVectorization

max_features = 10000   # vocabulary size kept by the text vectorizer
sequence_length = 100  # every comment is padded / truncated to this many tokens

text_vectorizer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)
# Learn the vocabulary from the training split only, to keep val/test unseen
text_vectorizer.adapt(X_train.astype(str).to_numpy())


def encode_texts(texts):
    """Map a pandas Series of comments to an array of token ids, one row per comment."""
    return text_vectorizer(texts.astype(str).to_numpy()).numpy()


X_train_cnn = encode_texts(X_train)
X_val_cnn = encode_texts(X_val)
X_test_cnn = encode_texts(X_test)

# Define the CNN model: learned word embeddings -> 1-D convolution over word windows
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model.
# The embedding is trained from scratch on a small dataset, so Adam's usual 1e-3 step is
# used; 1e-4 barely moves the weights within the 20-epoch budget below.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting; the best validation-loss weights are restored
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
history = model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val), epochs=20, batch_size=32, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test_cnn, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Predictions: threshold the sigmoid output and flatten (n, 1) -> (n,) so the
# predicted labels have the same 1-D shape as y_test
y_pred_prob = model.predict(X_test_cnn)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Classification report
print(classification_report(y_test, y_pred))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.47
              precision    recall  f1-score   support

           0       0.47      1.00      0.63        93
           1       0.00      0.00      0.00       107

    accuracy                           0.47       200
   macro avg       0.23      0.50      0.32       200
weighted avg       0.22      0.47      0.30       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import numpy as np

# Calculate class weights: class 0 keeps weight 1.0 and class 1 is scaled by the
# negative/positive ratio of the training labels, so both classes contribute equally
# to the loss overall.
n_negative = int(np.sum(y_train == 0))
n_positive = int(np.sum(y_train == 1))
if n_negative == 0 or n_positive == 0:
    # Without this guard the ratio becomes 0 or inf, which silently breaks the loss
    raise ValueError("y_train must contain both classes to compute class weights")
class_weights = {0: 1.0, 1: n_negative / n_positive}

# Train the model with class weights.
# NOTE: this calls fit() on the same `model` object trained in the previous cell, so
# training continues from its current weights rather than from a fresh initialisation.
history = model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val), epochs=20, batch_size=32, callbacks=[early_stopping], class_weight=class_weights)

# Evaluate the re-trained model on the held-out test split so the effect of the class
# weights is actually visible (relies on X_test_cnn / y_test from the previous cell)
y_pred_prob = model.predict(X_test_cnn)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


# **BERT**


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the dataset
data = pd.read_csv("toxic comments.csv")

# Splitting the data into training and testing sets
X = data['Text']
y = data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_text(text):
    """Encode one comment for BERT.

    The text is wrapped in special tokens and padded / truncated to exactly 128 tokens.
    Returns the tokenizer's encoding with TensorFlow tensors, including `input_ids`
    and `attention_mask`.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,  # BERT supports up to 512 tokens
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )


def stack_encodings(encodings):
    """Concatenate per-comment encodings into one batch dict of tensors."""
    return {
        key: tf.concat([enc[key] for enc in encodings], axis=0)
        for key in ('input_ids', 'attention_mask')
    }


# Tokenize the training and testing data
X_train_tokenized = [tokenize_text(text) for text in X_train]
X_test_tokenized = [tokenize_text(text) for text in X_test]

# Convert lists of per-comment encodings to dictionaries of batched tensors
X_train_input = stack_encodings(X_train_tokenized)
X_test_input = stack_encodings(X_test_tokenized)

# Convert labels to integer class-id tensors (0 / 1).
# The sparse categorical loss below indexes the logits by class id, so the labels are
# cast to int32 instead of being passed through as a boolean tensor.
y_train = tf.convert_to_tensor(y_train.values.astype('int32'))
y_test = tf.convert_to_tensor(y_test.values.astype('int32'))

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Compile the model.
# 1) The model returns raw logits (see `y_pred.logits` below), so the loss must be told
#    `from_logits=True`; the string 'sparse_categorical_crossentropy' assumes
#    probabilities and computes a wrong loss on logits.
# 2) 'adam' at its default learning rate (1e-3) is far too large for fine-tuning a
#    pre-trained transformer and made the classifier collapse to a single class;
#    2e-5 is in the range normally used for BERT fine-tuning.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)


# Train the model.
# NOTE(review): the test split doubles as validation data here. It is only used for
# monitoring (no early stopping / checkpoint selection), but a separate validation split
# would be cleaner if model selection is added later.
history = model.fit(
    X_train_input,
    y_train,
    validation_data=(X_test_input, y_test),
    epochs=3,  # You can adjust the number of epochs
    batch_size=32
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_input, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Predictions: pick the class with the larger logit
y_pred = model.predict(X_test_input)
y_pred_labels = np.argmax(y_pred.logits, axis=1)

# Classification report (labels are now 0 = not toxic, 1 = toxic on both sides)
print(classification_report(y_test.numpy(), y_pred_labels))


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3
Test Accuracy: 0.54
              precision    recall  f1-score   support

       False       0.00      0.00      0.00        93
        True       0.54      1.00      0.70       107

    accuracy                           0.54       200
   macro avg       0.27      0.50      0.35       200
weighted avg       0.29      0.54      0.37       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
