<a href="https://colab.research.google.com/github/Almamun809/Daily-NLP/blob/main/hate_speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, roc_curve
import pandas as pd

# Colab-only step: attach Google Drive so the CSV under "My Drive" is readable.
from google.colab import drive
drive.mount('/content/drive')

# Read the full dataset into a DataFrame.
data = pd.read_csv('/content/drive/My Drive/path_to_your_csv_file.csv')

# Extract the documents ('text') and their targets ('label') as plain Python
# lists; both columns are expected to be present in the CSV.
texts, labels = (data[column].tolist() for column in ('text', 'label'))

# Three-way hold-out split: 60% train / 20% evaluation / 20% test.
# Step 1 carves off the test set (20% of everything); step 2 takes 25% of the
# remaining 80% as the evaluation set (0.25 * 0.8 = 0.2 of the full data).
holdout_seed = 42
X_rest, X_test, y_rest, y_test = train_test_split(
    texts, labels, random_state=holdout_seed, test_size=0.2
)
X_train, X_eval, y_train, y_eval = train_test_split(
    X_rest, y_rest, random_state=holdout_seed, test_size=0.25
)

# TF-IDF features: English stop words are removed and any term appearing in
# more than 70% of the documents is ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# The vocabulary and IDF weights are learned from the training split only...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# ...and then re-used, unchanged, to project the evaluation and test splits.
tfidf_eval, tfidf_test = (
    tfidf_vectorizer.transform(split) for split in (X_eval, X_test)
)

# Passive Aggressive linear classifier, capped at 50 passes over the training
# data. random_state is pinned to the same seed used for the data splits: the
# classifier shuffles the training samples on every epoch by default, so
# without a seed the metrics reported below change from run to run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)

# Train on the TF-IDF features of the training split
pac.fit(tfidf_train, y_train)

# Hard label predictions for the evaluation split (used for the metrics) ...
y_pred_eval = pac.predict(tfidf_eval)
# ... and for the held-out test split (used for the test confusion matrix)
y_pred_test = pac.predict(tfidf_test)

# Score the evaluation split. precision/recall/F1 are called with
# scikit-learn's default binary averaging, so the labels are expected to be
# binary.
accuracy = accuracy_score(y_eval, y_pred_eval)
precision = precision_score(y_eval, y_pred_eval)
recall = recall_score(y_eval, y_pred_eval)
f1 = f1_score(y_eval, y_pred_eval)
# AUC is a ranking metric and must be computed from continuous scores.
# Feeding it hard 0/1 predictions collapses the ROC curve to a single
# operating point and reports a different (usually lower) number than the
# area under the ROC curve, so the signed distances from decision_function
# are used instead.
auc = roc_auc_score(y_eval, pac.decision_function(tfidf_eval))

# Print evaluation metrics
print("Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("AUC:", auc)

# Confusion matrices (rows: true labels, columns: predicted labels) for the
# evaluation and test splits, computed first and then printed in that order.
conf_matrix_eval = confusion_matrix(y_true=y_eval, y_pred=y_pred_eval)
conf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

for heading, matrix in (
    ("Confusion Matrix (Evaluation Set):", conf_matrix_eval),
    ("Confusion Matrix (Testing Set):", conf_matrix_test),
):
    print(heading)
    print(matrix)

# Loss reporting and plot.
# PassiveAggressiveClassifier exposes neither `loss_curve_` nor
# `validation_scores_` (those are MLPClassifier attributes), and `score()`
# returns mean accuracy rather than a loss. The hinge loss (the classifier's
# default training objective) is therefore computed explicitly from the
# decision scores.
from sklearn.metrics import hinge_loss

evaluation_loss = hinge_loss(
    y_eval, pac.decision_function(tfidf_eval), labels=pac.classes_
)
testing_loss = hinge_loss(
    y_test, pac.decision_function(tfidf_test), labels=pac.classes_
)
print("Testing Loss:", testing_loss)

# Trace the training loss per epoch by replaying training on a fresh,
# unfitted classifier with the same hyper-parameters as `pac`; each
# partial_fit call makes one pass over the training data. This is a replay
# for visualisation only: `pac` itself is left untouched, and the horizontal
# evaluation/test lines below are measured on `pac`.
curve_model = PassiveAggressiveClassifier(**pac.get_params())
training_loss_curve = []
for _epoch in range(pac.max_iter):
    curve_model.partial_fit(tfidf_train, y_train, classes=pac.classes_)
    training_loss_curve.append(
        hinge_loss(
            y_train,
            curve_model.decision_function(tfidf_train),
            labels=pac.classes_,
        )
    )
epochs = np.arange(1, len(training_loss_curve) + 1)

# Plot training, evaluation, and testing loss
plt.figure(figsize=(10, 5))
# One line with markers replaces the three overlapping (and partly
# x-shifted) draws of the same curve.
plt.plot(epochs, training_loss_curve, marker='o', markersize=5, color='blue', label='Training Loss')
plt.axhline(y=evaluation_loss, color='red', linestyle='--', label='Evaluation Loss')
plt.axhline(y=testing_loss, color='green', linestyle='-.', label='Testing Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.title('Training, Evaluation, and Testing Loss')
plt.legend()
plt.grid(True)
plt.show()

# ROC curve on the evaluation split, built from the classifier's continuous
# decision scores (signed distance to the separating hyperplane).
eval_scores = pac.decision_function(tfidf_eval)
fpr, tpr, thresholds = roc_curve(y_eval, eval_scores)
# The area shown in the legend is computed from the same scores as the curve,
# so the number always matches what is drawn (the previously used global
# `auc` was derived from hard label predictions).
roc_area = roc_auc_score(y_eval, eval_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_area)
# Diagonal = performance of a random classifier, for reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()