<a href="https://colab.research.google.com/github/Almamun809/Daily-NLP/blob/main/HS_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
import sklearn.metrics as metrics
import itertools
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.tokenize import word_tokenize

# Make sure the Punkt tokenizer models are available locally
nltk.download('punkt')

# The labelled corpus is expected in an existing pandas DataFrame named 'data':
# column 'text' holds the features, column 'label' the target variable.
X, y = data['text'], data['label']

# Preprocessing: remove URLs, usernames, hashtags, and punctuation
def preprocess_text(text):
    """Strip URLs, @mentions, #hashtags and punctuation from one document.

    Args:
        text: Raw document. Any value that is not a ``str`` (for example the
            float NaN that pandas uses for a missing cell) is treated as an
            empty document.

    Returns:
        The cleaned string, or ``""`` for non-string input. Whitespace left
        behind by removed tokens is not collapsed.
    """
    # Series.apply hands missing rows over as NaN/None; re.sub would raise
    # TypeError on those, so map them to an empty document instead.
    if not isinstance(text, str):
        return ""
    # Remove URLs ("https..." is already covered by the "http\S+" alternative)
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove usernames (e.g., @user)
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags (e.g., #hashtag)
    text = re.sub(r"#\w+", "", text)
    # Remove punctuation (everything except word characters and whitespace)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# Replace missing documents with empty strings *before* cleaning:
# preprocess_text runs re.sub on every value and a float NaN would raise
# TypeError if the order were reversed.
X = X.fillna('').apply(preprocess_text)

# Character-level TF-IDF features over 2- to 4-character n-grams
vectorizer = TfidfVectorizer(ngram_range=(2, 4), analyzer='char')

# Define the Passive Aggressive Classifier
classifier = PassiveAggressiveClassifier(max_iter=500, random_state=42, C=1.0)

# Perform 10-fold cross-validation (shuffled, reproducible)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize lists to store evaluation metric scores
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Iterate over the cross-validation folds
for train_index, test_index in kfold.split(X):
    # Fit the vocabulary and IDF weights on the training fold only, then reuse
    # them for the held-out fold. Fitting the vectorizer on the full corpus
    # before splitting would leak test-fold statistics into training.
    X_train = vectorizer.fit_transform(X.iloc[train_index])
    X_test = vectorizer.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the classifier on the training data
    classifier.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = classifier.predict(X_test)

    # Calculate evaluation metric scores. All three macro metrics use the same
    # zero_division setting so they stay mutually consistent; 0 avoids crediting
    # a class that was never predicted with a perfect score.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=0))
    recall_scores.append(recall_score(y_test, y_pred, average='macro', zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred, average='macro', zero_division=0))

# Calculate average scores across all folds
accuracy_avg = np.mean(accuracy_scores)
precision_avg = np.mean(precision_scores)
recall_avg = np.mean(recall_scores)
f1_avg = np.mean(f1_scores)

# Print the evaluation metrics
print(f'Accuracy: {accuracy_avg}')
print(f'Precision: {precision_avg}')
print(f'Recall: {recall_avg}')
print(f'F1 Score: {f1_avg}')

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
import itertools
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming you have your training data in a pandas DataFrame called 'data'
X = data['text']  # Features
y = data['label']  # Target variable

# Preprocessing: remove URLs, usernames, hashtags, and punctuation
def preprocess_text(text):
    """Strip URLs, @mentions, #hashtags and punctuation from one document.

    Args:
        text: Raw document. Any value that is not a ``str`` (for example the
            float NaN that pandas uses for a missing cell) is treated as an
            empty document.

    Returns:
        The cleaned string, or ``""`` for non-string input. Whitespace left
        behind by removed tokens is not collapsed.
    """
    # Series.apply hands missing rows over as NaN/None; re.sub would raise
    # TypeError on those, so map them to an empty document instead.
    if not isinstance(text, str):
        return ""
    # Remove URLs ("https..." is already covered by the "http\S+" alternative)
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove usernames (e.g., @user)
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags (e.g., #hashtag)
    text = re.sub(r"#\w+", "", text)
    # Remove punctuation (everything except word characters and whitespace)
    text = re.sub(r"[^\w\s]", "", text)
    return text

# Replace missing documents with empty strings *before* cleaning:
# preprocess_text runs re.sub on every value and a float NaN would raise
# TypeError if the order were reversed.
X = X.fillna('').apply(preprocess_text)

# Define the TfidfVectorizer for word-level unigram features
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Define the Multinomial Naive Bayes Classifier
classifier = MultinomialNB()

# Perform 3-fold cross-validation (shuffled, reproducible)
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Initialize lists to store evaluation metric scores
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Confusion matrix accumulated over every held-out fold, so the final plot
# covers all samples rather than only the last fold's predictions.
cm = np.zeros((2, 2), dtype=np.int64)

# Iterate over the cross-validation folds
for train_index, test_index in kfold.split(X):
    # Fit the vocabulary and IDF weights on the training fold only, then reuse
    # them for the held-out fold. Fitting the vectorizer on the full corpus
    # before splitting would leak test-fold statistics into training.
    X_train = vectorizer.fit_transform(X.iloc[train_index])
    X_test = vectorizer.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the classifier on the training data
    classifier.fit(X_train, y_train)

    # Predict the labels for the test data
    y_pred = classifier.predict(X_test)

    # Calculate evaluation metric scores. All three macro metrics use the same
    # zero_division setting so they stay mutually consistent; 0 avoids crediting
    # a class that was never predicted with a perfect score.
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred, average='macro', zero_division=0))
    recall_scores.append(recall_score(y_test, y_pred, average='macro', zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred, average='macro', zero_division=0))

    # Add this fold's counts to the running confusion matrix
    cm += metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])

# Calculate average scores across all folds
accuracy_avg = np.mean(accuracy_scores)
precision_avg = np.mean(precision_scores)
recall_avg = np.mean(recall_scores)
f1_avg = np.mean(f1_scores)

# Print the evaluation metrics
print(f'Accuracy: {accuracy_avg}')
print(f'Precision: {precision_avg}')
print(f'Recall: {recall_avg}')
print(f'F1 Score: {f1_avg}')

# Confusion matrix (summed over all folds)
# NOTE(review): plot_confusion_matrix is neither defined nor imported in the
# visible cells -- confirm a helper with this name is defined elsewhere in the
# notebook before running this line.
plot_confusion_matrix(cm, classes=[0, 1])