<a href="https://colab.research.google.com/github/Beshoy-R/Beshoy-R/blob/main/PROJECT_LanguageEngineering(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on. 'punkt_tab' is the
# tokenizer data that nltk.word_tokenize loads on NLTK >= 3.9; older releases
# still read 'punkt', so both are requested to stay runnable on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Read the raw comments; the relative path is resolved against the notebook's
# current working directory.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# List the column labels of the freshly loaded frame so the ones to drop in the
# next step can be identified.
columns = dataset.columns
print(columns)

**-> Dropping unwanted columns:**

In [None]:
# Discard the id and the secondary label columns; the modelling cells below
# read only 'comment_text' and 'toxic'. Every label is listed exactly once, and
# errors='ignore' makes the cell safe to re-run after the columns are already
# gone (a plain drop would raise KeyError on the second execution).
UNUSED_COLUMNS = ['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
dataset = dataset.drop(columns=UNUSED_COLUMNS, errors='ignore')


In [None]:
# Print the frame after the drop to confirm which columns remain.
print(dataset)

**-> Cleaning the data (Tokenize, Remove Stopwords, Lemmatize):**


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the tokenizer, stopword list and WordNet data are available before
# preprocessing. 'punkt_tab' is what nltk.word_tokenize loads on NLTK >= 3.9;
# older releases still read 'punkt', so both are requested.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Words to take out of the English stopword list so that stopword filtering
# leaves them in the text: mostly negations, auxiliaries and intensifiers.
KEPT_WORDS = {
    'against', 'ain', 'aren', "aren't", 'couldn', "couldn't", "didn't",
    'didn', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'did',
    'can', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have',
    'haven', "haven't", 'is', 'isn', "isn't", 'mightn', "mightn't", 'more',
    'most', 'mustn', "mustn't", 'needn', "needn't", 'no', 'nor', 'not',
    'off', 'on', 'should', "should've", 'shouldn', "shouldn't", 'so', 't',
    'too', 'very', 'was', 'wasn', "wasn't", 'were', 'weren', "weren't",
    "won't", 'wouldn', "wouldn't",
}

# The effective stopword set: NLTK's English list minus the words kept above.
stop_words = set(stopwords.words('english')) - KEPT_WORDS

def preprocess_text(text):
    """Normalise one raw comment into a space-separated string of lemmas.

    Steps, in order: strip URLs and @mentions, lowercase, delete every
    character that is not an ASCII letter or whitespace, tokenize with NLTK,
    remove the words in ``stop_words``, and lemmatize each remaining token
    with ``WordNetLemmatizer`` (called without a part-of-speech argument).

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty comment; any other
            non-string value is converted with ``str`` first.

    Returns:
        The cleaned comment as a single string, or ``''`` when nothing is
        left after cleaning.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs and mentions. The dot after "www" is escaped so that only a
    # literal "www." prefix matches, not every word that begins with "www".
    text = re.sub(r'http\S+|www\.\S+|@\S+', '', text)
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Nothing but whitespace left: skip tokenization, the result is empty anyway.
    if not text.strip():
        return ''

    # Tokenize, then drop stopwords
    tokens = [word for word in nltk.word_tokenize(text) if word not in stop_words]

    # Lemmatize the surviving words and join them back into a single string
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a part-of-speech tag string to the matching ``wordnet`` constant.

    The decision is made on the tag's leading letter: ``J`` -> ``wordnet.ADJ``,
    ``V`` -> ``wordnet.VERB``, ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Args:
        nltk_tag: The tag as a string (presumably as produced by an NLTK
            tagger -- confirm against the caller).

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag starts
        with none of the four letters.
    """
    # (leading letter, name of the constant on `wordnet`), checked in this order.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None
        
# Run every comment through preprocess_text and write the cleaned text back
# into the same column, replacing the raw text.
cleaned_comments = dataset['comment_text'].apply(preprocess_text)
dataset['comment_text'] = cleaned_comments


**###### Done Preparing the data (Cleaning and Preprocessing) ######**

**## ALGORITHM (1) Logistic Regression: ##**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Features are the cleaned comment strings; the target is the `toxic` column.
X, y = dataset['comment_text'], dataset['toxic']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary and weights on the training text only, then
# encode the held-out text with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Build and fit logistic regression with scikit-learn's default settings
# (fit returns the estimator itself).
LR_Model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out comments and measure the share classified correctly.
y_pred = LR_Model.predict(X_test)
LR_accuracy = accuracy_score(y_test, y_pred)

print(f"Logistic_Regression_Accuracy: {LR_accuracy*100}%")

**## ALGORITHM (2) Support Vector Machine: ##**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Inputs: cleaned comment text; labels: the `toxic` column.
X = dataset['comment_text']
y = dataset['toxic']

# 80/20 train/test partition with a fixed seed.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# TF-IDF encoding: fitted on the training text, applied unchanged to the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Support vector classifier with default hyperparameters.
# NOTE(review): kernel SVC training time grows quickly with the number of
# samples -- expect this cell to be the slowest of the four on a large dataset.
SVC_model = SVC()
SVC_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = SVC_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Support_Vector_Machine_Accuracy: {accuracy*100}%")

**## ALGORITHM (3) Naïve Bayes: ##**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Cleaned comment text is the input, the `toxic` column is the label.
X, y = dataset['comment_text'], dataset['toxic']

# Reserve one fifth of the rows for testing (seeded, so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit TF-IDF on the training comments and encode both splits with it.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Multinomial Naive Bayes with default settings (fit returns the estimator).
NB_model = MultinomialNB().fit(X_train, y_train)

# Predict the test split and compute plain accuracy.
y_pred = NB_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Naïve_Bayes_Accuracy: {accuracy*100}%")

**## ALGORITHM (4) Artificial Neural Networks: ##**

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable between runs (some GPU
# kernels may still introduce small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Split the dataset into features (X) and target variable (y)
X = dataset['comment_text']
y = dataset['toxic']

# Split the data into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Convert the text data to TF-IDF features. `.toarray()` materialises the
# sparse matrix as a dense (rows x vocabulary) array for Keras, which can be
# very large for a big corpus.
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

# One hidden ReLU layer followed by a single sigmoid unit that outputs the
# predicted probability of the positive (`toxic` == 1) class.
ANN_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
ANN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training set
ANN_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Predict probabilities, threshold at 0.5, and flatten the (n, 1) column that
# Keras returns into a 1-D label vector shaped like y_test.
y_pred_prob = ANN_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Compute the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f"Artificial_Neural_Networks_Accuracy: {accuracy*100}%")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for whichever model most recently assigned y_test / y_pred.
cm = confusion_matrix(y_test, y_pred)

# Tick labels for the two classes, in the order 0 then 1.
labels = ['Non-Toxic', 'Toxic']

# Draw the annotated heatmap and label it through the Axes that seaborn returns.
ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

plt.show()


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Cleaned text of every comment whose `toxic` label equals 1.
toxic_comments = dataset.loc[dataset['toxic'] == 1, 'comment_text']

# Concatenate those comments into one string and split it back into words.
toxic_text = ' '.join(toxic_comments)
tokens = nltk.word_tokenize(toxic_text)

# Tally the words and keep the 20 most frequent as parallel tuples.
word_frequency = Counter(tokens)
top_words = word_frequency.most_common(20)
words, frequencies = zip(*top_words)

# Bar chart of the top words.
plt.bar(words, frequencies)
plt.gca().set(
    xlabel='Words',
    ylabel='Frequency',
    title='Toxic Comment Word Frequency',
)

# Tilt the word labels so they do not overlap.
plt.xticks(rotation=45)

plt.show()


**TEST OUR COMMENT**

In [None]:
def _label_for(prediction):
    """Turn a thresholded single-comment prediction (1 or 0) into display text."""
    return 'toxic' if prediction == 1 else 'non-toxic'


# Push one hand-written comment through the same cleaning and TF-IDF encoding
# that the models were trained with.
our_comment = "Have a happy day"
preprocessed_comment = preprocess_text(our_comment)
vectorized_comment = vectorizer.transform([preprocessed_comment]).toarray()

# Every model's output is thresholded at 0.5 in the same way. Note that the
# scikit-learn estimators' .predict already returns class labels, despite the
# *_prob variable names.
ANN_prediction_prob = ANN_model.predict(vectorized_comment)
ANN_prediction = (ANN_prediction_prob > 0.5).astype(int)
prediction_label_ANN = _label_for(ANN_prediction)
print('ANN_Prediction:', prediction_label_ANN)

NB_prediction_prob = NB_model.predict(vectorized_comment)
NB_prediction = (NB_prediction_prob > 0.5).astype(int)
prediction_label_NB = _label_for(NB_prediction)
print('NB_Prediction:', prediction_label_NB)

SVC_prediction_prob = SVC_model.predict(vectorized_comment)
SVC_prediction = (SVC_prediction_prob > 0.5).astype(int)
prediction_label_SVC = _label_for(SVC_prediction)
print('SVC_Prediction:', prediction_label_SVC)

LR_prediction_prob = LR_Model.predict(vectorized_comment)
LR_prediction = (LR_prediction_prob > 0.5).astype(int)
prediction_label_LR = _label_for(LR_prediction)
print('LR_Prediction:', prediction_label_LR)


**IMPROVE OUR TESTING**

In [None]:
# Clean and encode a second hand-written comment exactly like the training data.
our_comment = "i hate you"
preprocessed_comment = preprocess_text(our_comment)
vectorized_comment = vectorizer.transform([preprocessed_comment]).toarray()

# Short key -> (fitted model, name shown in the printed line).
models = dict(
    ANN=(ANN_model, 'Artificial_Neural_Network'),
    NB=(NB_model, 'Naive_Bayes'),
    SVC=(SVC_model, 'Support_Vector_Machine'),
    LR=(LR_Model, 'Logistic_Regression'),
)

# Ask each model in turn; every output is thresholded at 0.5 before labelling.
for model_key, (model, model_label) in models.items():
    prediction_prob = model.predict(vectorized_comment)
    prediction = (prediction_prob > 0.5).astype(int)
    if prediction == 1:
        prediction_label = 'toxic'
    else:
        prediction_label = 'non-toxic'
    print(f'{model_label} Prediction:', prediction_label)

