<a href="https://colab.research.google.com/github/Beshoy-R/Beshoy-R/blob/main/LanguageEngineering(1)_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource the notebook relies on so that it runs from a
# fresh kernel without any manually pre-installed data.
nltk.download('punkt')      # tokenizer models used by nltk.word_tokenize
nltk.download('punkt_tab')  # recent NLTK releases load the tokenizer data from this package instead
nltk.download('stopwords')  # needed by stopwords.words('english') in the stop-word cell
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the comments into a DataFrame; 'train.csv' is a relative path, so the
# file must sit in the notebook's working directory.
dataset = pd.read_csv('train.csv')

In [3]:
# List the column labels to see what is available before dropping any.
columns = dataset.columns
print(columns)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')


**-> Dropping unwanted columns:**

In [4]:
# Keep only the comment text and the 'toxic' label; the id and the other
# label columns are discarded. Each column is listed exactly once.
dataset = dataset.drop(columns=['id', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])


In [5]:
# Print the frame after the drop to check which columns remain.
print(dataset)

                                           comment_text  toxic
0     Explanation\nWhy the edits made under my usern...      0
1     D'aww! He matches this background colour I'm s...      0
2     Hey man, I'm really not trying to edit war. It...      0
3     "\nMore\nI can't make any real suggestions on ...      0
4     You, sir, are my hero. Any chance you remember...      0
...                                                 ...    ...
2997  Re: All Items\nI know that you said I did some...      0
2998  "\nSo you not going tell me why you created so...      0
2999  John Phillip Key (born 9 August 1961, in Auckl...      0
3000  The Billy the Kid article with my contribution...      0
3001  Vandalizing the Macedonian towns \n\nFreestype...      0

[3002 rows x 2 columns]


**-> Clean the text data by removing unwanted characters, converting to lowercase, and removing URLs or mentions:**


In [6]:
def clean_text(text):
    """Normalise a raw comment before tokenisation.

    Removes URLs (anything starting with ``http`` or ``www.``) and
    ``@mentions``, lowercases what is left, and deletes every character that
    is not an ASCII letter or whitespace.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed, so a
        removed span can leave consecutive spaces behind.
    """
    # The dot after "www" is escaped so only a literal "www." prefix counts as
    # a URL; an unescaped "." matches any character and would also delete
    # ordinary words that merely begin with "www".
    text = re.sub(r'http\S+|www\.\S+|@\S+', '', text)
    # Lowercase, then drop digits, punctuation and any non-ASCII symbol.
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text


In [7]:
# Overwrite each comment with its cleaned form (clean_text is applied element-wise).
dataset['comment_text'] = dataset['comment_text'].apply(clean_text)

**-> Tokenize words:**

In [8]:
def tokenize_text(text):
    """Split a comment string into word tokens with NLTK's word tokenizer.

    Args:
        text: The comment string to split.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

# Swap every comment string for its tokenised form.
dataset['comment_text'] = dataset['comment_text'].apply(tokenize_text)


**-> Removing Stop Words:**

In [9]:
# Bind the stop-word set to its own name. Assigning it to `stopwords` would
# overwrite the imported corpus reader, and running this cell a second time
# would then fail because a set has no `.words` attribute.
english_stopwords = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Filter English stop words out of a token sequence.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A new list holding, in their original order, the tokens that are not
        members of ``english_stopwords``.
    """
    return [word for word in tokens if word not in english_stopwords]

# Replace each token list with its stop-word-free version.
dataset['comment_text'] = dataset['comment_text'].apply(remove_stopwords)


**-> Lemmatize the words to reduce them to their base form:**

In [10]:
# One shared lemmatizer instance, reused for every token in every comment.
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Lemmatize each token using the shared lemmatizer's default settings.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A list with one lemmatized entry per input token, in the same order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Replace each token list with its lemmatized counterpart.
dataset['comment_text'] = dataset['comment_text'].apply(lemmatize_words)


**-> Join the tokens back into a single string:**

In [11]:
def join_tokens(tokens):
    """Concatenate a sequence of string tokens into one space-delimited string.

    Args:
        tokens: Iterable of strings.

    Returns:
        The tokens joined by single spaces; an empty iterable gives ``''``.
    """
    separator = ' '
    return separator.join(tokens)

# Turn each token list back into a single string via join_tokens.
dataset['comment_text'] = dataset['comment_text'].apply(join_tokens)

**###### Done preparing the data (cleaning and preprocessing) #########################################################**

**## ALGORITHM (1) Logistic Regression: ##**

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
# Features are the preprocessed comment strings; the target is the 'toxic' column.
X, y = dataset['comment_text'], dataset['toxic']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary from the training comments only, then reuse it
# unchanged to encode the held-out comments.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit a logistic-regression classifier with its default settings
# (fit returns the estimator itself, so construction and training are chained).
LR_Model = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out comments and measure the fraction classified correctly.
y_pred = LR_Model.predict(X_test)
LR_accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage.
print(f"Logistic_Regression_Accuracy: {LR_accuracy*100}%")

Logistic_Regression_Accuracy: 90.34941763727122%


**## ALGORITHM (2) Support Vector Machine: ##**

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Features are the preprocessed comment strings; the target is the 'toxic' column.
X, y = dataset['comment_text'], dataset['toxic']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary from the training comments only, then reuse it
# unchanged to encode the held-out comments.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit a support-vector classifier with its default settings
# (fit returns the estimator itself, so construction and training are chained).
model = SVC().fit(X_train, y_train)

# Predict on the held-out comments and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage.
print(f"Support_Vector_Machine_Accuracy: {accuracy*100}%")

Support_Vector_Machine_Accuracy: 90.68219633943427%


**## ALGORITHM (3) Naïve Bayes: ##**

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Features are the preprocessed comment strings; the target is the 'toxic' column.
X, y = dataset['comment_text'], dataset['toxic']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary from the training comments only, then reuse it
# unchanged to encode the held-out comments.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Fit a multinomial Naïve Bayes classifier with its default settings
# (fit returns the estimator itself, so construction and training are chained).
model = MultinomialNB().fit(X_train, y_train)

# Predict on the held-out comments and measure the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage.
print(f"Naïve_Bayes_Accuracy: {accuracy*100}%")

Naïve_Bayes_Accuracy: 89.18469217970049%


**## ALGORITHM (4) Artificial Neural Networks: ##**

In [18]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run (some GPU kernels may still introduce
# small nondeterminism).
tf.keras.utils.set_random_seed(42)

# Split the dataset into features (X) and target variable (y)
X = dataset['comment_text']
y = dataset['toxic']

# Split the data into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Convert the text data to TF-IDF features; the sparse matrices are
# densified with .toarray() before being handed to Keras.
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

# Build the network: one hidden ReLU layer of 64 units feeding a single
# sigmoid unit. The input width equals the number of TF-IDF features.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the training set
model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1)

# Make predictions on the testing set. The single output unit yields one
# value per row in a column of shape (n_samples, 1); threshold at 0.5 and
# flatten so the predictions are 1-D like y_test.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Compute the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f"Artificial_Neural_Networks_Accuracy: {accuracy*100}%")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Artificial_Neural_Networks_Accuracy: 92.17970049916805%
