In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score 

In [2]:
# Step 1: Convert the CSV file to a Pandas DataFrame
df = pd.read_csv('E:\Data Science Course - GUVI\Final Project\Project - 2\FinalBalancedDataset.csv')

In [3]:
# Step 2: Convert the text to Bag of Words and TF-IDF
bow_vectorizer = CountVectorizer()
bow_features = bow_vectorizer.fit_transform(df['Tweet'])

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['Tweet'])

In [5]:
# Step 3: Apply prediction methods
X_train_bow, X_test_bow, y_train, y_test = train_test_split(bow_features, df['Toxicity'], test_size=0.2, random_state=42)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(tfidf_features, df['Toxicity'], test_size=0.2, random_state=42)

In [6]:
# Decision Trees
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_bow, y_train)
dt_predictions_bow = dt_classifier.predict(X_test_bow)

dt_classifier.fit(X_train_tfidf, y_train)
dt_predictions_tfidf = dt_classifier.predict(X_test_tfidf)

In [7]:
# Random Forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_bow, y_train)
rf_predictions_bow = rf_classifier.predict(X_test_bow)

rf_classifier.fit(X_train_tfidf, y_train)
rf_predictions_tfidf = rf_classifier.predict(X_test_tfidf)

In [8]:
# Naive Bayes Model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)
nb_predictions_bow = nb_classifier.predict(X_test_bow)

nb_classifier.fit(X_train_tfidf, y_train)
nb_predictions_tfidf = nb_classifier.predict(X_test_tfidf)

In [9]:
# K-NN Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train_bow, y_train)
knn_predictions_bow = knn_classifier.predict(X_test_bow)

knn_classifier.fit(X_train_tfidf, y_train)
knn_predictions_tfidf = knn_classifier.predict(X_test_tfidf)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [10]:
# SVM
svm_classifier = SVC()
svm_classifier.fit(X_train_bow, y_train)
svm_predictions_bow = svm_classifier.predict(X_test_bow)

svm_classifier.fit(X_train_tfidf, y_train)
svm_predictions_tfidf = svm_classifier.predict(X_test_tfidf)

In [11]:
# Step 4: Calculate evaluation metrics
print("Bag of Words - Decision Trees:")
print("Precision:", precision_score(y_test, dt_predictions_bow))
print("Recall:", recall_score(y_test, dt_predictions_bow))
print("F1-Score:", f1_score(y_test, dt_predictions_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_predictions_bow))
print("ROC-AUC Score:", roc_auc_score(y_test, dt_predictions_bow))
print()

Bag of Words - Decision Trees:
Precision: 0.9347826086956522
Recall: 0.916957145786344
F1-Score: 0.925784080322948
Confusion Matrix:
[[6160  312]
 [ 405 4472]]
ROC-AUC Score: 0.9343747410019483



In [12]:
print("TF-IDF - Decision Trees:")
print("Precision:", precision_score(y_test, dt_predictions_tfidf))
print("Recall:", recall_score(y_test, dt_predictions_tfidf))
print("F1-Score:", f1_score(y_test, dt_predictions_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_predictions_tfidf))
print("ROC-AUC Score:", roc_auc_score(y_test, dt_predictions_tfidf))
print()

TF-IDF - Decision Trees:
Precision: 0.9184849732400164
Recall: 0.9149067049415625
F1-Score: 0.9166923472008217
Confusion Matrix:
[[6076  396]
 [ 415 4462]]
ROC-AUC Score: 0.9268600273780742



In [13]:
print("Bag of Words - Random Forest:")
print("Precision:", precision_score(y_test, rf_predictions_bow))
print("Recall:", recall_score(y_test, rf_predictions_bow))
print("F1-Score:", f1_score(y_test, rf_predictions_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions_bow))
print("ROC-AUC Score:", roc_auc_score(y_test, rf_predictions_bow))
print()

Bag of Words - Random Forest:
Precision: 0.9268394648829431
Recall: 0.9091654705761739
F1-Score: 0.9179173998550875
Confusion Matrix:
[[6122  350]
 [ 443 4434]]
ROC-AUC Score: 0.9275431802819064



In [14]:
print("TF-IDF - Random Forest:")
print("Precision:", precision_score(y_test, rf_predictions_tfidf))
print("Recall:", recall_score(y_test, rf_predictions_tfidf))
print("F1-Score:", f1_score(y_test, rf_predictions_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions_tfidf))
print("ROC-AUC Score:", roc_auc_score(y_test, rf_predictions_tfidf))
print()

TF-IDF - Random Forest:
Precision: 0.9295478443743428
Recall: 0.9062948533934796
F1-Score: 0.9177740863787376
Confusion Matrix:
[[6137  335]
 [ 457 4420]]
ROC-AUC Score: 0.9272667097622528



In [15]:
print("Bag of Words - Naive Bayes:")
print("Precision:", precision_score(y_test, nb_predictions_bow))
print("Recall:", recall_score(y_test, nb_predictions_bow))
print("F1-Score:", f1_score(y_test, nb_predictions_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, nb_predictions_bow))
print("ROC-AUC Score:", roc_auc_score(y_test, nb_predictions_bow))
print()

Bag of Words - Naive Bayes:
Precision: 0.8822495606326889
Recall: 0.9263891736723395
F1-Score: 0.9037807561512301
Confusion Matrix:
[[5869  603]
 [ 359 4518]]
ROC-AUC Score: 0.916609296354093



In [16]:
print("TF-IDF - Naive Bayes:")
print("Precision:", precision_score(y_test, nb_predictions_tfidf))
print("Recall:", recall_score(y_test, nb_predictions_tfidf))
print("F1-Score:", f1_score(y_test, nb_predictions_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, nb_predictions_tfidf))
print("ROC-AUC Score:", roc_auc_score(y_test, nb_predictions_tfidf))
print()

TF-IDF - Naive Bayes:
Precision: 0.9047915370255134
Recall: 0.8944022964937461
F1-Score: 0.8995669210146422
Confusion Matrix:
[[6013  459]
 [ 515 4362]]
ROC-AUC Score: 0.9117407032530536



In [17]:
print("Bag of Words - K-NN:")
print("Precision:", precision_score(y_test, knn_predictions_bow))
print("Recall:", recall_score(y_test, knn_predictions_bow))
print("F1-Score:", f1_score(y_test, knn_predictions_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, knn_predictions_bow))
print("ROC-AUC Score:", roc_auc_score(y_test, knn_predictions_bow))
print()

Bag of Words - K-NN:
Precision: 0.8760027504011002
Recall: 0.7836784908755382
F1-Score: 0.8272727272727273
Confusion Matrix:
[[5931  541]
 [1055 3822]]
ROC-AUC Score: 0.8500438189853586



In [18]:
print("TF-IDF - K-NN:")
print("Precision:", precision_score(y_test, knn_predictions_tfidf))
print("Recall:", recall_score(y_test, knn_predictions_tfidf))
print("F1-Score:", f1_score(y_test, knn_predictions_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, knn_predictions_tfidf))
print("ROC-AUC Score:", roc_auc_score(y_test, knn_predictions_tfidf))
print()

TF-IDF - K-NN:
Precision: 0.8955075701166543
Recall: 0.7397990567972114
F1-Score: 0.8102402874466652
Confusion Matrix:
[[6051  421]
 [1269 3608]]
ROC-AUC Score: 0.8373748065197429



In [19]:
print("Bag of Words - SVM:")
print("Precision:", precision_score(y_test, svm_predictions_bow))
print("Recall:", recall_score(y_test, svm_predictions_bow))
print("F1-Score:", f1_score(y_test, svm_predictions_bow))
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_predictions_bow))
print("ROC-AUC Score:", roc_auc_score(y_test, svm_predictions_bow))
print()

Bag of Words - SVM:
Precision: 0.96440489432703
Recall: 0.8888661062128358
F1-Score: 0.9250960307298336
Confusion Matrix:
[[6312  160]
 [ 542 4335]]
ROC-AUC Score: 0.9320721136750211



In [20]:
print("TF-IDF - SVM:")
print("Precision:", precision_score(y_test, svm_predictions_tfidf))
print("Recall:", recall_score(y_test, svm_predictions_tfidf))
print("F1-Score:", f1_score(y_test, svm_predictions_tfidf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_predictions_tfidf))
print("ROC-AUC Score:", roc_auc_score(y_test, svm_predictions_tfidf))
print()

TF-IDF - SVM:
Precision: 0.9674706793538393
Recall: 0.8964527373385278
F1-Score: 0.9306087696892296
Confusion Matrix:
[[6325  147]
 [ 505 4372]]
ROC-AUC Score: 0.9368697555666681

