# **Data Loading and Preparation**

In [8]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_csv('/content/drive/My Drive/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/test.csv')
train_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# **Text Preprocessing and Feature Engineering**

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Define the target labels
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Define the features
features = ['comment_text']

# Combine the training and test data
data_df = pd.concat([train_df[features], test_df[features]], axis=0)


def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Clean the text data
data_df['comment_text'] = data_df['comment_text'].apply(lambda x: clean_text(x))

# Convert the text data to numerical features
vectorizer = TfidfVectorizer()
features_tfidf = vectorizer.fit_transform(data_df['comment_text'])

# Split the data back into training and test sets
train_features = features_tfidf[:train_df.shape[0]]
test_features = features_tfidf[train_df.shape[0]:]
train_labels = train_df[labels]

X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_labels, test_size=0.2, random_state=42)



# **Model Training, Fine-tuning, and Evaluation using XGBoost and Sklearn Metrics**

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import xgboost as xgb

# Train the XGBoost model
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Fine-tune the model by retraining with additional data
xgb_classifier.fit(train_features, train_labels)

# Make predictions on the test data
test_predictions = xgb_classifier.predict(test_features)

# Evaluate the performance of the model
for i, label in enumerate(labels):
    precision = precision_score(y_valid[label], xgb_classifier.predict(X_valid)[:,i])
    recall = recall_score(y_valid[label], xgb_classifier.predict(X_valid)[:,i])
    f1 = f1_score(y_valid[label], xgb_classifier.predict(X_valid)[:,i])
    print(f"Category: {label}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}\n")


Category: toxic
Precision: 0.967741935483871
Recall: 0.68717277486911
F1 Score: 0.8036739380022961

Category: severe_toxic
Precision: 0.9166666666666666
Recall: 0.5482866043613707
F1 Score: 0.6861598440545809

Category: obscene
Precision: 0.9476987447698745
Recall: 0.7924198250728863
F1 Score: 0.8631311527469038

Category: threat
Precision: 0.9491525423728814
Recall: 0.7567567567567568
F1 Score: 0.8421052631578948

Category: insult
Precision: 0.9217970049916805
Recall: 0.6864931846344485
F1 Score: 0.7869318181818181

Category: identity_hate
Precision: 0.9595375722543352
Recall: 0.564625850340136
F1 Score: 0.7109207708779444



# **Generating Predictions and Creating a Submission File**

In [None]:
from xgboost import XGBClassifier

# Make predictions on the test set
test_pred_proba = xgb_classifier.predict_proba(test_features)

# Create a submission DataFrame with the predicted probabilities
submission_df = pd.DataFrame(test_pred_proba, columns=labels)

# Add the id column from the test data to the submission DataFrame
submission_df['id'] = test_df['id']

# Reorder the columns so that id is the first column, followed by the target labels
submission_df = submission_df[['id'] + labels]

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)



# **Saving the trained XGBoost model to a file**

In [None]:
import pickle

# Save the trained model to a file
with open('xgb_classifier.pkl', 'wb') as f:
    pickle.dump(xgb_classifier, f)


# **Saving Trained Vectorizer to Pickle**

In [None]:
import pickle

# Convert the text data to numerical features
vectorizer = TfidfVectorizer()
features_tfidf = vectorizer.fit_transform(data_df['comment_text'])

# Save the trained vectorizer to a file
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


# Here is a link to my deployed model below

[click this](https://ayo-folashade-nlp-text-classification-streamlit-app-1ds89h.streamlit.app/)