# Training the model on whole dataset

In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/tdavidson/hate_speech_offensive/data/train-00000-of-00001.parquet")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
df.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
df.columns

Index(['count', 'hate_speech_count', 'offensive_language_count',
       'neither_count', 'class', 'tweet'],
      dtype='object')

In [None]:
# Dropping unnecessary Columns
df = df.drop(columns=['count', 'hate_speech_count', 'offensive_language_count', 'neither_count'], axis=1)

In [None]:
import spacy
import re

from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

In [None]:
# Text Preprocessing
def preprocessing(text):
  text = re.sub(r'@(\w*)', ' ', text, flags=re.MULTILINE)
  text = re.sub(r'RT', ' ', text, flags=re.MULTILINE)
  text = re.sub(r'[^\w\s\']', ' ', text)
  text = re.sub(r'\d', '', text)
  text = re.sub(r' +', ' ', text)
  text = re.sub(r'^\s+|\s+$', '', text)

  doc = nlp(text)

  no_stop_words = [token.text for token in doc if not token.is_stop] # "is_punct()" -----> is_punctuation()

  return " ".join(no_stop_words)

In [None]:
df['preprocessed_tweet'] = df['tweet'].apply(preprocessing)

In [None]:
df.head()

Unnamed: 0,class,tweet,preprocessed_tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman complain cleaning house amp man trash
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats cold tyga dwn bad cuffin dat hoe st p...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg fuck bitch start cry confused shit
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear true faker bitch told ya


**NOTE:**
For text classification `fastText` uses `supervised learning` method, and for that the file format should be like this...
`__label__`{category} {Text}  
Example:
`__label__`elctronics Apple iPhone 6S.

If you have multiple labels then...
`__label__`{category1} `__label__`{category2} {Text}

In [None]:
def add_label(num):
  if num == 0:
    return '__label__Hate_Speech'
  elif num == 1:
    return '__label__Offensive_Language'
  else:
    return '__label__Neither'

In [None]:
df['category'] = df['class'].apply(add_label)

In [None]:
df.head()

Unnamed: 0,class,tweet,preprocessed_tweet,category
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman complain cleaning house amp man trash,__label__Neither
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats cold tyga dwn bad cuffin dat hoe st p...,__label__Offensive_Language
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg fuck bitch start cry confused shit,__label__Offensive_Language
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,__label__Offensive_Language
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear true faker bitch told ya,__label__Offensive_Language


In [None]:
df['category_tweet'] = df['category'] + " " + df['preprocessed_tweet']

In [None]:
df.head()

Unnamed: 0,class,tweet,preprocessed_tweet,category,category_tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman complain cleaning house amp man trash,__label__Neither,__label__Neither woman complain cleaning house...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,boy dats cold tyga dwn bad cuffin dat hoe st p...,__label__Offensive_Language,__label__Offensive_Language boy dats cold tyga...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg fuck bitch start cry confused shit,__label__Offensive_Language,__label__Offensive_Language Dawg fuck bitch st...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,__label__Offensive_Language,__label__Offensive_Language look like tranny
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear true faker bitch told ya,__label__Offensive_Language,__label__Offensive_Language shit hear true fak...


In [None]:
# train test split
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=10)

In [None]:
train.shape, test.shape

((19826, 5), (4957, 5))

In [None]:
# Saving the train and test set to csv file
train.to_csv("content_train", columns=['category_tweet'], index=False, header=False)
test.to_csv("content_test", columns=['category_tweet'], index=False, header=False)

In [None]:
!pip install fasttext



In [None]:
import fasttext

model = fasttext.train_supervised(input="content_train")

In [None]:
model.test("content_test")

(4795, 0.9059436913451512, 0.9059436913451512)

In [None]:
# Save the model
model.save_model('content_moderator.bin')

In [None]:
moderator = fasttext.load_model('content_moderator.bin')

In [None]:
print(moderator.predict("I'll kill you")[0][0])

__label__Hate_Speech


In [None]:
print(moderator.predict("Fuck you")[0][0])

__label__Offensive_Language


In [None]:
print(moderator.predict("Hii! let's go home together")[0][0])

__label__Neither


## Demo

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.10.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.3 (from gradio)
  Downloading gradio_client-1.5.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [None]:
# Prediction function
def predict_content(text):
    text = preprocessing(text)
    prediction = moderator.predict(text)[0][0]

    if prediction == "__label__Hate_Speech":
        return "🚫 Hate Speech"
    elif prediction == '__label__Offensive_Language':
        return "⚠️ Offensive Language"
    else:
        return "✅ Neutral Content"

In [None]:
import gradio as gr

# Gradio Blocks with CSS for 100px Right Margin
with gr.Blocks(css="""
    .gradio-container {
        margin-right: 100px;
    }
""") as demo:
    gr.Markdown("# 📚 **Content Moderation Demo**")
    gr.Markdown("Enter text below and see if it's Hate Speech, Offensive, or Neutral.")

    with gr.Column():
        text_input = gr.Textbox(label="Enter Text for Moderation")
        submit_btn = gr.Button("Submit")
        output = gr.Label(label="Prediction")

    submit_btn.click(predict_content, inputs=text_input, outputs=output)

# Launch the GUI
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b6412d9270cc07da6b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


