# Youtube Hate Speech ML Project

### First thing is to import libraries.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### Next, import the libraries used for text cleaning and preprocessing.

In [None]:
import re
import string
import nltk
nltk.download('stopwords')
from nltk.util import pr
from nltk.stem.snowball import SnowballStemmer

### `nltk.corpus` is a module of the Natural Language Toolkit (NLTK). It provides access to large collections of text (corpora) for linguistic analysis and development; here we use its English stop-word list.

In [None]:
from nltk.corpus import stopwords
import string

# English stop words as a set, so membership checks during cleaning are O(1).
stopword = set(stopwords.words("english"))

# Snowball stemmer configured for English.
stemmer = nltk.SnowballStemmer("english")

In [None]:
# Read the dataset and discard every row that contains a missing value.
df = pd.read_csv("twitter_data.csv").dropna()

# Preview the first rows.
df.head()

In [None]:
# Show the column names of the loaded dataset.
print(list(df.columns))


### We use the `.map` function to convert the numeric `class` codes 0, 1, and 2 into the labels "Hate Speech Detected", "Offensive language detected", and "No hate and offensive speech", respectively.

In [None]:
# Human-readable name for each numeric class code.
class_names = {
    0: "Hate Speech Detected",
    1: "Offensive language detected",
    2: "No hate and offensive speech",
}
df['labels'] = df['class'].map(class_names)

In [None]:
# Summarise the new column (dtype, non-null count). `info` is a method and
# must be called; the bare attribute reference did nothing.
df['labels'].info()
df.head()

### We keep only the `tweet` and `labels` columns of the dataframe

In [None]:
# Keep only the text and its label. A `class` code with no entry in the label
# mapping yields NaN in `labels`; drop such rows rather than filling them with
# 0, which would mix an integer into the otherwise string-valued label column.
df = df[['tweet', 'labels']].dropna()
df.head()

### Now begins the process of cleaning the text

In [None]:
def clean(text, stop_words=None):
    """Normalise raw text into a space-separated string of lowercase tokens.

    Steps, in order: lowercase; remove ``[...]`` spans, URLs, ``<...>`` tags
    and ASCII punctuation; turn newlines into spaces; remove every token that
    contains a digit; remove stop words.

    Args:
        text: Value to clean. Non-string input is converted with ``str()``.
        stop_words: Optional collection of words to drop. When omitted, the
            notebook-level ``stopword`` set is used.

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopword
    text = str(text).lower()
    # Raw strings keep regex escapes (\[, \S, \w, \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so words on adjacent lines are not glued
    # together into a single token.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)
# Clean every tweet. `clean` always returns a string, so no NaN handling is
# needed afterwards (assigning a `dropna()` result back to the column would be
# a no-op anyway, because assignment re-aligns on the index).
df["tweet"] = df["tweet"].apply(clean)
df.head()

### Our data is ready. Now to build the classification model.

In [None]:
SEED = 42  # one seed for both the split and the tree, so reruns give the same model

x = np.array(df["tweet"])
y = np.array(df["labels"])

# Bag-of-words features: one column per vocabulary term, values are term counts.
cv = CountVectorizer()
x = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=SEED)

# Seed the tree: an unseeded DecisionTreeClassifier can produce a different
# tree, and therefore different predictions, on every run.
clf = DecisionTreeClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out split, which was otherwise never used.
clf.score(X_test, y_test)



### Testing the model

In [None]:
test_data = "i will kill you"
# Apply the same cleaning used for the training tweets, and store the vector
# under its own name: rebinding `df` here would replace the DataFrame and
# break re-running any earlier cell.
test_vector = cv.transform([clean(test_data)]).toarray()
print(clf.predict(test_vector))

## Conclusion of the Hate Speech Detection project
### This project ...

In [None]:
''' Citation: Simplilearn. (2022, November 16). Hate speech detection using machine learning: 
ML projects using python: Simplilearn. YouTube. https://youtu.be/jbexvUovHxw?si=jJGUJVUIWDSDtujG 
'''