<h1>Package Installation</h1>

In [60]:
# import os

# def install_packages():
#     packages = [
#         "pandas",
#         "nltk",
#         "scikit-learn",
#         "pyLDAvis",
#         "numpy",
#         "gensim",
#         "matplotlib",
#         "wordcloud",
#         "seaborn",
#     ]
#     for package in packages:
#         os.system(f"pip install {package}")

# install_packages()



<h1>Importing All Libraries</h1>

In [61]:
# Standard library
import re
import time
import warnings
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import seaborn as sns
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from wordcloud import WordCloud

# NLTK resources needed by this notebook.
# 'stopwords' is required by the stop-word filtering in the preprocessing and
# bag-of-words cells; without it nltk raises LookupError on a machine where the
# corpus has never been downloaded.
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

# Silence DeprecationWarning messages in cell outputs
warnings.filterwarnings('ignore', category=DeprecationWarning)


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/farhantahmid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/farhantahmid/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/farhantahmid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h1>Loading Data</h1>

In [62]:
# Load the dataset from a CSV file located in the working directory.
try:
    all_data = pd.read_csv('english_hate_speech.csv')
except Exception as e:
    # Report the problem, then re-raise: the statements below (and every later
    # cell) need `all_data`, so continuing would only replace the real error
    # with a confusing NameError.
    print(f"Error loading data: {e}")
    raise

print("Shape of the data: ", all_data.shape)  # inspecting the shape of the data


Shape of the data:  (14490, 4)


<h1>Data Preprocessing</h1>

<h3>Missing Values</h3>

In [63]:
# Count the missing values in every column before any cleaning
missing_values_count = all_data.isna().sum()
print(missing_values_count)
print("\n")

# Only 8 rows have no 'clean_text' value, so those rows are simply dropped
all_data.dropna(subset=['clean_text'], inplace=True)

# Count again to confirm the effect of the drop
print("Checking missing value after dropping data")
missing_values_count = all_data.isna().sum()
print(missing_values_count)

# Remove the 'clean_text' and 'Dataset' columns from the frame
all_data.drop(columns=['clean_text', 'Dataset'], inplace=True)

all_data.head()

Text          0
oh_label      0
Dataset       0
clean_text    8
dtype: int64


Checking missing value after dropping data
Text          0
oh_label      0
Dataset       0
clean_text    0
dtype: int64


Unnamed: 0,Text,oh_label
0,I give permission for Wikipedia to use the ...,0.0
1,` October 2010 (UTC) :::::It does look that wa...,0.0
2,RT @dcm81: #killerblondes ?? Have the producer...,0.0
3,@ShaofHappiness COME SAY HI TO ME THIS EVENING...,0.0
4,"@ManhattaKnight I mean he's gay, but he uses g...",1.0


<h3>Text Preprocessing</h3>

In [64]:
import string
from nltk.corpus import stopwords
import unicodedata

# Translation table that deletes ASCII punctuation and digits; built once
# instead of on every call.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Lazily-filled cache for the English stop-word set (see _english_stop_words).
_STOP_WORD_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOP_WORD_CACHE:
    _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORD_CACHE['english']


def preprocess_text(text):
  """
  Clean a single raw document.

  Steps, in order:
    - Lowercase all characters
    - Remove URLs (anything starting with "http" up to the next whitespace)
    - Remove ASCII punctuation and digits
    - Remove English stopwords (NLTK list)
    - Keep only the English letters a-z and whitespace

  Args:
      text: The text to preprocess (string). Any non-string value
            (e.g. None or NaN from a missing cell) is treated as empty.

  Returns:
      The preprocessed text (string); "" for non-string input.
  """

  # Missing values have no text to clean
  if not isinstance(text, str):
    return ""

  # Lowercase all characters
  text = text.lower()

  # Remove URLs using regular expressions (adapt if needed for specific URL formats)
  text = re.sub(r"http\S+", "", text)

  # Remove punctuation and digits
  text = text.translate(_PUNCT_DIGIT_TABLE)

  # Remove stopwords. The set is cached, so the corpus is loaded once rather
  # than on every call, and membership tests do not scan a list.
  # NOTE(review): punctuation was stripped above, so stop words that contain an
  # apostrophe can no longer match at this point - confirm this is acceptable.
  stop_words = _english_stop_words()
  text = " ".join(word for word in text.split() if word not in stop_words)

  # Keep only English letters. A Unicode category check ('Ll') would also keep
  # lowercase letters of other scripts (Cyrillic, Greek, accented Latin), so
  # the test is restricted to ASCII a-z.
  text = ''.join(c for c in text if c in string.ascii_lowercase or c.isspace())

  return text

In [65]:
# Clean every entry of the 'Text' column and store the result in a new column
all_data['pre_processed_text'] = all_data['Text'].apply(preprocess_text)
all_data.head()

Unnamed: 0,Text,oh_label,pre_processed_text
0,I give permission for Wikipedia to use the ...,0.0,give permission wikipedia use material nancy s...
1,` October 2010 (UTC) :::::It does look that wa...,0.0,october utc look way merger sounds like cathol...
2,RT @dcm81: #killerblondes ?? Have the producer...,0.0,rt dcm killerblondes producers lost plot eyes ...
3,@ShaofHappiness COME SAY HI TO ME THIS EVENING...,0.0,shaofhappiness come say hi evening ilu
4,"@ManhattaKnight I mean he's gay, but he uses g...",1.0,manhattaknight mean hes gay uses gendered slur...


<h3>Tokenizing the Clean Text Column</h3>

In [66]:

# Split each preprocessed document into a list of word tokens
all_data['tokenized_texts'] = all_data['pre_processed_text'].apply(word_tokenize)
all_data.head()

Unnamed: 0,Text,oh_label,pre_processed_text,tokenized_texts
0,I give permission for Wikipedia to use the ...,0.0,give permission wikipedia use material nancy s...,"[give, permission, wikipedia, use, material, n..."
1,` October 2010 (UTC) :::::It does look that wa...,0.0,october utc look way merger sounds like cathol...,"[october, utc, look, way, merger, sounds, like..."
2,RT @dcm81: #killerblondes ?? Have the producer...,0.0,rt dcm killerblondes producers lost plot eyes ...,"[rt, dcm, killerblondes, producers, lost, plot..."
3,@ShaofHappiness COME SAY HI TO ME THIS EVENING...,0.0,shaofhappiness come say hi evening ilu,"[shaofhappiness, come, say, hi, evening, ilu]"
4,"@ManhattaKnight I mean he's gay, but he uses g...",1.0,manhattaknight mean hes gay uses gendered slur...,"[manhattaknight, mean, hes, gay, uses, gendere..."


<h3>Lemmatize the Tokenized Texts Column</h3>

In [67]:
from nltk.stem import WordNetLemmatizer

# # nltk.download('wordnet')


# Single WordNet lemmatizer instance, reused for every document
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """
    Lemmatize a tokenized document and return it as one string.

    Args:
        text: Iterable of word tokens belonging to one document.

    Returns:
        The lemmatized tokens joined by single spaces, with digit runs and
        http(s) URLs removed from the joined string.
    """
    joined = " ".join(wordnet_lemmatizer.lemmatize(token) for token in text)
    # Strip any remaining numbers and URLs from the joined text
    return re.sub(r'[0-9]+|\bhttps?://\S+\b', '', joined)

# Lemmatize every token list and store the joined result in a new column
all_data['lemmatized_text'] = all_data['tokenized_texts'].apply(lemmatizer)
all_data.head()


Unnamed: 0,Text,oh_label,pre_processed_text,tokenized_texts,lemmatized_text
0,I give permission for Wikipedia to use the ...,0.0,give permission wikipedia use material nancy s...,"[give, permission, wikipedia, use, material, n...",give permission wikipedia use material nancy s...
1,` October 2010 (UTC) :::::It does look that wa...,0.0,october utc look way merger sounds like cathol...,"[october, utc, look, way, merger, sounds, like...",october utc look way merger sound like catholi...
2,RT @dcm81: #killerblondes ?? Have the producer...,0.0,rt dcm killerblondes producers lost plot eyes ...,"[rt, dcm, killerblondes, producers, lost, plot...",rt dcm killerblondes producer lost plot eye fa...
3,@ShaofHappiness COME SAY HI TO ME THIS EVENING...,0.0,shaofhappiness come say hi evening ilu,"[shaofhappiness, come, say, hi, evening, ilu]",shaofhappiness come say hi evening ilu
4,"@ManhattaKnight I mean he's gay, but he uses g...",1.0,manhattaknight mean hes gay uses gendered slur...,"[manhattaknight, mean, hes, gay, uses, gendere...",manhattaknight mean he gay us gendered slur ma...


<h1>Feature Extraction</h1>

<h3>Bag of Words and TFIDF Counts of common words</h3>

In [68]:
from nltk.corpus import  stopwords
from collections import  Counter
# nltk.download('stopwords')
from scipy.sparse import csr_matrix

def create_bag_of_words(text_column):
    """
    Count how often each non-stop-word token occurs across a set of documents.

    Every document is lowercased, stripped of all characters that are neither
    alphanumeric nor whitespace, tokenized with NLTK and filtered against the
    English stop-word list. The counts are accumulated over ALL documents, so
    the result describes the whole collection, not individual documents.

    Args:
        text_column: Iterable of strings, one string per document (for example
            a pandas Series). Entries that are not strings, such as None or
            NaN, are skipped.

    Returns:
        collections.Counter: Maps each remaining token to its total number of
        occurrences across the collection.
    """
    word_counts = Counter()
    stop_words = set(stopwords.words('english'))

    for text in text_column:
        # Missing values carry no tokens; skip them instead of failing on .lower()
        if not isinstance(text, str):
            continue

        # Normalize: lowercase, keep only alphanumeric characters and whitespace
        text = text.lower()
        text = ''.join(char for char in text if char.isalnum() or char.isspace())

        # Tokenize the text
        tokens = nltk.word_tokenize(text)

        # Count every token that is not a stop word
        word_counts.update(token for token in tokens if token not in stop_words)

    return word_counts


In [69]:
bag_of_words = create_bag_of_words(all_data['lemmatized_text'])
# Show only the 20 most frequent tokens; printing the whole Counter would dump
# every distinct token of the collection into the cell output.
print(bag_of_words.most_common(20))



In [70]:

# # Apply the BOW to all data

# def bag_of_words(data, text_column):
#     """
#     Creates a Bag-of-Words (BoW) representation from a given dataset.

#     Args:
#         data (pd.DataFrame): DataFrame containing the text data.
#         text_column (str): Name of the column containing the text data.

#     Returns:
#         tuple: A tuple containing the BoW matrix and the fitted CountVectorizer object.
#     """

#     # Select the text column
#     text_data = data[text_column]

#     # Create a CountVectorizer object
#     vectorizer = CountVectorizer()

#     # Transform the text into a BoW representation
#     bow_matrix = vectorizer.fit_transform(text_data)
#     # # Convert the sparse matrix to a dense array and then to a DataFrame
#     # bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

#     return bow_matrix, vectorizer

# BoW_all_data, vectorizer = bag_of_words(all_data,"lemmatized_text")

# print(BoW_all_data.toarray()[:5])

# # Access the vocabulary from the vectorizer
# vocabulary = vectorizer.get_feature_names_out()
# print(f"Vocabulary: {vocabulary[:10]}")  # Print the first 10 words

# from sklearn.feature_extraction.text import CountVectorizer
# count_vectorizer = CountVectorizer()
# bow = count_vectorizer.fit_transform(all_data['lemmatized_text'][:1000])

# print(bow.toarray())

# count_array = bow.toarray()

# features = count_vectorizer.get_feature_names_out()
# BoW_all_data = pd.DataFrame(data=count_array, columns=features)

# BoW_all_data.head()


In [71]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf = tfidf_vectorizer.fit_transform(all_data['lemmatized_text'])

# count_array = tfidf.toarray()

# features = tfidf_vectorizer.get_feature_names_out()
# TFIDF_all_data = pd.DataFrame(data=count_array, columns=features)

# TFIDF_all_data.head()