In [1]:
import re
import string
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

from tqdm import tqdm
import os
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch

import warnings
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
Data = pd.read_csv('HatespeechKenya.csv')
Data.info()
Data.head()


print("Missing Values:")
print(Data.isnull().sum())

print("Duplicate Entries:")
print(Data.duplicated().sum())


print("Stats:")
Data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155236 entries, 0 to 155235
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweet   155236 non-null  object
dtypes: object(1)
memory usage: 1.2+ MB
Missing Values:
tweet    0
dtype: int64
Duplicate Entries:
15805
Stats:


Unnamed: 0,tweet
count,155236
unique,139431
top,username;date;retweets;favorites;text;geo;ment...
freq,73


In [4]:
Original = len(Data)
Data = Data.drop_duplicates(subset='tweet', keep='first')
duplicates_removed = Original - len(Data)
print(f"Duplicates removed: {duplicates_removed}")

Duplicates removed: 15805


In [5]:
def clean_text(text):
    Text = re.sub(r'http\S+', '', text)
    Text = re.sub(r'[^a-zA-Z\s]', '', text)
    Text = Text.lower()
    return Text
Data['Cleaned_Tweet'] = Data['tweet'].apply(clean_text)
print("\nCleaned tweets:")
Data[['tweet', 'Cleaned_Tweet']].head()


Cleaned tweets:


Unnamed: 0,tweet,Cleaned_Tweet
0,";2017-10-18 11:04;0;0;""That #ShangweVedio shou...",that shangwevedio should be replaced where di...
1,";2017-10-18 09:15;0;1;""they wrote letters to i...",they wrote letters to iebc their bloggers tre...
2,";2017-10-18 09:06;0;2;""Akombe was a nasa mole ...",akombe was a nasa mole jubilee never wanted...
3,";2017-10-04 12:24;0;0;""@paulinenjoroge @OleItu...",paulinenjoroge oleitumbi jubileepartyk teamuh...
4,";2017-09-25 20:20;0;0;""#akombemustgo"";;;#akomb...",akombemustgoakombemustgohttpstwittercomdennis...


In [6]:
Data['Tokens'] = Data['Cleaned_Tweet'].apply(word_tokenize)

print("Tokenized tweets:")
Data[['Cleaned_Tweet', 'Tokens']].head()

Tokenized tweets:


Unnamed: 0,Cleaned_Tweet,Tokens
0,that shangwevedio should be replaced where di...,"[that, shangwevedio, should, be, replaced, whe..."
1,they wrote letters to iebc their bloggers tre...,"[they, wrote, letters, to, iebc, their, blogge..."
2,akombe was a nasa mole jubilee never wanted...,"[akombe, was, a, nasa, mole, jubilee, never, w..."
3,paulinenjoroge oleitumbi jubileepartyk teamuh...,"[paulinenjoroge, oleitumbi, jubileepartyk, tea..."
4,akombemustgoakombemustgohttpstwittercomdennis...,[akombemustgoakombemustgohttpstwittercomdennis...


In [15]:
stop_words = set(stopwords.words('english'))
Data['Removed_From_Tokens'] = Data['Tokens'].apply(lambda x: [word for word in x if word not in stop_words])

print("Tweets after stopword removal:")
Data[['Tokens', 'Removed_From_Tokens']].head()

Tweets after stopword removal:


Unnamed: 0,Tokens,Removed_From_Tokens
0,"[that, shangwevedio, should, be, replaced, whe...","[shangwevedio, replaced, akombemustgo, disappe..."
1,"[they, wrote, letters, to, iebc, their, blogge...","[wrote, letters, iebc, bloggers, trended, akom..."
2,"[akombe, was, a, nasa, mole, jubilee, never, w...","[akombe, nasa, mole, jubilee, never, wanted, s..."
3,"[paulinenjoroge, oleitumbi, jubileepartyk, tea...","[paulinenjoroge, oleitumbi, jubileepartyk, tea..."
4,[akombemustgoakombemustgohttpstwittercomdennis...,[akombemustgoakombemustgohttpstwittercomdennis...


In [None]:
Lemmatizeremmatizer = WordNetLemmatizer()

Data['Lemmatized'] = Data['Removed_From_Tokens'].apply(lambda x: [Lemmatizer.lemmatize(word) for word in x])


print("\nLemmatized tokens:")
Data[['Removed_From_Tokens', 'Lemmatized']].head()

In [None]:
Unusable = Data['Lemmatized'].apply(lambda x: len(x) == 0 or not isinstance(x, list)).sum()
print(f"Number of rows with empty or invalid tokens: {Unusable}")

Data[Data['Lemmatized'].apply(lambda x: len(x) == 0 or not isinstance(x, list))]

In [None]:
Data['Part_Of_Speech'] = Data['Lemmatized'].apply(nltk.pos_tag)
print("\nPOS Tagged tokens:")
Data[['Lemmatized', 'Part_Of_Speech']].head()

In [None]:
Chunked = r"NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(Chunked)
Data['Chunk'] = Data['Part_Of_Speech'].apply(chunk_parser.parse)
print("\nChunked phrases:")
for i, tree in enumerate(Data['Chunk'].head()):
    print(f"Tweet {i}:")
    print(tree)

In [None]:
Words = ' '.join([' '.join(tokens) for tokens in Data['Lemmatized']])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(Words)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Lemmatized Tokens")
plt.show()

In [None]:
Flat = [word for tokens in Data['Lemmatized'] for word in tokens]
Frequency = Counter(Flat).most_common(10)

words, counts = zip(*Frequency)
plt.figure(figsize=(10, 5))
plt.bar(words, counts)
plt.title("Top 10 Most Common Words")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.show()

In [None]:
!pip install squarify

import squarify

In [None]:
Frequency_Dict = dict(Frequency)
Size = list(Frequency_Dict.values())
Label = list(Frequency_Dict.keys())

plt.figure(figsize=(10, 5))
squarify.plot(sizes=Size, label=Label, alpha=0.8)
plt.title("Treemap of Most Common Words")
plt.axis('off')
plt.show()