In [25]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
import nltk
import re
from tqdm import tqdm
import numpy as np

In [26]:
def pre_processing(tweet: str):
    
    # Remove Leading Blank Spaces
    tweet = tweet.strip()
    
    # Lower Case
    tweet = tweet.lower()
    
    # Remove URLS 
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    tweet = re.sub(url_pattern, "", tweet)
    
    # Remove UserName
    username_pattern = re.compile(r"@\w+")
    tweet = re.sub(username_pattern, "", tweet)
    
    # Remove Hashtags
    hashtag_pattern = re.compile(r"#\w+")
    tweet = re.sub(hashtag_pattern, "", tweet)
    
    # Character normalization // todaaaaay -> today
    tweet = re.sub(r"([a-zA-Z])\1{2,}", r'\1', tweet)
    
    # Remove Special Characters
    tweet = re.sub(r'[^a-zA-Z\s]', "", tweet)
    
    # Word Tokenizer
    tweet = nltk.word_tokenize(tweet)
    
    # Remove Stop Words 
    stop_words = set([re.sub(r'[^a-zA-Z\s]', "", word) for word in nltk.corpus.stopwords.words("english")])
    tweet = [word for word in tweet if word not in stop_words]
    
    # lemmatization
    def get_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"N": "n", "V": "v", "R": "r", "J": "a"}
        return tag_dict.get(tag, "n")
    
    lemma = nltk.stem.WordNetLemmatizer()
    tweet = [lemma.lemmatize(word, pos=get_pos(word)) for word in tweet]
    
    return tweet


In [27]:
df = pd.read_csv("labeled_data.csv")

In [28]:
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [29]:
df = df.sample(frac=0.01, random_state=42)  # Sampling 10% of the data, you can adjust the fraction as needed

# Reset index of the sampled DataFrame
df.reset_index(drop=True, inplace=True)

In [30]:
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,2326,3,0,3,0,1,934 8616\ni got a missed call from yo bitch
1,16283,3,0,3,0,1,RT @KINGTUNCHI_: Fucking with a bad bitch you ...
2,19362,3,0,1,2,2,RT @eanahS__: @1inkkofrosess lol my credit ain...
3,16780,3,0,3,0,1,RT @Maxin_Betha Wipe the cum out of them faggo...
4,13654,3,1,2,0,1,Niggas cheat on they bitch and don't expect no...
...,...,...,...,...,...,...,...
243,3763,3,0,3,0,1,@JimNorton Hey Jim you'll love this http://www...
244,7111,3,0,3,0,1,@reneeXOmarie yeah she didnt wana bring that p...
245,17714,3,0,0,3,2,"RT @SimplyPerfectt_: Girls, don't let a guy tr..."
246,11408,3,0,3,0,1,I wonder where all these bitches I got at ?


In [31]:
df['processed_text'] = df['tweet'].apply(pre_processing)

In [32]:
df['processed_text']

0                           [get, miss, call, yo, bitch]
1      [rt, fuck, bad, bitch, go, need, money, lil, h...
2      [rt, lol, credit, aint, near, good, know, righ...
3      [rt, wipe, cum, faggot, rt, contact, lens, wil...
4      [nigga, cheat, bitch, expect, pay, back, whats...
                             ...                        
243                                     [hey, jim, love]
244    [yeah, wana, bring, pussy, u, tell, u, could, ...
245    [rt, girl, let, guy, treat, like, yellow, star...
246                                 [wonder, bitch, get]
247    [rt, yall, nigga, entertain, nicole, bitch, se...
Name: processed_text, Length: 248, dtype: object

In [33]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df["processed_text"].values, df["class"].values, train_size=0.8)

# Building the vocabulary
vocab = set()
for text in x_train:
    for word in text:
        vocab.add(word)
print("Vocab Size :", len(vocab))

Vocab Size : 762


In [34]:
from gensim.models import Word2Vec

g_model = Word2Vec(vector_size=200, window=5, workers=5)
g_model.build_vocab(x_train)
g_model.train(x_train, total_examples=g_model.corpus_count, epochs=500)

(61809, 749500)

In [35]:
def in_vocab(word_l):
    for word in word_l:
        if word not in g_model.wv:
            return False
    else:
        return True

train_vec = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in x_train]
test_vec  = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in x_test]

In [36]:
from sklearn.metrics import classification_report, accuracy_score


In [37]:
model = LogisticRegression(max_iter=1000)
model.fit(train_vec, y_train)

# Predicting on test data
predict = model.predict(test_vec)

# Evaluating the model
print("Accuracy Score :", accuracy_score(y_test, predict), end='\n\n')
print(classification_report(y_true=y_test, y_pred=predict))

Accuracy Score : 0.7

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.70      1.00      0.82        35
           2       0.00      0.00      0.00        10

    accuracy                           0.70        50
   macro avg       0.23      0.33      0.27        50
weighted avg       0.49      0.70      0.58        50



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
def predict_offensive(text):
    processed_text = preprocess_text(text)
    word_list = nltk.word_tokenize(processed_text)
    vector = [g_model.wv[word] for word in word_list if word in g_model.wv]
    if not vector:
        print("Clean sentence.")
        return
    vector_mean = sum(vector) / len(vector)
    prediction = model.predict([vector_mean])
    if prediction[0] == 1:
        print("Hate speech or offensive language detected.")
    else:
        print("Clean sentence.")

In [40]:
input_sentence = input("Enter a sentence: ")

# Predict
predict_offensive(input_sentence)

Enter a sentence: good
Clean sentence.


In [41]:
# def replace_negative_words(text):
#     negative_keywords = extract_negative_keywords(text)
#     positive_replacements = {'hate': 'love', 'terrible': 'excellent', 'negative_word_3': 'positive_word_3'}
#     replaced_text = text
#     for word in negative_keywords:
#         if word in positive_replacements:
#             replaced_text = replaced_text.replace(word, positive_replacements[word])
#     return replaced_text

In [47]:
from rake_nltk import Rake

In [48]:
# def extract_negative_keywords(text):
#     r = Rake()
#     r.extract_keywords_from_text(text)
#     return r.get_ranked_phrases()

In [50]:
def predict_offensive_2(text):
    processed_text = preprocess_text(text)
    word_list = nltk.word_tokenize(processed_text)
    vector = [g_model.wv[word] for word in word_list if word in g_model.wv]
    if not vector:
        print("Clean sentence.")
        return
    vector_mean = sum(vector) / len(vector)
    prediction = model.predict([vector_mean])
    if prediction[0] == 1:
        print("Hate speech or offensive language detected.")
        replaced_text = replace_negative_words(text)
        print("Replaced sentence:", replaced_text)
    else:
        print("Clean sentence.")

In [55]:
input_sentence = input("Enter a sentence: ")

Enter a sentence: i hate you nigga


In [56]:
predict_offensive_2(input_sentence)

Hate speech or offensive language detected.
Replaced sentence: i love you nigga


In [61]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adith\AppData\Roaming\nltk_data...


True

In [62]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import random
sia = SentimentIntensityAnalyzer()

In [67]:
def predict_offensive_3(text):
    processed_text = preprocess_text(text)
    word_list = nltk.word_tokenize(processed_text)
    vector = [g_model.wv[word] for word in word_list if word in g_model.wv]
    if not vector:
        print("Clean sentence.")
        return
    vector_mean = sum(vector) / len(vector)
    prediction = model.predict([vector_mean])
    if prediction[0] == 1:
        print("Hate speech or offensive language detected.")
        replaced_text = replace_negative_words(text)
        print("Replaced sentence:", replaced_text)
    else:
        print("Clean sentence.")

# Function to replace negative words with positive ones
def replace_negative_words(text):
    word_list = nltk.word_tokenize(text)
    replaced_text = text
    for i, word in enumerate(word_list):
        if is_negative_word(word):
            positive_word = get_random_positive_word()
            replaced_text = replaced_text.replace(word, positive_word, 1)  # Replace only the first occurrence
    return replaced_text

# Function to check if a word is negative using VADER sentiment analysis
def is_negative_word(word):
    # Use VADER to get the sentiment score of the word
    compound_score = sia.polarity_scores(word)['compound']
    return compound_score < 0  # Negative compound score indicates negative sentiment

# Function to get a random positive word
def get_random_positive_word():
    positive_words = ['love', 'excellent', 'amazing', 'joyful', 'wonderful']  # Add more positive words if needed
    return random.choice(positive_words)

# Take input sentence
input_sentence = input("Enter a sentence: ")

Enter a sentence: i hate you bitch die


In [68]:
predict_offensive_3(input_sentence)

Hate speech or offensive language detected.
Replaced sentence: i amazing you love wonderful
