#Importing Required Modules

In [35]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import gensim.downloader as api
from transformers import BertTokenizer, BertModel
import torch

#NLTK Resource Downloads

In [36]:
# Fetch the NLTK corpora needed by the preprocessing function further down:
# 'stopwords' is read via stopwords.words('english') and 'wordnet' is the
# corpus used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load the Dataset

In [37]:
# Read the labelled tweets into a DataFrame.
# NOTE(review): this is an absolute path (looks like a Colab upload location);
# adjust it when running the notebook elsewhere.
df = pd.read_csv("/content/labeled_data.csv")

#Display initial information about the dataset

In [38]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [39]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [40]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


## Check for missing values

In [8]:
# Per-column count of missing values.
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
count,0
hate_speech,0
offensive_language,0
neither,0
class,0
tweet,0


# Select only the columns needed for classification


In [41]:
# Keep only the label and the raw text. The explicit .copy() makes the result
# an independent DataFrame, so adding new columns to it later does not raise
# pandas' SettingWithCopyWarning (assignment on a slice of another frame).
df = df[['class', 'tweet']].copy()

In [42]:
# Display the reduced frame (the rendered output elides the middle rows).
df

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...
24778,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,2,"you've gone and broke the wrong heart baby, an..."
24780,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,1,youu got wild bitches tellin you lies


#Data Preprocessing

### Function to clean and preprocess text

In [44]:
# Lazily-built default resources shared by every preprocess_text call, so the
# stop-word list is loaded once instead of once per token.
_PREPROCESS_DEFAULTS = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Clean a raw tweet and return its lemmatized tokens joined by spaces.

    Steps, in order: remove URLs (``http...`` / ``www...``), remove hashtags
    (``#...``), replace every non-letter character with a space, lowercase,
    split on whitespace, drop stop words, and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Tokens to drop (checked against the lowercased token before
        lemmatization). Defaults to NLTK's English stop words, loaded once
        and cached as a frozenset for O(1) membership tests.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to a single shared ``WordNetLemmatizer`` instance.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        # stopwords.words() re-reads the corpus and returns a list; calling it
        # per token (as before) made preprocessing needlessly slow.
        if 'stop_words' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_DEFAULTS['stop_words']
    if lemmatizer is None:
        if 'lemmatizer' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _PREPROCESS_DEFAULTS['lemmatizer']

    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub('[^a-zA-Z]', ' ', text)  # Remove special characters and numbers
    tokens = text.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

###Apply text preprocessing

In [45]:
# Run the cleaning function over every tweet and keep the result alongside the raw text.
df['cleaned_tweet'] = df['tweet'].map(preprocess_text)

# Data Splitting

# Split dataset into training and testing sets

In [46]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_tweet'],
    df['class'],
    test_size=0.2,
    random_state=42,
)

#Feature Extraction Using TF-IDF

In [47]:
# The vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

##Model Training and Evaluation: Logistic Regression with TF-IDF

In [67]:
# Logistic regression fitted on the TF-IDF training matrix. max_iter=1000 raises
# the solver's iteration cap (presumably to avoid convergence warnings — confirm).
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

##Evaluate model performance

In [69]:
# Score the TF-IDF classifier on the held-out split.
y_pred_tfidf = model.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
tfidf_report = classification_report(y_test, y_pred_tfidf)
print("\n=== TF-IDF Results ===")
print("Accuracy:", tfidf_accuracy)
print(tfidf_report)


=== TF-IDF Results ===
Accuracy: 0.8916683477910026
              precision    recall  f1-score   support

           0       0.48      0.16      0.24       290
           1       0.92      0.96      0.94      3832
           2       0.82      0.83      0.83       835

    accuracy                           0.89      4957
   macro avg       0.74      0.65      0.67      4957
weighted avg       0.87      0.89      0.88      4957



## Load Pre-Trained Word Embeddings (GloVe `glove-wiki-gigaword-50` via gensim, used here as the "Word2Vec" features)

In [50]:
# Load pretrained word vectors through gensim's downloader.
# NOTE: despite the variable name, "glove-wiki-gigaword-50" is a GloVe model,
# not Word2Vec; the trailing 50 is presumably the embedding dimensionality.
word2vec = api.load("glove-wiki-gigaword-50")

##Function to compute average Word2Vec representation

In [51]:
def get_average_word2vec(text, model, vector_size):
    """Return the element-wise mean of the embeddings of the known words in `text`.

    `text` is split on whitespace; tokens for which ``token in model`` is false
    are ignored. When no token is known (including empty text), a zero vector
    of length `vector_size` is returned instead.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

##Feature Extraction Using Word2Vec

In [52]:
# Take the embedding width from the loaded vectors instead of hard-coding 50, so
# swapping in a different pretrained model cannot leave the zero-vector fallback
# with a length that disagrees with the real embeddings.
w2v_dim = word2vec.vector_size
X_train_w2v = np.array([get_average_word2vec(text, word2vec, w2v_dim) for text in X_train])
X_test_w2v = np.array([get_average_word2vec(text, word2vec, w2v_dim) for text in X_test])

##Model Training and Evaluation: Logistic Regression with Word2Vec

In [70]:
# Fit a dedicated estimator (same hyperparameters) for the averaged-embedding
# features, so this cell does not overwrite the classifier stored in `model`
# and can be re-run independently of the other experiments.
lr_model_w2v = LogisticRegression(max_iter=1000)
lr_model_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = lr_model_w2v.predict(X_test_w2v)

In [71]:
# Report accuracy and per-class metrics for the averaged-embedding classifier.
w2v_accuracy = accuracy_score(y_test, y_pred_w2v)
w2v_report = classification_report(y_test, y_pred_w2v)
print("\n=== Word2Vec Results ===")
print("Accuracy:", w2v_accuracy)
print(w2v_report)


=== Word2Vec Results ===
Accuracy: 0.8414363526326407
              precision    recall  f1-score   support

           0       0.33      0.04      0.08       290
           1       0.86      0.96      0.91      3832
           2       0.73      0.58      0.65       835

    accuracy                           0.84      4957
   macro avg       0.64      0.53      0.55      4957
weighted avg       0.81      0.84      0.82      4957



## Load BERT Model and Tokenizer

In [72]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

##Function to get BERT embeddings for text

In [56]:
def get_bert_embedding(text):
    """Embed `text` by mean-pooling BERT's last hidden state.

    The text is tokenized with `bert_tokenizer` (truncated to at most 512
    tokens), run through `bert_model` with gradient tracking disabled, and the
    resulting `last_hidden_state` is averaged over dimension 1, squeezed, and
    returned as a NumPy array.

    NOTE(review): no attention-mask weighting is applied, so if a padded batch
    of texts were passed, padded positions would be included in the mean.
    For a single string the result is presumably a 1-D vector — confirm.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

##Feature Extraction Using BERT

In [73]:
# Only the first 500 rows of each split are embedded (texts are encoded one at a time).
X_train_bert = np.array(list(map(get_bert_embedding, X_train[:500])))
X_test_bert = np.array(list(map(get_bert_embedding, X_test[:500])))

## Model Training and Evaluation: Logistic Regression with BERT Embeddings

In [74]:
# Fit a dedicated estimator (same hyperparameters) for the BERT features, so this
# cell does not overwrite the classifier stored in `model` and can be re-run
# independently of the other experiments. Labels are sliced to the same first
# 500 rows that were embedded.
lr_model_bert = LogisticRegression(max_iter=1000)
lr_model_bert.fit(X_train_bert, y_train[:500])
y_pred_bert = lr_model_bert.predict(X_test_bert)

In [75]:
# Evaluate against the same 500-row slice of the test labels that was embedded.
y_test_bert_subset = y_test[:500]
bert_accuracy = accuracy_score(y_test_bert_subset, y_pred_bert)
print("Accuracy:", bert_accuracy)
print(classification_report(y_test_bert_subset, y_pred_bert))

Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.38      0.10      0.16        29
           1       0.86      0.94      0.90       390
           2       0.63      0.49      0.56        81

    accuracy                           0.82       500
   macro avg       0.62      0.51      0.54       500
weighted avg       0.79      0.82      0.80       500



##Model Training and Evaluation: Random Forest with TF-IDF

In [76]:
# 100-tree random forest on the TF-IDF features; fit() returns the estimator itself.
rf_model_tfidf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)
y_pred_rf_tfidf = rf_model_tfidf.predict(X_test_tfidf)

In [77]:
# Accuracy and per-class metrics for the random forest on TF-IDF features.
rf_tfidf_accuracy = accuracy_score(y_test, y_pred_rf_tfidf)
rf_tfidf_report = classification_report(y_test, y_pred_rf_tfidf)
print("Accuracy:", rf_tfidf_accuracy)
print(rf_tfidf_report)

Accuracy: 0.8952995763566673
              precision    recall  f1-score   support

           0       0.39      0.14      0.21       290
           1       0.93      0.95      0.94      3832
           2       0.81      0.91      0.85       835

    accuracy                           0.90      4957
   macro avg       0.71      0.67      0.67      4957
weighted avg       0.88      0.90      0.88      4957



##Model Training and Evaluation: Random Forest with Word2Vec

In [78]:
# 100-tree random forest on the averaged word-embedding features; fit() returns the estimator itself.
rf_model_w2v = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_w2v, y_train)
y_pred_rf_w2v = rf_model_w2v.predict(X_test_w2v)

In [79]:
# Accuracy and per-class metrics for the random forest on averaged word embeddings.
rf_w2v_accuracy = accuracy_score(y_test, y_pred_rf_w2v)
rf_w2v_report = classification_report(y_test, y_pred_rf_w2v)
print("Accuracy:", rf_w2v_accuracy)
print(rf_w2v_report)

Accuracy: 0.8382085939076054
              precision    recall  f1-score   support

           0       0.31      0.03      0.05       290
           1       0.85      0.97      0.91      3832
           2       0.79      0.50      0.62       835

    accuracy                           0.84      4957
   macro avg       0.65      0.50      0.52      4957
weighted avg       0.81      0.84      0.81      4957



## Model Training and Evaluation: Random Forest with BERT Embeddings

In [80]:
# 100-tree random forest on the BERT features; labels are sliced to the same
# first 500 rows that were embedded. fit() returns the estimator itself.
rf_model_bert = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_bert, y_train[:500])
y_pred_rf_bert = rf_model_bert.predict(X_test_bert)

In [81]:
# Evaluate against the same 500-row slice of the test labels that was embedded.
# This model predicts no samples for class 0, so precision for that class is
# undefined; zero_division=0 reports it as 0.0 (the same number as before) without
# emitting an UndefinedMetricWarning for every averaged metric.
y_test_rf_bert_subset = y_test[:500]
print("Accuracy:", accuracy_score(y_test_rf_bert_subset, y_pred_rf_bert))
print(classification_report(y_test_rf_bert_subset, y_pred_rf_bert, zero_division=0))

Accuracy: 0.804
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        29
           1       0.80      0.99      0.89       390
           2       0.79      0.19      0.30        81

    accuracy                           0.80       500
   macro avg       0.53      0.39      0.40       500
weighted avg       0.76      0.80      0.74       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
