<a href="https://colab.research.google.com/github/13-1550/projNLP/blob/main/Final_Proj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Cyber Bullying Detection**

By Using GapHate Corpus we imported data of catigorized tweets to train the data:

-vo : violent language or context

-hd : hate speech

-cv : esplicit calls to violence

In [1]:
!pip install scikit-learn pandas



In [2]:
import pandas as pd

In [5]:
train = pd.read_csv('ghc_train.tsv', sep='\t')
test = pd.read_csv('ghc_test.tsv', sep='\t')
test.sample(5)

Unnamed: 0,text,hd,cv,vo
5439,http://www.breitbart.com/london/2018/03/18/lef...,1,0,0
1768,"We need to get this to President Trump, so I'm...",0,0,0
3106,"Can we walk (or roll thru, if in a mobility de...",0,0,0
2396,Nice content on this page. - http://www.teslaf...,0,0,0
541,PELOSI IS PISSED! NEVER THOUGHT MSNBC WOULD AS...,0,0,0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer #
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier


In [7]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

In [10]:
vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(train['text'])

y_train = train[['hd', 'vo', 'cv']]



sgd_clf = MultiOutputClassifier(SGDClassifier())

_ = sgd_clf.fit(x_train_vec, y_train)

In [11]:
from sklearn import metrics

# **Preprocessing**

In [12]:
#removing URLS an special char
import re

def preprocess_text(text):

    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.strip()
    return text

#preprocessing
train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)



In [13]:
pip install nltk spacy gensim



In [16]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec


train = pd.read_csv('ghc_train.tsv', sep='\t')
test = pd.read_csv('ghc_test.tsv', sep='\t')

nltk.download('punkt_tab')
nltk.download('stopwords')

# Preprocessing Pipeline
def preprocess_text(text):
    #  Tokenization
    tokens = word_tokenize(text)

    # normalization: Lowercasing and removing punctuation
    normalized = [re.sub(r'[^\w\s]', '', token.lower()) for token in tokens]

    #  removing stopwords
    filtered = [word for word in normalized if word not in stopwords.words('english') and word != '']

    # 4. Stemming
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in filtered]

    return stemmed

# Preprocessing
train['processed_text'] = train['text'].apply(preprocess_text)
test['processed_text'] = test['text'].apply(preprocess_text)

# Word2Vec
all_sentences = train['processed_text'].tolist() + test['processed_text'].tolist()
embedding_model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=1, workers=4)

train['embeddings'] = train['processed_text'].apply(lambda x: [embedding_model.wv[word] for word in x if word in embedding_model.wv])
test['embeddings'] = test['processed_text'].apply(lambda x: [embedding_model.wv[word] for word in x if word in embedding_model.wv])

# Example output
print(train[['text', 'processed_text', 'embeddings']].head())


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0  He most likely converted to islam due to his n...   
1  So Ford lied about being a psychologist. Recor...   
2     Jobs. Education. Ending abuse of Nation. CA43.   
3  I share a lot of your values, & like many who ...   
4  I am so ready to get back to blogging! www.ben...   

                                      processed_text  \
0  [like, convert, islam, due, natur, suitabl, is...   
1  [ford, lie, psychologist, record, seem, indic,...   
2               [job, educ, end, abus, nation, ca43]   
3  [share, lot, valu, like, mani, nt, call, alt, ...   
4  [readi, get, back, blog, wwwbenbrihousecom, re...   

                                          embeddings  
0  [[-0.46432433, 0.815524, 0.5789813, 0.28381035...  
1  [[-0.09219599, 0.3420268, 0.24634716, 0.114879...  
2  [[-0.18768682, 0.63626, 0.45595437, 0.17196359...  
3  [[-0.23578328, 0.5211941, 0.3769944, 0.1132621...  
4  [[-0.12046853, 0.35940775, 0.25682473, 0.10532..

In [17]:
#vec
# Vectorize the text data
x_train_vec = vectorizer.fit_transform(train['text'])
x_test_vec = vectorizer.transform(test['text'])


# **Training the data**

In [18]:
y_train = train[['hd', 'vo', 'cv']]
y_test = test[['hd', 'vo', 'cv']]


sgd_clf = MultiOutputClassifier(SGDClassifier())
sgd_clf.fit(x_train_vec, y_train)

In [20]:
from sklearn.metrics import accuracy_score

def evaluate(model):
    y_pred = model.predict(x_test_vec)
    for i, col in enumerate(['hd', 'vo', 'cv']):
        accuracy = accuracy_score(test[col], y_pred[:, i])
        print(f'Accuracy for {col}: {100 * accuracy:.2f}%')

evaluate(sgd_clf)

Accuracy for hd: 91.25%
Accuracy for vo: 93.87%
Accuracy for cv: 99.56%


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.metrics import accuracy_score

train = pd.read_csv('ghc_train.tsv', sep='\t')
test = pd.read_csv('ghc_test.tsv', sep='\t')

x_test = test[['text']]
y_test = test[['cv','hd','vo']]

x_train = train[['text']]
y_train = train[['cv','hd','vo']]

#x_train = test[['text']]
#y_train = train[['hd']]

vectorizer = TfidfVectorizer()
x_train_vec = vectorizer.fit_transform(x_train['text'])
x_test_vec = vectorizer.transform(x_test['text'])

rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(x_train_vec, y_train)

best_rf = grid_search.best_estimator_

y_pred = best_rf.predict(x_test_vec)


accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {100*accuracy:.1f}%')


best_rf = grid_search.best_estimator_

Testing on another data set

In [None]:
new_data = pd.read_csv('new_data.txt', sep='\t')

with open('new_data.txt', 'r') as file:
    lines = file.readlines()

new_data = pd.DataFrame({'text': [line.strip() for line in lines]})
print(new_data.head())

# Preprocess new data
x_new = new_data['text']
x_new_vec = vectorizer.transform(x_new)


y_new_pred = best_rf.predict(x_new_vec)
y_new_pred_df = pd.DataFrame(y_new_pred, columns=['cv', 'hd', 'vo'])

#predictions
print("Predictions:")
print(y_new_pred_df)

# Evaluate
y_new_true = new_data[['cv', 'hd', 'vo']]

new_data.to_csv('predicted_data.csv', index=False)


from sklearn.metrics import classification_report
print("Evaluation Report:")
print(classification_report(y_new_true, y_new_pred_df, target_names=['cv', 'hd', 'vo']))
