In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from time import time
import pickle

import warnings
warnings.filterwarnings("ignore")

## Dataset reading

In [2]:
# NOTE(review): machine-specific absolute path -- adjust it to wherever your copy of data.csv lives.
DATA_CSV_PATH = "C:/Users/adars/OneDrive/Desktop/nlp_project/data.csv"
data = pd.read_csv(DATA_CSV_PATH)

In [3]:
# Preview the loaded frame (columns: media, text, category); pandas elides the middle rows.
data

Unnamed: 0,media,text,category
0,facebook_corpus_msr_451811,The quality of re made now makes me think it i...,CAG
1,facebook_corpus_msr_334368,@siva \nHow is ur mother???\nHow is ur wife???...,NAG
2,facebook_corpus_msr_331195,Also see ....hw ur RSS activist caught in Burk...,NAG
3,facebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING\n...,NAG
4,facebook_corpus_msr_379239,Modi ho ya Manmohan singh saala yeh log kuch n...,OAG
...,...,...,...
14996,facebook_corpus_msr_394638,They belong to you flight dirty terrorist coun...,OAG
14997,facebook_corpus_msr_429177,"Really motivating programme, congratulations t...",NAG
14998,facebook_corpus_msr_2032370,fabricated news,OAG
14999,facebook_corpus_msr_1722926,What's wrong with you secular idiots,OAG


In [4]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15001 entries, 0 to 15000
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   media     15001 non-null  object
 1   text      15001 non-null  object
 2   category  15001 non-null  object
dtypes: object(3)
memory usage: 351.7+ KB


In [5]:
# Number of missing values per column.
data.isnull().sum()

media       0
text        0
category    0
dtype: int64

In [6]:
 # Overtly Aggressive(OAG), Covertly Aggressive (CAG) and Non-Aggressive(NAG) classes.
# Count how many rows carry each aggression label to see the class balance.
value_counts = data['category'].value_counts()
value_counts

category
NAG    6285
CAG    5297
OAG    3419
Name: count, dtype: int64

## Data cleaning

In [7]:
# Replace the string labels in 'category' with integer codes.
# The column is overwritten in place, so re-running this cell re-encodes the already-encoded integers.
encoder = LabelEncoder()
data['category'] = encoder.fit_transform(data['category'])
# Display the frame with the encoded labels.
data

Unnamed: 0,media,text,category
0,facebook_corpus_msr_451811,The quality of re made now makes me think it i...,0
1,facebook_corpus_msr_334368,@siva \nHow is ur mother???\nHow is ur wife???...,1
2,facebook_corpus_msr_331195,Also see ....hw ur RSS activist caught in Burk...,1
3,facebook_corpus_msr_403402,On the death of 2 jawans in LOC CROSS FIRING\n...,1
4,facebook_corpus_msr_379239,Modi ho ya Manmohan singh saala yeh log kuch n...,2
...,...,...,...
14996,facebook_corpus_msr_394638,They belong to you flight dirty terrorist coun...,2
14997,facebook_corpus_msr_429177,"Really motivating programme, congratulations t...",1
14998,facebook_corpus_msr_2032370,fabricated news,2
14999,facebook_corpus_msr_1722926,What's wrong with you secular idiots,2


In [8]:
# Report which integer code the encoder assigned to each original label.
for code, label in enumerate(encoder.classes_):
    print(f"Class '{label}' is mapped to {code}")

Class 'CAG' is mapped to 0
Class 'NAG' is mapped to 1
Class 'OAG' is mapped to 2


In [9]:
# Fold code 2 (OAG) into code 0 (CAG) so the target becomes binary:
# 0 = aggressive (covert or overt), 1 = non-aggressive.
data['category'] = data['category'].replace({2: 0})

In [10]:
# Class balance after merging the two aggressive classes.
data['category'].value_counts()

category
0    8716
1    6285
Name: count, dtype: int64

In [11]:
# Re-check dtypes: 'category' should now be an integer column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15001 entries, 0 to 15000
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   media     15001 non-null  object
 1   text      15001 non-null  object
 2   category  15001 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 293.1+ KB


## Data preprocessing

In [12]:
# Download required resources
# NOTE(review): `import nltk` lives in this cell rather than in the top import cell;
# lemmatizing() below depends on it for nltk.pos_tag.
import nltk
nltk.download('punkt')  # tokenizer models; NOTE(review): no visible cell tokenizes with NLTK -- confirm this is still needed
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably what nltk.pos_tag loads -- TODO confirm
nltk.download('wordnet')  # WordNet corpus for WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists; the English list is loaded in the next cell

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Shared objects used by lemmatizing(): one lemmatizer instance and the English stop words.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # stored as a set for fast membership checks

# Helper function to map POS tags
def get_wordnet_pos(tag):
    """Translate a POS tag (as returned by nltk.pos_tag) into a WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively. Every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unknown tag families are treated as nouns.
    return wordnet.NOUN

def lemmatizing(content):
    """Clean a raw text string and return its lemmatized form.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, and English stop words are dropped.
    The remaining tokens are POS-tagged and each one is lemmatized using the
    WordNet POS derived from its tag. The lemmas are returned joined by
    single spaces.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    kept_tokens = [token for token in tokens if token not in stop_words]

    lemmas = []
    for token, tag in nltk.pos_tag(kept_tokens):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))

    return ' '.join(lemmas)

In [14]:
# Overwrite the raw text with its cleaned, lemmatized version (row by row).
data['text'] = data['text'].map(lemmatizing)

In [15]:
# Spot-check the first rows after cleaning and lemmatization.
data.head()

Unnamed: 0,media,text,category
0,facebook_corpus_msr_451811,quality make make think something buy fish market,0
1,facebook_corpus_msr_334368,siva ur mother ur wife ur sister hope everyone...,1
2,facebook_corpus_msr_331195,also see hw ur rss activist catch burkha throw...,1
3,facebook_corpus_msr_403402,death jawan loc cross firing h ble home minist...,1
4,facebook_corpus_msr_379239,modi ho ya manmohan singh saala yeh log kuch n...,0


## Vectorization and train-test-split

In [16]:
# Encoded labels of the full dataset as a plain NumPy array.
a = data['category'].values

In [17]:
def data_split_vectorization(df, ts, rs):
    """Split a labelled corpus into train/test parts and TF-IDF vectorize the text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column (documents) and a 'category' column (labels).
    ts : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    rs : int
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, training_data, testing_data)``.
        The first four items are the raw splits. ``training_data`` and
        ``testing_data`` are the TF-IDF matrices of ``X_train`` and ``X_test``;
        the vectorizer is fitted on the training part only.

    Side effects
    ------------
    Pickles the fitted vocabulary mapping to 'tfidfvectoizer.pkl' in the
    current working directory. Only the vocabulary is stored; the learned IDF
    weights are not part of that file.
    """
    # Stratify on the labels of the frame that was passed in rather than on a
    # notebook-level global, so the function is correct for any `df`.
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'],
        df['category'],
        test_size=ts,
        stratify=df['category'],
        random_state=rs,
    )

    tfidfvector = TfidfVectorizer(ngram_range=(1, 3))

    # Fit on the training texts only; the test texts are merely transformed.
    training_data = tfidfvector.fit_transform(X_train)
    testing_data = tfidfvector.transform(X_test)

    # File name (including its spelling) is kept because the prediction cell
    # loads exactly this path.
    filename = 'tfidfvectoizer.pkl'
    with open(filename, 'wb') as vocabulary_file:
        pickle.dump(tfidfvector.vocabulary_, vocabulary_file)

    return X_train, X_test, y_train, y_test, training_data, testing_data

In [18]:
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test, training_data, testing_data = data_split_vectorization(
    df=data, ts=0.2, rs=42
)

## Model training

In [19]:
def pipeline(X_train, y_train, X_test, y_test, ml_models=None):
    """Train several classifiers, pickle each one and collect their metrics.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test parts.
    y_train, y_test
        Labels matching ``X_train`` and ``X_test``.
    ml_models : list, optional
        Unfitted estimators exposing ``fit`` and ``predict``. When omitted,
        LogisticRegression, MultinomialNB, LinearSVC and SGDClassifier are
        used with their default settings.

    Returns
    -------
    list of dict
        One dict per model with the keys 'Algorithm', 'Training Time',
        'Prediction Time' and train/test accuracy, weighted F1, weighted
        precision and weighted recall.

    Side effects
    ------------
    Each fitted model is pickled to '<ClassName>.pkl' in the current working
    directory, and progress lines are printed.
    """
    if ml_models is None:
        # NOTE(review): these estimators are created without random_state, so
        # scores of the stochastic ones may differ between runs -- confirm
        # whether reproducibility matters here.
        ml_models = [LogisticRegression(), MultinomialNB(), LinearSVC(), SGDClassifier()]

    final_result = []

    for model in ml_models:
        model_name = model.__class__.__name__
        print(f"Training Time : {model_name}")

        # Time only the fit call itself.
        start = time()
        model.fit(X_train, y_train)
        training_time = time() - start

        with open(model_name + ".pkl", "wb") as model_file:
            pickle.dump(model, model_file)

        # Prediction time covers scoring both the train and the test matrix.
        start = time()
        prediction_train = model.predict(X_train)
        prediction_test = model.predict(X_test)
        prediction_time = time() - start

        # A fresh dict per model; key order defines the column order of the
        # DataFrame later built from these records.
        results = {
            'Algorithm': model_name,
            'Training Time': training_time,
            'Prediction Time': prediction_time,
            'Accuracy : Train': accuracy_score(y_train, prediction_train),
            'Accuracy : Test': accuracy_score(y_test, prediction_test),
            'F1 Score : Test': f1_score(y_test, prediction_test, average='weighted'),
            'F1 Score : Train': f1_score(y_train, prediction_train, average='weighted'),
            'Precision : Test': precision_score(y_test, prediction_test, average='weighted'),
            'Precision : Train': precision_score(y_train, prediction_train, average='weighted'),
            'Recall : Test': recall_score(y_test, prediction_test, average='weighted'),
            'Recall : Train': recall_score(y_train, prediction_train, average='weighted'),
        }

        print(f"Training {model_name} finished in {results['Training Time']} sec")

        final_result.append(results)
    return final_result

In [20]:
# Train and score every model on the TF-IDF matrices produced by the split step.
final_result = pipeline(
    X_train=training_data,
    y_train=y_train,
    X_test=testing_data,
    y_test=y_test,
)

Training Time : LogisticRegression
Training LogisticRegression finished in 1.9874742031097412 sec
Training Time : MultinomialNB
Training MultinomialNB finished in 0.027102231979370117 sec
Training Time : LinearSVC
Training LinearSVC finished in 0.1606612205505371 sec
Training Time : SGDClassifier
Training SGDClassifier finished in 0.11164975166320801 sec


In [21]:
def performfinalresult(final_results):
    """Turn the per-model metric records into an ordered comparison table.

    Parameters
    ----------
    final_results : list of dict
        One record per model, keyed by the metric names listed in
        ``column_order`` below.

    Returns
    -------
    pandas.DataFrame
        Columns arranged as test metrics first, then train metrics; rows
        sorted by 'F1 Score : Test' in ascending order. The original row
        labels are kept, so callers may want ``reset_index(drop=True)``.
    """
    column_order = [
        'Algorithm', 'Accuracy : Test', 'Precision : Test', 'Recall : Test',
        'F1 Score : Test', 'Prediction Time',
        'Accuracy : Train', 'Precision : Train', 'Recall : Train',
        'F1 Score : Train', 'Training Time',
    ]
    # reindex returns a new frame; its result must be kept, otherwise the
    # column order silently stays as it was.
    results = pd.DataFrame(final_results).reindex(columns=column_order)
    return results.sort_values(by='F1 Score : Test')

In [22]:
# Build the model comparison table (rows sorted by test F1).
results = performfinalresult(final_result)
# Display it with a fresh 0..n-1 index; the renumbered frame is only shown, `results` itself is not modified.
results.reset_index(drop = True)

Unnamed: 0,Algorithm,Training Time,Prediction Time,Accuracy : Train,Accuracy : Test,F1 Score : Test,F1 Score : Train,Precision : Test,Precision : Train,Recall : Test,Recall : Train
0,MultinomialNB,0.027102,0.015458,0.9385,0.703099,0.662972,0.937672,0.760403,0.943676,0.703099,0.9385
1,LogisticRegression,1.987474,0.004745,0.910667,0.727757,0.713485,0.908969,0.737216,0.918625,0.727757,0.910667
2,SGDClassifier,0.11165,0.004008,0.965917,0.735422,0.726482,0.96576,0.7382,0.966756,0.735422,0.965917
3,LinearSVC,0.160661,0.004587,0.991333,0.732423,0.726821,0.99133,0.731376,0.991338,0.732423,0.991333


In [26]:
# Classify one user-supplied message with the saved LinearSVC model.
# The input is kept in its own name so the `data` DataFrame is not overwritten.
user_text = input("Enter the text")

# Apply the same cleaning/lemmatization that the training texts went through;
# otherwise the tokens would not line up with the saved vocabulary.
cleaned_text = lemmatizing(user_text)

# Rebuild the vectorizer exactly as it was configured for training: same
# n-gram range and the pickled vocabulary. It is fitted on the training texts
# (X_train from the split cell) so the IDF weights match what the model saw;
# fitting on the single input document would derive IDF from that one document.
with open("tfidfvectoizer.pkl", "rb") as vocabulary_file:
    saved_vocabulary = pickle.load(vocabulary_file)
tfidf_vector = TfidfVectorizer(ngram_range=(1, 3), vocabulary=saved_vocabulary)
tfidf_vector.fit(X_train)
preprocessed_data = tfidf_vector.transform([cleaned_text])

# Both pickle files are written by earlier cells of this notebook; never
# unpickle files from an untrusted source.
with open('LinearSVC.pkl', 'rb') as model_file:
    trained_model = pickle.load(model_file)

# Label 0 is the merged aggressive class (CAG + OAG), label 1 is NAG.
if trained_model.predict(preprocessed_data)[0] == 0:
    print("Cyberbullying")
else:
    print('Non-cyberbullying')

Enter the text he is a good human


Non-cyberbullying
