In [12]:
# importing package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob # finds all the pathnames matching pattern
import os

In [2]:
# NLP package
import nltk

In [6]:
nltk.download('words')
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to /home/l3gion/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [14]:
# make dataframe from json raw data
def make_dataframe_from_json():
    arr = np.array([])
    path_to_json_data = './GoogleExtensionScrape/scrapedata/'
    file_list = glob.glob(os.path.join(path_to_json_data, '*.json'))
    for file in file_list:
        json_array = pd.read_json(file).comments.values
        arr = np.append(arr, json_array)
    df = pd.DataFrame(arr, columns =['comments'])
    df['profanity'] = 0
    return df



In [59]:
special_words =['mug','mugi','cock','lado','bitch','jantha','jatha']

In [65]:
# replace the empty string with NAN value
def naRow(r):
    x = str(r).strip()
    if x == '' or x == 'nan':
        return np.nan
    return x

# english word removal function
def removeEnglishWord(sent):
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() not in words or w.lower() in special_words or not w.isalpha())
# remove repeating letters
import re
def remove_recurring_letters(sentence):
    words = sentence.split()
    modified_sentence = []
    for word in words:
        modified_word = re.sub(r"(.)\1+", r"\1", word)
        modified_sentence.append(modified_word)
    return ' '.join(modified_sentence)


def drop_duplicates_null(df):
    # remove duplicates 
    df = df.drop_duplicates()
    # replace empty comment with NAN
    df['comments'] = df['comments'].apply(naRow)
    #delete the NAN value
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

# Clean function
def clean_comment(df):
    # clean special character except a-z
    df['comments'] = df['comments'].str.replace(r'[^a-zA-Z]', ' ', regex=True)
    # remove english words only while training 
    df['comments'] = df.apply(lambda row : removeEnglishWord(row['comments']), axis=1)
    # remove double + letters
    df['comments'] = df['comments'].apply(lambda x:remove_recurring_letters(x))
    # convert to lower letters
    df['comments'] = df['comments'].apply(lambda x: ' '.join([word.lower() for word in x.split()]))
    df = drop_duplicates_null(df)
    return df




In [66]:
nltk.download('stopwords')
from nltk.corpus import stopwords
import ntr 
from nltk.tokenize import word_tokenize

def remove_stop_words(df):
    stopwords_nepali = set(stopwords.words('nepali'))
    # stop words in roman
    stopwords_roman = list(map(ntr.nep_to_rom, stopwords_nepali))
    # remove stop words
    df['comments'] = df['comments'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stopwords_roman]))
    return df



[nltk_data] Downloading package stopwords to /home/l3gion/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [67]:
# remove single letter in comments
def remove_single_letter_words(sentence):
    pattern = r"\b\w\b"
    modified_sentence = re.sub(pattern, "", sentence)
    return modified_sentence
def remove_clean_single_letter(df):
    # remove single letter comment
    df['comments'] = df['comments'].apply(lambda x: remove_single_letter_words(x))
    # remove rows with empty set
    df['comments'] = df['comments'].apply(naRow)
    df = drop_duplicates_null(df)
    return df

## EDA

In [82]:
# import dataset
# df = make_dataframe_from_json()
df = pd.read_csv('./test1.3.csv')
df.head()

Unnamed: 0,comments,profanity
0,Nasty is talented as f**ck No matter what,1
1,IDK about business contract or what but nasty...,0
2,Uniq you playing too much man looks like you ...,0
3,Bring Nasty on the podcast plz,0
4,Bring NASTY in podcast lets hear frm his side,0


In [83]:
# clean df
df = clean_comment(df)
# remove stopwords 
df = remove_stop_words(df)
# remove single letter and clean empty string
df = remove_clean_single_letter(df)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['comments'] = df['comments'].apply(naRow)


Unnamed: 0,comments,profanity
0,ck,1
1,idk uniq,0
2,uniq playing loks shit ain gona,0
3,podcast plz,0
4,podcast lets frm,0


In [84]:
df.to_csv('clean_df_test', index= False)

In [85]:
df.head()

Unnamed: 0,comments,profanity
0,ck,1
1,idk uniq,0
2,uniq playing loks shit ain gona,0
3,podcast plz,0
4,podcast lets frm,0


after updating the profanity to each comments manaually

### Model trianing

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [86]:
# load dataset 
df = pd.read_csv('./clean_df_test')
df.head()

Unnamed: 0,comments,profanity
0,ck,1
1,idk uniq,0
2,uniq playing loks shit ain gona,0
3,podcast plz,0
4,podcast lets frm,0


In [87]:
# Feature Extraction
vectorizer = CountVectorizer()  # or TfidfVectorizer()
X = vectorizer.fit_transform(df['comments'])


In [88]:
# Model Training
X_train, X_test, y_train, y_test = train_test_split(X, df['profanity'], test_size=0.2, random_state=42)

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [96]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

RandomForestClassifier()

In [107]:
# Model Evaluation
def model_evaluation(classifier):
    y_pred = classifier.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
 
print("----MultinomialNB-----")
model_evaluation(classifier)
print("----Randomforest-----")
model_evaluation(rf_classifier)


----MultinomialNB-----
Accuracy: 0.9086538461538461
Precision: 0.6756756756756757
Recall: 0.78125
F1-Score: 0.7246376811594203
----Randomforest-----
Accuracy: 0.9423076923076923
Precision: 0.9545454545454546
Recall: 0.65625
F1-Score: 0.7777777777777778


In [90]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print(confusion_mat)

[[164  12]
 [  7  25]]


### prediction

In [94]:
def preprocess_new_comment(new_comment):
    new_comment = clean_comment(pd.DataFrame({'comments': [new_comment]}))['comments'][0]
    new_comment = remove_stop_words(pd.DataFrame({'comments': [new_comment]}))['comments'][0]
    new_comment = remove_clean_single_letter(pd.DataFrame({'comments': [new_comment]}))['comments'][0]
    return new_comment

In [92]:
def transform_new_comment(new_comment):
    return vectorizer.transform([new_comment])

In [106]:
new_comment = "k gardai chas mug"
new_comment = preprocess_new_comment(new_comment)
X_new = transform_new_comment(new_comment)
prediction = classifier.predict(X_new)
prediction = rf_classifier.predict(X_new)
print('multinomialNB',prediction)
print('Random Forest',prediction)


multinomialNB [1]
Random Forest [1]
