In [22]:
#Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
#built on top of the Python programming language.
import pandas as pd

#NLTK is a leading platform for building Python programs to work with human language data.
import nltk

#Regular Expression
import re

#A specific Arabic language library for Python, provides basic functions to manipulate Arabic letters and text, 
#like detecting Arabic letters, Arabic letters groups and characteristics, remove diacritics etc.
import pyarabic.araby as araby
from pyarabic.araby import tokenize, is_arabicrange, strip_tashkeel

#This module provides access to the Unicode Character Database (UCD) which defines character properties for all
#Unicode characters.
import unicodedata

In [23]:
#Call nltk.RegexpTokenizer(pattern) with pattern as r"\w+" to create a tokenzier that uses pattern to split a string. 
#Call RegexpTokenizer.tokenize(text) with RegexpTokenizer as the previous result and text as a string representing a sentence
#to return text as a list of words with punctuation's removed.( Remove punctuation and emojis)
tokenizer = nltk.RegexpTokenizer(r"\w+")

In [40]:
def remove_repeating_char(text):
    new_text = []
    for current_word in text.split(' '):
        if current_word not in  ['الله','والله']:
            new_text.append(re.sub(r'(.)\1+', r'\1', current_word))
        else : 
            new_text.append(current_word)
    return ' '.join(new_text)

In [25]:
def remove_laten_char(text):
    text = ''.join((c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn'))
    text = re.sub(r'[a-zA-Z]+', '', text)
    return text

In [26]:
def remove_diacritics(text):
    return araby.strip_tashkeel(text)
    

In [27]:
def normalize_hamza(text):
    return araby.normalize_hamza(text, method="tasheel")

In [28]:
def remove_laten_char_and_diacritics(text):
    return ' '.join(tokenize(text, conditions=is_arabicrange, morphs=strip_tashkeel))

In [29]:
def data_pre_processing(data) :
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    data = remove_laten_char(data)
    data = remove_diacritics(data)
    data = normalize_hamza(data)
    data = remove_repeating_char(data)
    #Remove punctuation and tokenize the data : 
    data = tokenizer.tokenize(data)
 
    return data



In [31]:
def cleaning_data(data):
    
    new_comments = []
    new_data = pd.DataFrame(columns=['author', 'authorChannelUrl', 'text','likeCount', 'publishedAt', 'offensive/non offensive', 'Algerian Dialect'])
    for index in range(len(data)):
        comment = data.iloc[index]
        comment["text"] = ' '.join(data_pre_processing(comment["text"]))
        if not (len( comment["text"])<2 and ( not  comment["text"] or len(comment["text"][0]) < 2)) :
            if(new_data.empty):
                new_data = new_data.append({'author':comment["author"] , 'authorChannelUrl':comment["authorChannelUrl"] , 'text':comment["text"] ,'likeCount':comment["likeCount"] , 'publishedAt':comment["publishedAt"] , 'offensive/non offensive':comment["offensive/non offensive"] , 'Algerian Dialect':comment["Algerian Dialect"] }, ignore_index=True)
                new_comments.append(comment["text"])
            else:
                if(comment["text"] not in new_comments):
                    new_data = new_data.append({'author':comment["author"] , 'authorChannelUrl':comment["authorChannelUrl"] , 'text':comment["text"] ,'likeCount':comment["likeCount"] , 'publishedAt':comment["publishedAt"] , 'offensive/non offensive':comment["offensive/non offensive"] , 'Algerian Dialect':comment["Algerian Dialect"] }, ignore_index=True)
                    new_comments.append(comment["text"])
    return new_data 

In [42]:
data = pd.read_csv("../Data/YouTube Data For cyberbullying Detection in the Algerian Dialect - YouTubeDataFile.csv")

In [41]:
new_data = cleaning_data(data)
new_data.to_csv("../Data/PreProcessedYouTubeDataFile.csv")
print(len(data))
print(len(new_data))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


10552
8931


In [43]:
 # Create dictionary that contains : (key= word  :  weight=avgOfSeverities)

def create_dictionary():
    df = pd.read_csv('../Data/bad_words.csv')
    dictionary = {}
    for i in df.index:
        data = (df.loc[i])
        key = ' '.join(data_pre_processing(data["bad_word"]))
        if key in dictionary:
            dictionary[key] = (dictionary[key] + data["severity"]) / 2
        else :
            dictionary[key] = data["severity"]
    return dictionary 
        

In [46]:
# Create dictionary that contains : (key= (word,classe)  :  weight=avgOfSeverities)

def create_dictionaryِClasses():
    df = pd.read_csv('../Data/bad_words.csv')
    dictionaryِClasses = {}
    classes = ["سب" , "سخرية" , "تحرش" , "تهديد"]

    for i in df.index:
        data = (df.loc[i])
        for class_ in classes : 
            key = (' '.join(data_pre_processing(data["bad_word"])), class_ )
            if key in dictionaryِClasses:
                if(data["classification"] == class_):
                    dictionaryِClasses[key] = (dictionaryِClasses[key] + data["severity"]) / 2
            else :
                if(data["classification"] == class_):
                    dictionaryِClasses[key] = data["severity"]
    return dictionaryِClasses

In [48]:
dic = create_dictionary()
dic_clas = create_dictionaryِClasses()

print(len(dic))
print(dic)

print(len(dic_clas))
print(dic_clas)

215
{'مهبول': 9, 'جايح': 8.4453125, 'زبل': 7.25, 'خرية': 5.5, 'الله يلعنك': 10.0, 'بلع فمك': 5.0, 'متخلف': 8, 'بغل': 8.4296875, 'مسطاشة': 7, 'حمارة': 6.25, 'عطايا': 10, 'حيوان': 8.0, 'وي السلعة': 7, 'نقتلك': 7.34375, 'تافه': 4.5, 'قود': 5.515625, 'رخيس': 7.890625, 'حمار': 7.8876953125, 'تفو': 6.0, 'عاقر': 10, 'بلعه': 5, 'مال ربك': 10, 'سمينة': 7, 'خرا': 5.0, 'نيك امك': 8.5, 'نيك مك': 9.75, 'قهوي': 9.0, 'يا الكعبة': 5, 'كعبة': 8.5, 'كافي': 7.875, 'هايشة': 5.5, 'ساقطة': 9, 'معفون': 5, 'يا الزح': 7, 'متعرف والوا': 5, 'معقد': 7, 'كلب': 9.03125, 'بغلة': 5.5, 'خماج': 6.25, 'خامج': 9.0, 'ينعل والديك': 9.0, 'موسخ': 6.25, 'رخيص': 7, 'قحبة': 9.2265625, 'حابس': 6.375, 'نحيلك سروال': 6, 'نيك اخنك': 7, 'زبي': 9.0, 'اخرجلي طابي': 10, 'شماتة': 7.5, 'ڨليط': 5.5, 'بلعيه': 8, 'نت متفهمش': 6, 'نحرقك': 9.0, 'مقطعة': 10, 'جايحة': 3, 'قودي': 8, 'ليك غير القمع': 9, 'خامجة': 6.5, 'انت دير هادي': 6, 'طحان': 7.5, 'فرخ': 6.625, 'مشحاح': 5, 'نقش': 9.3125, 'قحابجي': 9, 'فايحة': 4, 'عطاي': 9.59375, 'بومبا': 6, 'بلع

In [50]:
pd.DataFrame(list(dic),columns = ['bad_word',''] ).to_csv('../Data/dic.csv')

AttributeError: 'dict' object has no attribute 'to_csv'