# 1 - Packages

- string - to get the list of punctuation
- re - to work with regular expressions
- os - to work with operating system directories and files
- pickle - to load the pickled constants
- itertools - to count frequencies
- random - to shuffle lists
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) - it is used to remove `HTML` tags
------
- [NumPy](http://www.numpy.org/) - to manipulate matrices
- csv - to work with `csv` files
- [matplotlib.pyplot](https://matplotlib.org/api/pyplot_api.html) - to work with plots

In [None]:
import string
import re
import os
import pickle as pkl
from os import walk
from bs4 import BeautifulSoup

import numpy as np
import csv
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# 2 - Constants

- `ARABIC_LETTERS_LIST` - a list containing the Arabic alphabet characters
- `DIACRITICS_LIST` - a list containing the diacritics characters in Arabic language
- `DN1` - directory name 1, for the processed data files
- `DN2` - directory name 2, for the cleaned data files

In [None]:
# Directory that holds the pickled constant lists.
CONSTANTS_PATH = 'Constants'

# Names of the output sub-directories: `clean` writes the processed text
# into DN1 and the diacritics-free text into DN2.
DN1 = 'processed'
DN2 = 'cleaned'

# NOTE(review): unpickling can execute arbitrary code, so these files must
# come from a trusted source.
with open(CONSTANTS_PATH + '/ARABIC_LETTERS_LIST.pickle', 'rb') as letters_file:
    ARABIC_LETTERS_LIST = pkl.load(letters_file)

with open(CONSTANTS_PATH + '/DIACRITICS_LIST.pickle', 'rb') as diacritics_file:
    DIACRITICS_LIST = pkl.load(diacritics_file)

# 3 - Inputs

- `dir_path` - path of the directory containing the files to process

In [None]:
# Read the root directory of the files to process from standard input
# (no prompt is shown). The value is later passed to `walk`.
dir_path = input()

# 4 - Clean the data

1. Get files paths
2. Helper methods
3. Process the files

## Cleaning process

1. Remove `HTML` tags
2. Remove URLs
3. Fix diacritization issues
4. Remove English letters
5. Remove `Kashida` Arabic character
6. Remove `*` (asterisk)
7. Add space before and after the numbers
8. Remove multiple whitespaces
9. Remove diacritics

### 4.1 - Get files paths

- `files_paths` - a list containing the paths of all files found under `dir_path` (sub-directories included)

In [None]:
# Collect the path of every file found under `dir_path`, recursing into
# sub-directories.
#
# The loop variable is deliberately NOT named `dir_path`: reusing that name
# would overwrite the user-supplied root with the last directory visited by
# `walk`, and later cells (which create the output directories under
# `dir_path`) would then operate on the wrong directory.
files_paths = []
for (current_dir, dirs_names, files_names) in walk(dir_path):
    for file_name in files_names:
        files_paths.append(os.path.join(current_dir, file_name))
print('Number of files:', len(files_paths))

### 4.2 - Helper methods

- `clean(file_path)` - the main method that does the job
- `read_file_content(file_path)` - reads the content of the file in `file_path`
- `write_file_content(file_path, content, dir_name)` - writes `content` to file using `file_path` and `dir_name`
- `remove_html_tags(content)` - removes the `HTML` tags from `content`
- `remove_urls(content)` - removes the URLs from `content` using regular expressions
- `fix_diacritics(content)` - fixes diacritics positions and removes unneeded or misplaced ones in `content`
- `remove_english_letters(content)` - removes the English language letters from `content`
- `remove_shift_j(content)` - removes `SHIFT+J` Arabic character from `content`
- `remove_asterisk(content)` - replaces every `*` (asterisk) in `content` with a newline
- `fix_numbers(content)` - adds a space before and after each number in `content`
- `remove_white_spaces(content)` - collapses redundant white spaces in `content` (each run becomes a single space or a single newline)
- `remove_diacritics(content)` - removes the diacritics in `DIACRITICS_LIST` from `content`
- `calculate_file_statistics(file_path, content, without_diac_content, split)` - calculates the statistics for the file in `file_path`

In [None]:
def clean(file_path):
    """Clean one file, write both output variants and record its statistics.

    The file content is passed through the cleaning steps in a fixed order.
    The result is written under the ``DN1`` directory, its diacritics-free
    version under the ``DN2`` directory, and the per-file statistics are
    computed.

    Args:
        file_path: path of the file to clean.

    Returns:
        The cleaned content (diacritics kept), or ``''`` when nothing is left
        after cleaning; in that case nothing is written and no statistics
        are calculated.
    """
    print('Processing:', file_path)

    # Cleaning steps, applied strictly in this order.
    # NOTE(review): `remove_asterisk` is defined and listed in the documented
    # cleaning process but is not part of this pipeline - confirm intent.
    cleaning_steps = (
        remove_html_tags,
        remove_urls,
        fix_diacritics,
        remove_english_letters,
        remove_shift_j,
        fix_numbers,
        remove_white_spaces,
    )

    content = read_file_content(file_path)
    for cleaning_step in cleaning_steps:
        content = cleaning_step(content)
    without_diac_content = remove_diacritics(content)

    if not content:
        return ''

    for text, output_dir_name in ((content, DN1), (without_diac_content, DN2)):
        write_file_content(file_path, text, output_dir_name)
    calculate_file_statistics(file_path, content, without_diac_content)

    return content

In [None]:
def read_file_content(file_path):
    """Return the whole text content of the file at ``file_path``.

    The file is opened in a ``with`` block so the handle is always closed,
    and it is decoded explicitly as UTF-8 so that reading Arabic text does
    not depend on the platform's default encoding.

    Args:
        file_path: path of the file to read.

    Returns:
        The file content as a ``str``.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(file_path, encoding='utf-8') as file_reader:
        return file_reader.read()

In [None]:
def write_file_content(file_path, content, dir_name):
    """Write ``content`` next to ``file_path``, inside the ``dir_name`` folder.

    For ``file_path == 'a/b/f.txt'`` and ``dir_name == 'processed'`` the
    output goes to ``a/b/processed/f.txt``.

    Args:
        file_path: path of the source file; only its directory and base name
            are used.
        content: text to write.
        dir_name: name of the sub-directory (created beside the source file)
            that receives the output.

    Raises:
        OSError: if the directory or the file cannot be created.
    """
    # os.path handles root-level and separator-free paths correctly, which
    # manual splitting on os.sep did not.
    target_dir = os.path.join(os.path.dirname(file_path), dir_name)
    # Files may live in nested sub-directories whose output folder has not
    # been created yet, so make sure it exists before writing.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, os.path.basename(file_path))
    print('Writing:', target_path)
    # Explicit UTF-8 so Arabic text is written identically on every platform.
    with open(target_path, mode='w', encoding='utf-8') as file_writer:
        file_writer.write(content)

In [None]:
def remove_html_tags(content):
    """Return the text of ``content`` as extracted by BeautifulSoup.

    ``content`` is parsed with the ``'html.parser'`` backend and the result
    of ``get_text()`` is returned.
    """
    return BeautifulSoup(content, 'html.parser').get_text()

In [None]:
def remove_urls(content):
    """Replace every URL-like token in ``content`` with a single space.

    Two kinds of tokens are matched, in this order: tokens starting with
    ``http://`` or ``https://``, then tokens starting with ``www``.

    Args:
        content: text to process.

    Returns:
        ``content`` with each matched token replaced by ``' '``.
    """
    url_patterns = (
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        r'www(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    )
    for url_pattern in url_patterns:
        matcher = re.compile(url_pattern, flags=re.MULTILINE)
        content = matcher.sub(' ', content)
    return content

In [None]:
def fix_diacritics(content):
    """Repair misplaced and duplicated diacritics in ``content``.

    Four regular-expression passes are applied in order:

    1. Diacritics separated by whitespace from a preceding Arabic letter (or
       from the last entry of ``DIACRITICS_LIST``) are re-attached to it,
       provided they are followed by whitespace, a ``+`` or the end of the
       text.
    2. Diacritics that follow a character which is neither an Arabic letter
       nor a diacritic are dropped.
    3. Repetitions of the last entry of ``DIACRITICS_LIST`` are collapsed
       into one occurrence.
    4. After any diacritic other than the last entry, all directly following
       diacritics are dropped.

    Every pattern fragment is a raw string: the previous non-raw fragments
    contained ``\\s`` and ``\\Z``, which are invalid string escapes and emit
    a SyntaxWarning on recent Python versions. The compiled patterns are
    unchanged.

    Args:
        content: text to fix.

    Returns:
        The fixed text.
    """
    letters = ''.join(ARABIC_LETTERS_LIST)
    diacritics = ''.join(DIACRITICS_LIST)
    # NOTE(review): the last entry is treated specially throughout -
    # presumably it is the shadda; confirm against the pickled constant.
    last_diacritic = DIACRITICS_LIST[-1]
    other_diacritics = ''.join(DIACRITICS_LIST[:-1])

    # 1. Re-attach diacritics that drifted away from their letter.
    content = re.sub(
        r'(?P<char>[' + letters + last_diacritic + r'])\s+'
        r'(?P<diac>[' + diacritics + r']+)(?P<brek>[\s+]|\Z)',
        r'\g<char>\g<diac>\g<brek>',
        content)
    # 2. Drop diacritics attached to a non-Arabic, non-diacritic character.
    content = re.sub(
        r'(?P<char>[^' + letters + diacritics + r'])[' + diacritics + r']+',
        r'\g<char>',
        content)
    # 3. Collapse repeated occurrences of the last diacritic.
    content = re.sub(r'[' + last_diacritic + r']+', last_diacritic, content)
    # 4. Keep only the first diacritic of a run that starts with any
    #    diacritic other than the last one.
    content = re.sub(
        r'(?P<diac>[' + other_diacritics + r'])[' + diacritics + r']+',
        r'\g<diac>',
        content)
    return content

In [None]:
def remove_english_letters(content):
    """Return ``content`` with every ASCII letter replaced by a space.

    Each letter of ``string.ascii_letters`` becomes exactly one space, so the
    length of the text is unchanged.
    """
    blanking_table = dict.fromkeys(map(ord, string.ascii_letters), ' ')
    return content.translate(blanking_table)

In [None]:
def remove_shift_j(content):
    """Return ``content`` with every ``'ـ'`` character deleted."""
    deletion_table = {ord('ـ'): None}
    return content.translate(deletion_table)

In [None]:
def remove_asterisk(content):
    """Return ``content`` with every ``*`` replaced by a newline.

    Despite its name, this function does not delete the asterisks: each one
    becomes a ``'\\n'``.
    """
    return '\n'.join(content.split('*'))

In [None]:
def fix_numbers(content):
    """Surround every run of ASCII digits in ``content`` with spaces.

    One space is inserted before and one after each maximal run of the
    digits ``0``-``9``.
    """
    digits_run = re.compile(r'(?P<numb>[0-9]+)')
    return digits_run.sub(r' \g<numb> ', content)

In [None]:
def remove_white_spaces(content):
    """Collapse redundant whitespace in ``content``.

    Three substitutions are applied in order:

    1. A newline, together with the non-newline whitespace before it and
       any whitespace (further newlines included) after it, becomes a
       single newline.
    2. Every remaining run of non-newline whitespace becomes a single space.
    3. A space at the very start or very end of the text is removed.
    """
    substitutions = (
        (r'[^\S\n]*\n[\s]*', '\n'),
        (r'[^\S\n]+', ' '),
        (r'\A | \Z', ''),
    )
    for pattern, replacement in substitutions:
        content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
    return content

In [None]:
def remove_diacritics(content):
    """Return ``content`` with every character of ``DIACRITICS_LIST`` deleted."""
    # Mapping a code point to None tells str.translate to drop the character.
    deletion_table = dict.fromkeys(map(ord, ''.join(DIACRITICS_LIST)))
    return content.translate(deletion_table)

In [None]:
def _diacritics_bucket(word, idx):
    """Return the statistics key classifying the diacritics after ``word[idx]``.

    The classification looks at the characters that directly follow the
    letter at ``idx``:

    - no diacritic follows -> ``'no_diacritics_percentage'``
    - exactly one diacritic follows (a lone shadda included)
      -> ``'one_diacritics_percentage'``
    - a shadda followed by exactly one non-shadda diacritic
      -> ``'two_diacritics_percentage'``
    - anything else -> ``'error_diacritics_percentage'``
    """
    shadda = 'ّ'
    word_length = len(word)

    def is_diacritic(position):
        return position < word_length and word[position] in DIACRITICS_LIST

    if not is_diacritic(idx + 1):
        return 'no_diacritics_percentage'
    if word[idx + 1] == shadda:
        if not is_diacritic(idx + 2):
            return 'one_diacritics_percentage'
        if word[idx + 2] != shadda and not is_diacritic(idx + 3):
            return 'two_diacritics_percentage'
        return 'error_diacritics_percentage'
    if not is_diacritic(idx + 2):
        return 'one_diacritics_percentage'
    return 'error_diacritics_percentage'


def calculate_file_statistics(file_path, content, without_diac_content, split='training'):
    """Compute the statistics of one cleaned file and record them globally.

    Side effects (on module-level objects):

    - for every line of ``content``, the number of words containing Arabic
      letters is counted and the matching entry of ``line_lengths_fr``
      (``split == 'training'``) or ``line_lengths_fr_testing``
      (``split == 'testing'``) is incremented;
    - the resulting statistics dictionary is appended to ``statistics`` or
      ``statistics_testing`` respectively.

    For any other value of ``split`` nothing is recorded globally.

    Args:
        file_path: path of the file; only its base name is stored.
        content: cleaned text, diacritics kept.
        without_diac_content: the same text with the diacritics removed.
        split: ``'training'`` (default) or ``'testing'``.

    Returns:
        The statistics dictionary, or ``None`` when the file contains no word
        with an Arabic letter (the line-length counters are still updated in
        that case).
    """
    content_splitted = content.split('\n')
    without_diac_content_splitted = without_diac_content.split('\n')

    # Build the pattern once per call instead of once per word. Joining the
    # letters works whether ARABIC_LETTERS_LIST is a list (as documented) or
    # a string; concatenating a list to a str would raise TypeError.
    arabic_letters_pattern = re.compile(r'[' + ''.join(ARABIC_LETTERS_LIST) + r']+')

    file_statistics = dict()

    # basename handles the platform separator, matching how paths are built.
    file_statistics['file_name'] = os.path.basename(file_path)

    file_statistics['processed_chars_count'] = len(content)
    file_statistics['cleaned_chars_count'] = len(without_diac_content)
    file_statistics['words_count'] = 0
    file_statistics['lines_count'] = len(without_diac_content_splitted)

    file_statistics['arabic_chars_count'] = sum(
        without_diac_content.count(ch) for ch in ARABIC_LETTERS_LIST)

    # These four hold raw letter counts while iterating; they are converted
    # to percentages further down.
    file_statistics['no_diacritics_percentage'] = 0
    file_statistics['one_diacritics_percentage'] = 0
    file_statistics['two_diacritics_percentage'] = 0
    file_statistics['error_diacritics_percentage'] = 0

    for line in content_splitted:
        line_words_count = 0
        for word in line.split(' '):
            if not arabic_letters_pattern.search(word):
                continue
            line_words_count += 1
            file_statistics['words_count'] += 1
            for idx, ch in enumerate(word):
                if ch in ARABIC_LETTERS_LIST:
                    file_statistics[_diacritics_bucket(word, idx)] += 1
        if split == 'training':
            line_lengths_fr[line_words_count] = line_lengths_fr.get(line_words_count, 0) + 1
        elif split == 'testing':
            line_lengths_fr_testing[line_words_count] = line_lengths_fr_testing.get(line_words_count, 0) + 1

    if file_statistics['words_count'] == 0:
        return

    arabic_chars_count = file_statistics['arabic_chars_count']

    file_statistics['avg_word_chars'] = arabic_chars_count / file_statistics['words_count']
    file_statistics['avg_line_words'] = file_statistics['words_count'] / len(without_diac_content_splitted)

    file_statistics['diacritics_count'] = file_statistics['processed_chars_count'] - file_statistics['cleaned_chars_count']
    file_statistics['diacritics_percentage'] = (file_statistics['one_diacritics_percentage'] + file_statistics['two_diacritics_percentage']) / arabic_chars_count * 100

    # Internal sanity check: every Arabic letter lands in exactly one bucket.
    assert (file_statistics['no_diacritics_percentage']
            + file_statistics['one_diacritics_percentage']
            + file_statistics['two_diacritics_percentage']
            + file_statistics['error_diacritics_percentage']) == arabic_chars_count

    # Turn the raw bucket counts into percentages of the Arabic letters.
    for bucket in ('no_diacritics_percentage', 'one_diacritics_percentage',
                   'two_diacritics_percentage', 'error_diacritics_percentage'):
        file_statistics[bucket] /= arabic_chars_count / 100

    line_chars = [len(line) for line in without_diac_content_splitted]
    line_words = [len(line.split(' ')) for line in without_diac_content_splitted]
    file_statistics['min_line_chars'] = min(line_chars)
    file_statistics['max_line_chars'] = max(line_chars)
    file_statistics['min_line_words'] = min(line_words)
    file_statistics['max_line_words'] = max(line_words)

    if split == 'training':
        statistics.append(file_statistics)
    elif split == 'testing':
        statistics_testing.append(file_statistics)

    return file_statistics

### 4.3 - Process the files

Call `clean` method on each file :)

In [None]:
# Create the two output directories directly under `dir_path`.
# os.mkdir raises FileExistsError when a directory is already there.
for output_dir_name in (DN1, DN2):
    os.mkdir(os.path.join(dir_path, output_dir_name))

In [None]:
# Per-file statistics dictionaries, one list per split.
statistics, statistics_testing = [], []
# Maps "number of words in a line" to "number of lines with that many
# words", one dictionary per split.
line_lengths_fr, line_lengths_fr_testing = {}, {}

In [None]:
# Run the cleaning pipeline over every collected file, in discovery order.
for current_file_path in files_paths:
    clean(current_file_path)
print('Finished!')

In [None]:
def write_dataset_statistics(output_file_name, statistics, line_lengths_fr):
    """Export the dataset statistics as a CSV file and two bar charts.

    Three files are produced, all named after ``output_file_name``:

    - ``<output_file_name>.csv``: a header row followed by one row per entry
      of ``statistics``;
    - ``<output_file_name> - Original``: bar chart of the line-length
      frequencies;
    - ``<output_file_name> - Re-Scaled``: the same chart with every frequency
      replaced by its fourth root (two nested square roots).

    The image extension is left to matplotlib's ``savefig`` default.

    Args:
        output_file_name: base name (without extension) of the output files.
        statistics: iterable of per-file statistics dictionaries holding
            every column listed below.
        line_lengths_fr: dictionary mapping a line length (in words) to the
            number of lines having that length.

    Raises:
        KeyError: if a statistics dictionary lacks one of the columns.
        OSError: if an output file cannot be written.
    """
    # Single source of truth for both the header row and the data rows.
    columns = [
        'file_name',
        'processed_chars_count', 'cleaned_chars_count',
        'words_count',
        'lines_count',
        'arabic_chars_count',
        'avg_word_chars', 'avg_line_words',
        'diacritics_count', 'diacritics_percentage',
        'no_diacritics_percentage', 'one_diacritics_percentage',
        'two_diacritics_percentage', 'error_diacritics_percentage',
        'min_line_chars', 'max_line_chars',
        'min_line_words', 'max_line_words',
    ]

    # newline='' is required by the csv module, otherwise an extra blank line
    # follows every row on platforms that translate '\n' to '\r\n'. UTF-8 is
    # set explicitly so non-ASCII file names are written the same everywhere.
    with open(output_file_name + '.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns)
        for file_statistics in statistics:
            writer.writerow([file_statistics[column] for column in columns])

    lengths = list(line_lengths_fr.keys())
    frequencies = list(line_lengths_fr.values())
    charts = (
        (' - Original', frequencies),
        # The fourth root compresses the range so rare lengths stay visible.
        (' - Re-Scaled', np.sqrt(np.sqrt(frequencies))),
    )
    for title_suffix, heights in charts:
        figure(num=None, figsize=(8, 6), dpi=1000, facecolor='w', edgecolor='k')
        plt.bar(lengths, heights, align='center')
        plt.title(output_file_name + title_suffix)
        plt.ylabel('Lines Lengths Frequency')
        plt.xlabel('Lines Lengths')
        plt.savefig(output_file_name + title_suffix)

In [None]:
# Export the training-split statistics and line-length charts.
write_dataset_statistics(
    output_file_name='Tashkeela Dataset Statistics',
    statistics=statistics,
    line_lengths_fr=line_lengths_fr,
)