In [1]:
import re
import pandas as pd

In [3]:
# Load the two raw article corpora from the working directory.
articles1_df = pd.read_csv(filepath_or_buffer='articles1_common.csv')
articles2_df = pd.read_csv(filepath_or_buffer='articles2_aljazeera.csv')

In [4]:
# Columns removed from each corpus, listed per source file.
columns_to_drop_list1 = ['targe']
columns_to_drop_list2 = [
    'guid',
    'published',
    'title',
    'description',
    'link',
    'image',
    'ref',
    'tags',
]

# Rebind each name to a new frame without the listed columns.
articles1_df = articles1_df.drop(columns=columns_to_drop_list1)
articles2_df = articles2_df.drop(columns=columns_to_drop_list2)


In [5]:
# Print the row count of each corpus on its own line, then the combined total.
print(len(articles1_df), len(articles2_df), sep='\n')
print(f'Number of documents in total: {len(articles1_df) + len(articles2_df)}')

111728
5870
Number of documents in total: 117598


In [6]:

def remove_extra_newlines(data):
    """Collapse blank-line gaps into a single newline.

    Every newline that is followed by optional whitespace and then another
    newline is replaced, together with that whitespace, by one newline.

    Args:
        data: The text to normalise.

    Returns:
        The text with each such gap reduced to a single newline character.
    """
    blank_gap = re.compile(r'\n\s*\n')
    return blank_gap.sub('\n', data)


def remove_long_letters(data):
    """Remove runs of the Arabic tatweel character (U+0640).

    A run is two or more tatweel characters, optionally separated by
    whitespace. The entire run, including the whitespace inside it, is
    deleted. A single isolated tatweel is left in place.

    Args:
        data: The text to normalise.

    Returns:
        The text with every tatweel run removed.
    """
    # Match the whole run in one go. Matching only pairs would consume a run
    # two characters at a time and leave one tatweel behind whenever the run
    # has an odd length.
    return re.sub(r'\u0640(?:\s*\u0640)+', '', data)


# Punctuation marks and symbols that are padded with a space on both sides.
# NOTE(review): only the right curly quote is listed; if the left curly quote
# (U+201C) should be padded as well, add it here -- TODO confirm.
_SPECIAL_CHARACTERS = (
    '”', '"',
    '،', '؛',
    '.', ':', '?', '!',
    '»', '«',
    '%', '*', '/',
    '–', '-',
    '<', '>',
    '(', ')', '[', ']',
)

# One character class covering all of the above. re.escape takes care of the
# characters that are special inside a class ('-', '[', ']'), which also avoids
# the "possible nested set" FutureWarning an unescaped '[' would raise.
_SPECIAL_CHARACTER_RE = re.compile(
    '[' + re.escape(''.join(_SPECIAL_CHARACTERS)) + ']'
)


def create_space_between_special_characters(data):
    """Surround each special character with a single space on both sides.

    The text is scanned once; every occurrence of a character listed in
    ``_SPECIAL_CHARACTERS`` is replaced by a space, the character itself, and
    another space. Existing whitespace is not touched, so adjacent special
    characters end up separated by two spaces.

    Args:
        data: The text to process.

    Returns:
        The text with every special character space-padded exactly once.
    """
    return _SPECIAL_CHARACTER_RE.sub(
        lambda match: ' ' + match.group() + ' ', data
    )


def to_arabic_digits(data):
    """Replace the digits 0-9 with the corresponding Arabic numeral glyphs.

    Args:
        data: The text to convert.

    Returns:
        The text with each of the ten digit characters substituted; all other
        characters are returned unchanged.
    """
    digit_pairs = (
        ('0', '٠'),
        ('1', '١'),
        ('2', '٢'),
        ('3', '٣'),
        ('4', '٤'),
        ('5', '٥'),
        ('6', '٦'),
        ('7', '٧'),
        ('8', '٨'),
        ('9', '٩'),
    )
    # The replacements never contain a digit that a later pass looks for, so
    # the order of the passes does not affect the result.
    for western_digit, arabic_digit in digit_pairs:
        data = re.sub(western_digit, arabic_digit, data)
    return data


def remove_duplicate_quotes(data):
    """Squeeze each run of consecutive straight double quotes down to one.

    Args:
        data: The text to normalise.

    Returns:
        The text in which no two straight double quotes are adjacent.
    """
    quote_run = re.compile(r'"+')
    return quote_run.sub('"', data)


def replace_extra_whitespaces_into_one(data):
    """Collapse every run of space characters into a single space.

    Only the plain space character is affected; tabs and newlines are left
    as they are.

    Args:
        data: The text to normalise.

    Returns:
        The text with no two consecutive space characters.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', data)


def apply_all(data):
    """Run the complete text-cleaning pipeline on a single document.

    Steps, in order: collapse blank lines, strip tatweel runs, deduplicate
    straight double quotes, pad special characters with spaces, convert
    digits to Arabic numerals, and collapse repeated spaces.

    Args:
        data: The raw document text.

    Returns:
        The cleaned text.
    """
    data = remove_extra_newlines(data)
    data = remove_long_letters(data)
    # Quote deduplication has to happen before punctuation is padded: once
    # every quote is surrounded by spaces no two quotes are adjacent any more,
    # so running it afterwards can never match anything.
    data = remove_duplicate_quotes(data)
    data = create_space_between_special_characters(data)
    data = to_arabic_digits(data)
    # Padding can leave several spaces in a row; squeeze them last.
    data = replace_extra_whitespaces_into_one(data)
    return data


def preprocess(df, text_column_name='text'):
    """Clean one text column of a DataFrame with ``apply_all``.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.

    Args:
        df: DataFrame holding the documents.
        text_column_name: Name of the column containing the text to clean.

    Returns:
        ``df`` itself, with the text column replaced by its cleaned version.
    """
    cleaned_text = df[text_column_name].apply(apply_all)
    df[text_column_name] = cleaned_text
    return df


Cleaning the data and getting rid of unnecessary tuples (rows)

In [7]:
def _drop_non_text_rows_and_clean(df, text_column_name='text'):
    """Return a cleaned copy of ``df`` restricted to rows with string text.

    Rows whose text value is not a ``str`` (for example a missing value) are
    discarded; the text of the remaining rows is cleaned with ``preprocess``.
    The surviving rows keep their original index labels.

    Args:
        df: DataFrame holding the documents.
        text_column_name: Name of the column containing the text.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    has_text = df[text_column_name].map(lambda value: isinstance(value, str))
    # Work on an explicit copy so that preprocess() writes to the filtered
    # frame rather than to a slice of the original.
    return preprocess(df.loc[has_text].copy(), text_column_name)


# Assigning to the rows yielded by iterrows() is not reliably written back to
# the frame, so the cleaned text has to be stored through a column assignment
# on the frame itself.
articles1_df = _drop_non_text_rows_and_clean(articles1_df)
articles2_df = _drop_non_text_rows_and_clean(articles2_df)




In [8]:
# Write both corpora to CSV files in the working directory.
articles1_df.to_csv(path_or_buf='articles1.csv')
articles2_df.to_csv(path_or_buf='articles2.csv')