In [1]:
!pip install farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [2]:
import pandas as pd
import re
from tqdm import tqdm
import json
import random
from farasa.stemmer import FarasaStemmer
import string

In [3]:
def clean(text):
    """Replace URLs, @-handles, tabs and newlines with a space, then trim.

    Each match is swapped for a single space (not removed), so runs of
    spaces can remain inside the result; only leading and trailing
    whitespace is stripped. Punctuation is left untouched.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    noise_patterns = (
        r"http\S+",  # URLs
        r"@[\w]*",  # @handles (a lone "@" matches too)
        r"\t",  # tabs
        r"\n",  # line breaks
    )
    for noise in noise_patterns:
        text = re.sub(noise, " ", text)
    return text.strip()


In [None]:
# Arabic stemmer: one module-level FarasaStemmer instance that ar_stem() reaches
# through the global name `stemmer_interactive`.
# NOTE(review): interactive=True presumably keeps a single Farasa backend
# process alive between stem() calls, to be shut down with terminate() once all
# stemming is finished — confirm against the farasapy documentation.
stemmer_interactive = FarasaStemmer(interactive=True)
# remove arabic stop words
def ar_remove_stop_words(sentence):
    """Drop a small fixed set of Arabic stop words from ``sentence``.

    The sentence is split on whitespace, tokens that exactly match one of the
    stop words are discarded, and the survivors are re-joined with single
    spaces (so whitespace runs are collapsed as a side effect).

    Args:
        sentence: whitespace-separated text.

    Returns:
        The sentence without the stop-word tokens.
    """
    stop_words = {'من', 'الى', 'إلى', 'عن', 'على', 'في', 'حتى'}
    kept = [word for word in sentence.split() if word not in stop_words]
    return " ".join(kept)

def remove_punc(text):
    """Delete ASCII punctuation and three Arabic punctuation marks.

    Args:
        text: the string to filter.

    Returns:
        ``text`` with every character found in ``string.punctuation`` or in
        the Arabic comma / semicolon / question-mark set removed. Nothing is
        inserted in place of a removed character.
    """
    arabic_marks = {'،', '؛', '؟'}
    banned = set(string.punctuation) | arabic_marks
    kept = [symbol for symbol in text if symbol not in banned]
    return ''.join(kept)

def white_space_fix(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    tokens = text.split()
    return ' '.join(tokens)


# normalize the arabic text
def normalize_arabic(text):
    """Map orthographic variants of Arabic letters onto a canonical form.

    Substitutions, applied in this order:
      * alef variants (hamza below/above, wasla, madda) -> bare alef
      * alef maqsura -> yeh
      * waw with hamza -> standalone hamza
      * yeh with hamza -> standalone hamza
      * teh marbuta -> heh

    Args:
        text: the Arabic string to normalise.

    Returns:
        The normalised string.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# stem the arabic text
def ar_stem(sentence, stemmer=None):
    """Stem every whitespace-separated token of ``sentence``.

    Fixes two defects of the previous loop: it passed the whole sentence to
    ``stem()`` on every iteration (yielding N copies of the stemmed sentence
    instead of N stems), and it called ``terminate()`` on the shared stemmer
    inside the loop, shutting it down before the remaining tokens and any
    later call could use it.

    Args:
        sentence: text whose tokens should be stemmed.
        stemmer: object exposing ``stem(str) -> str``. Defaults to the
            module-level ``stemmer_interactive`` instance.

    Returns:
        list[str]: one stem per token, in the original token order. An empty
        or whitespace-only sentence yields an empty list. Callers that need a
        single string can ``" ".join(...)`` the result.
    """
    if stemmer is None:
        # Resolved at call time, so the global instance is only required when
        # no explicit stemmer is supplied.
        stemmer = stemmer_interactive
    # The stemmer is deliberately NOT terminated here: it is shared across
    # calls and must stay alive until all stemming is finished.
    return [stemmer.stem(token) for token in sentence.split()]


def preprocess_arabic(text):
    """Run the Arabic preprocessing pipeline on ``text``.

    Steps, in order: strip punctuation, drop stop words, collapse whitespace,
    then stem. ``normalize_arabic`` is not applied in this pipeline.

    Args:
        text: the raw Arabic string.

    Returns:
        Whatever ``ar_stem`` returns for the tidied text.
    """
    without_punct = remove_punc(text)
    without_stops = ar_remove_stop_words(without_punct)
    tidy = white_space_fix(without_stops)
    return ar_stem(tidy)



  1%|▏         | 3.41M/241M [00:04<05:15, 754kiB/s]

In [None]:
def read_file(input_file, sep="\t", names=""):
    """Load a tabular file into a DataFrame.

    Args:
        input_file: path to the file. A path ending in ``.xlsx`` is read with
            ``pd.read_excel`` (``sep`` and ``names`` are ignored for it);
            anything else is read with ``pd.read_csv`` as UTF-8.
        sep: field separator for the CSV reader (tab by default).
        names: column names for a header-less file. The default empty string
            means "not given": the first row is then used as the header.

    Returns:
        pandas.DataFrame with the file contents.
    """
    if input_file.endswith(".xlsx"):
        return pd.read_excel(input_file)

    csv_options = {"sep": sep, "encoding": "utf-8"}
    if names != "":
        csv_options["names"] = names
    return pd.read_csv(input_file, **csv_options)


# Column layout of a qrels file, in file order.
qrels_columns = ["qid", "Q0", "docid", "relevance"]


def read_qrels_file(qrels_file_path):
    """Read a header-less, tab-separated qrels file.

    Only tab-separated input is handled; whitespace-separated files are not
    detected. The ``qid`` and ``docid`` columns are converted to strings after
    parsing.

    Args:
        qrels_file_path: path to the qrels file.

    Returns:
        pandas.DataFrame with the columns listed in ``qrels_columns``.
    """
    qrels = pd.read_csv(qrels_file_path, sep='\t', names=qrels_columns, encoding='utf-8')
    for key in ("qid", "docid"):
        qrels[key] = qrels[key].astype(str)
    return qrels

def prepare_data(path, column, id_type, id_column='docno'):
    """Load a two-column (docno, text) file and add a preprocessed text column.

    Args:
        path: file to load through ``read_file``; it is read as header-less
            with the column names ``docno`` and ``text``.
        column: name of the new column holding the result of ``clean``
            followed by ``preprocess_arabic`` applied to ``text``.
        id_type: name of the new column holding the identifier as a string.
        id_column: existing column to take the identifier from.

    Returns:
        pandas.DataFrame restricted to ``[id_type, 'text', column]``.
    """
    frame = read_file(path, names=['docno', 'text'])

    # Clean first, then run the Arabic pipeline on the cleaned text.
    frame[column] = frame['text'].apply(clean).apply(preprocess_arabic)

    # Identifiers are kept as strings.
    frame[id_type] = frame[id_column].astype(str)

    return frame[[id_type, 'text', column]]

In [None]:
# Task A passage collection: prepare_data() adds a preprocessed 'passage'
# column and a string 'pid' identifier column.
df_passage = prepare_data('/kaggle/input/dataset-raw/data/Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv', 'passage', 'pid')

# Task A queries for each split: preprocessed 'query' column, string 'qid' column.
df_query_train = prepare_data('/kaggle/input/dataset-raw/data/QQA23_TaskA_train.tsv', 'query', 'qid')
df_query_dev = prepare_data('/kaggle/input/dataset-raw/data/QQA23_TaskA_dev.tsv', 'query', 'qid')
df_query_test = prepare_data('/kaggle/input/dataset-raw/data/QQA23_TaskA_test.tsv', 'query', 'qid')

# Gold query-passage relevance judgements (qrels) for each split.
df_qppair_train = read_qrels_file("/kaggle/input/dataset-raw/data/qrels/QQA23_TaskA_qrels_train.gold")
df_qppair_dev = read_qrels_file("/kaggle/input/dataset-raw/data/qrels/QQA23_TaskA_qrels_dev.gold")
df_qppair_test=read_qrels_file("/kaggle/input/dataset-raw/data/qrels/QQA23_TaskA_ayatec_v1.2_qrels_test.gold")

In [None]:
# Reload the Task A frames from CSV files in the working directory. These
# assignments rebind the names built with prepare_data()/read_qrels_file().
# NOTE(review): presumably these CSVs are cached exports of those frames —
# confirm where they are written.
df_passage=pd.read_csv('passage.csv')
df_query_train=pd.read_csv('questionsTrain.csv')
# Fixed: this frame used to be bound only to the misspelled name
# `df_query_devpd`, so `df_query_dev` was never refreshed from the CSV.
df_query_dev=pd.read_csv('questionsDev.csv')
df_query_devpd=df_query_dev  # legacy alias, in case other cells use the old name
df_query_test=pd.read_csv('questionsTest.csv')
df_qppair_train=pd.read_csv('pairsTrain.csv')
df_qppair_dev=pd.read_csv('pairsDev.csv')
df_qppair_test=pd.read_csv('pairsTest.csv')

In [None]:
# Task B (QRCD) splits. Each file is JSON Lines (one JSON object per line),
# hence lines=True; every call yields one DataFrame row per record.
taskBTrain=pd.read_json('/kaggle/input/dataset-raw/QRCD/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl',lines=True)
taskBDev=pd.read_json('/kaggle/input/dataset-raw/QRCD/QQA23_TaskB_qrcd_v1.2_dev_preprocessed.jsonl',lines=True)
taskBTest=pd.read_json('/kaggle/input/dataset-raw/QRCD/QQA23_TaskB_qrcd_v1.2_test_gold_preprocessed.jsonl',lines=True)
taskBNA=pd.read_json('/kaggle/input/dataset-raw/QRCD/QQA23_TaskB_qrcd_v1.2_test_noAnswers_preprocessed.jsonl',lines=True)

In [None]:
def preprocessTaskB(data, preprocess=None):
    """Preprocess the answer texts of a Task B frame in place.

    For every answer dict in the ``answers`` column, ``text`` is replaced by
    its preprocessed form and ``end_char`` is recomputed as
    ``start_char + len(text)``. All other columns are left untouched.

    Fixes relative to the previous version:
      * ``data.columns`` is an attribute, not a method; calling it raised
        ``TypeError`` before any row was processed.
      * rows are iterated directly instead of being looked up with
        ``data[c][i]``, which only works when the index is exactly 0..n-1.

    Args:
        data: DataFrame whose ``answers`` cells are lists of dicts carrying at
            least ``text`` and ``start_char``.
        preprocess: callable applied to each answer text. Defaults to the
            module-level ``preprocess_arabic``.

    Returns:
        None. The dicts stored in ``data`` are mutated in place.
    """
    if preprocess is None:
        # Resolved at call time, so the default pipeline is only required when
        # no explicit callable is supplied.
        preprocess = preprocess_arabic

    if 'answers' not in data.columns:
        return

    for answer_list in data['answers']:
        # Skip cells that do not hold a list of answers (e.g. missing values).
        if not isinstance(answer_list, (list, tuple)):
            continue
        for answer in answer_list:
            answer['text'] = preprocess(answer['text'])
            answer['end_char'] = answer['start_char'] + len(answer['text'])
                    
                    
                

In [1]:
# Scratch check: number of CPUs this machine reports.
import multiprocessing
multiprocessing.cpu_count()

12

In [3]:
# Scratch arithmetic: `/` is true division, so the result is the float 5.0.
20/4

5.0