 
<img width="200px" height="200px" src='logo-en.png'/>

<br/>
<div style="text-align: center; font-size:20px; font-weight:bold; color: #212F3D">King Abdullah I School of Graduate Studies and Scientific Research</div><br/>
<div style="text-align: center; font-size:20px; font-weight:bold; color: #212F3D;">Data Augmentation using Transformers and Similarity Measures for Improving Arabic Text Classification</div><br/>
<div style="text-align: center; font-size:14px; font-weight:bold; color: #212F3D">Dania Refai<sup>1</sup>, Saleh Abu-Soud<sup>2</sup>, Mohammad Abdel-Rahman<sup>3</sup></div>
<br/>
<div style="text-align: left; font-size:14px; font-weight:normal; color: #212F3D">
    <sup>1</sup> Department of Computer Science, Princess Sumaya University for Technology (PSUT), Amman, Jordan</div>
<br/>
<div style="text-align: left; font-size:14px; font-weight:normal; color: #212F3D">
    <sup>2</sup> Department of Data Science, Princess Sumaya University for Technology (PSUT), Amman, Jordan</div>
<br/>
<div style="text-align: left; font-size:14px; font-weight:normal; color: #212F3D">
    <sup>3</sup> Department of Data Science, Princess Sumaya University for Technology (PSUT), Amman, Jordan</div>
<br/>

<div style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">
        Crosspending author: Dania Refai (<span style="text-align: left; font-size:16px; font-weight:bold; color: #6495ED">Dania.Refai@hotmail.com</span>).
</div>
<br/>
<hr/>

### <span style="text-align: left; font-size:20px; font-weight:bold; color: #C70039">General Notes and Directions</span> ###
<hr/>

> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp;Make sure you have pytorch installed on your machine. Moreover, if you want more information please refer to <a href="https://pytorch.org/">INSTALL PYTORCH</a> from their official website.</li>
> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp;Make sure your installed python version is 3.8</li>
> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp;Make sure you are running the commands INSIDE source code directory (<span style="color: #C70039">.\Implementation\</span>)</li>
> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp;Run the following commands in your command shell to create and activate a Virtualenv (<span style="color: #C70039">Windows based systems</span>):</li>
> <ol>    
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> set PATH=C:\Users\(<span style="text-align: left; font-size:14px; font-weight:bold; color: #C70039">-windows_user-</span>)\AppData\Local\Programs\Python\Python38\
    </li>
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> %PATH%\python.exe -m pip install --upgrade pip
    </li>   
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> %PATH%python.exe %PATH%Scripts\pip.exe install virtualenv 
    </li>    
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> %PATH%\python.exe -m virtualenv venv 
    </li>
> </ol>
> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp; Activate the virtual environment: </li>
> <ol>    
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> .\venv\Scripts\activate
    </li>  
> </ol>
> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp; Install requirements:</li>
> <ol>    
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> .\venv\Scripts\pip3 install python-dotenv
    </li>
> <li style="text-align: left; font-family:console; font-size:14px; font-weight:bold; color: #212F3D; list-style-type: none;">
       <span style="color: #C70039">cmd&gt;</span> .\venv\Scripts\pip3 install -r requirements.txt
    </li>   
> </ol>

> <li style="text-align: left; font-size:14px; font-weight:bold; color: #212F3D">&nbsp;Notebook Purpose: <span style="color: #C70039">Data Augmentation for ASTD dataset.</span></li>



### Imports

In [1]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
import warnings
warnings.filterwarnings('ignore')


### Utils

In [2]:
def calculate_bleu_scores(references, hypotheses):
    """
    Calculates BLEU 1-4 scores based on NLTK functionality

    Args:
        references: List of reference sentences
        hypotheses: List of generated sentences

    Returns:
        bleu_1, bleu_2, bleu_3, bleu_4: BLEU scores

    """
    #return len(references), len(hypotheses)
    bleu_1 = np.round(corpus_bleu(references, hypotheses, weights=(1.0, 0., 0., 0.)), decimals=2)
    bleu_2 = np.round(corpus_bleu(references, hypotheses, weights=(0.50, 0.50, 0., 0.)), decimals=2)
    bleu_3 = np.round(corpus_bleu(references, hypotheses, weights=(0.34, 0.33, 0.33, 0.)), decimals=2)
    bleu_4 = np.round(corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25)), decimals=2)
    return bleu_1, bleu_2, bleu_3, bleu_4 

# Functions
def check_label (label):
    for lbl in LABEL_TO_AUGMENT:
        if lbl.upper() == label.upper():
            return True
    return False        

def check_similarity_cofficient (given_value, label, current_sim_coff):
    if not check_label(label):
        return False
    else:
        try:
            v = float(given_value)
            if float(SIM_COFFICIENTS_THRESHOLDS[current_sim_coff.upper()]) >= v:
                return True
            else:
                return False
        except:
            print('exception')

### Dataset Loading

In [3]:
datasetname = 'ASTD'
datasetpath = "datasets/xls/ASTD-Unbalanced-Augmented-aragpt2-base.csv"
df = pd.read_csv( datasetpath, sep="\t", encoding='utf-8')
df.columns = ['text', 'label', 'new_text', 'all_text', 'original_embbedding', 'new_embbedding', 'ecu_similarity', 'cos_similarity', 'jacc_similarity','text_split', 'all_text_split', 'new_text_split', 'bleu_sim_1','bleu_sim_2', 'bleu_sim_3', 'bleu_sim_4'] 
df.head()

Unnamed: 0,text,label,new_text,all_text,original_embbedding,new_embbedding,ecu_similarity,cos_similarity,jacc_similarity,text_split,all_text_split,new_text_split,bleu_sim_1,bleu_sim_2,bleu_sim_3,bleu_sim_4
0,5 هاتلي اخوان أي حاجة مش تنوين ومش ضمير اخوان ...,NEG,!!.,5 هاتلي اخوان أي حاجة مش تنوين ومش ضمير اخوان ...,"0.014882844,-0.051557414,-0.028316082,0.014168...","0.01946623,-0.010952667,-0.039843258,-0.057320...",0.772223,0.446,0.037037,"['5', 'هاتلي', 'اخوان', 'أي', 'حاجة', 'مش', 'ت...","['5', 'هاتلي', 'اخوان', 'أي', 'حاجة', 'مش', 'ت...",['!!.'],0.89,0.89,0.88,0.88
1,دباسم يوسف عمل برنامج البرنامج و #فسسسسسس,NEG,لر على # الفيس _ بوك [رابط]بسم الله الرحمن الر...,دباسم يوسف عمل برنامج البرنامج و # فسسلر على #...,"0.016909812,0.015640503,-0.02446039,-0.0235670...","0.017838204,0.007064947,-0.03709342,-0.0264731...",0.205765,0.929,0.4375,"['دباسم', 'يوسف', 'عمل', 'برنامج', 'البرنامج',...","['دباسم', 'يوسف', 'عمل', 'برنامج', 'البرنامج',...","['لر', 'على', '#', 'الفيس', '_', 'بوك', '[رابط...",0.14,0.13,0.12,0.11
2,منذ عامين وحتى الآن كل ما قدمه أنصار تيارات ال...,NEG,.,منذ عامين وحتى الآن كل ما قدمه أنصار تيارات ال...,"0.026780926,0.009709039,-0.030822175,-0.033138...","0.022658788,-0.0036188036,-0.033782676,-0.0437...",0.237325,0.904,0.0,"['منذ', 'عامين', 'وحتى', 'الآن', 'كل', 'ما', '...","['منذ', 'عامين', 'وحتى', 'الآن', 'كل', 'ما', '...",['.'],0.95,0.95,0.95,0.95
3,#السعاده ان يكون من نحب بخير وعافيه فنحن نشعر ...,POS,.,# السعاده ان يكون من نحب بخير وعافيه فنحن نشعر...,"0.011575715,-0.0191376,-0.041333534,-0.0137087...","0.022658788,-0.0036188036,-0.033782676,-0.0437...",0.352307,0.829,0.0,"['#السعاده', 'ان', 'يكون', 'من', 'نحب', 'بخير'...","['#', 'السعاده', 'ان', 'يكون', 'من', 'نحب', 'ب...",['.'],0.79,0.78,0.78,0.77
4,درية شرف الدين امرأة على الوشين لا مهنية ولا ا...,NEG,في الشوارع.,درية شرف الدين امرأة على الوشين لا مهنية ولا ا...,"0.016909812,0.015640503,-0.02446039,-0.0235670...","0.017580768,-0.0027376027,-0.03825421,-0.04189...",0.390541,0.834,0.36,"['درية', 'شرف', 'الدين', 'امرأة', 'على', 'الوش...","['درية', 'شرف', 'الدين', 'امرأة', 'على', 'الوش...","['في', 'الشوارع.']",0.92,0.92,0.92,0.92


In [4]:
df['label'].value_counts()

label
NEG        1640
NEUTRAL     805
POS         776
Name: count, dtype: int64

### Calculating Similarity Measures

In [5]:
df['text_split'] = [list(x.split()) for x in df['text']]
df['all_text_split'] = [x.split() for x in df['all_text']]
df['new_text_split'] = [str(x).split() for x in df['new_text']]

In [6]:
 df[['bleu_sim_1','bleu_sim_2','bleu_sim_3','bleu_sim_4']] = [ calculate_bleu_scores ([[x]],[y]) for x, y in zip(df['text_split'], df['all_text_split'])]


In [7]:
 df[['bleu_sim_1','bleu_sim_1','bleu_sim_1','bleu_sim_1']]

Unnamed: 0,bleu_sim_1,bleu_sim_1.1,bleu_sim_1.2,bleu_sim_1.3
0,0.89,0.89,0.89,0.89
1,0.14,0.14,0.14,0.14
2,0.95,0.95,0.95,0.95
3,0.79,0.79,0.79,0.79
4,0.92,0.92,0.92,0.92
...,...,...,...,...
3216,0.76,0.76,0.76,0.76
3217,0.74,0.74,0.74,0.74
3218,0.02,0.02,0.02,0.02
3219,0.13,0.13,0.13,0.13


In [8]:
df["ecu_similarity"].mean()

0.33158923031772564

In [9]:
df["cos_similarity"].mean()

0.8526818791946309

In [10]:
df["jacc_similarity"].mean()

0.3624915367325659

In [11]:
df["bleu_sim_1"].mean()

0.3949177274138466

In [12]:
# Parameters
all_datasets= []
SIM_COFFICIENTS_THRESHOLDS = {'ECU': df["ecu_similarity"].mean(), 'COS':df["cos_similarity"].mean(), 'JAC':df["jacc_similarity"].mean(), 'BLEU':df["bleu_sim_1"].mean()}
LABEL_TO_AUGMENT = ['POS', 'NEUTRAL']
DATA_COLUMN = "text"
LABEL_COLUMN = "label"

In [13]:
SIM_COFFICIENTS_THRESHOLDS

{'ECU': 0.33158923031772564,
 'COS': 0.8526818791946309,
 'JAC': 0.3624915367325659,
 'BLEU': 0.3949177274138466}

### Augmentation (All-Text)

In [14]:
EcuDF = pd.DataFrame()
CosDF = pd.DataFrame()
JacDF = pd.DataFrame()
BleDF = pd.DataFrame()
cntr = 1

print('All text augmentation is started... ')
for index, row in df.iterrows():         
    tmpDF = { 'text': row[DATA_COLUMN], 'label': row[LABEL_COLUMN]}
    Ecu_value = row['ecu_similarity']
    Cos_value = row['cos_similarity']
    Jac_value = row['jacc_similarity']
    Bleu_value = row['bleu_sim_1']
    
    EcuDF = EcuDF.append(tmpDF, ignore_index = True)
    CosDF = CosDF.append(tmpDF, ignore_index = True)
    JacDF = JacDF.append(tmpDF, ignore_index = True)
    BleDF = BleDF.append(tmpDF, ignore_index=True)
    
    tmpDF = { 'text': row['all_text'], 'label': row[LABEL_COLUMN]}
    # Check similarity 
    if check_similarity_cofficient (Ecu_value, row[LABEL_COLUMN], 'ecu'):
        EcuDF = EcuDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Cos_value, row[LABEL_COLUMN], 'cos'):
        CosDF = CosDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Jac_value, row[LABEL_COLUMN], 'jac'):
        JacDF = JacDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Bleu_value, row[LABEL_COLUMN], 'bleu'):
        BleDF = BleDF.append(tmpDF, ignore_index = True)
        
print('All text augmentation is finished ... ')



All text augmentation is started... 


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# Export dataset
EcuDF.to_excel( "Augmented-Dataset/All/"+datasetname+"-Augmented-ECU-ALL-Text-Final.xlsx", encoding='utf-8', index=False)
CosDF.to_excel( "Augmented-Dataset/All/"+datasetname+"-Augmented-COS-ALL-Text-Final.xlsx", encoding='utf-8', index=False)
JacDF.to_excel( "Augmented-Dataset/All/"+datasetname+"-Augmented-JAC-ALL-Text-Final.xlsx", encoding='utf-8', index=False)
BleDF.to_excel( "Augmented-Dataset/All/"+datasetname+"-Augmented-BLE-ALL-Text-Final.xlsx", encoding='utf-8', index=False)

In [None]:
df.to_excel( "Augmented-Dataset/xls/ASTD-Unbalanced-Augmented-aragpt2-base.xlsx", encoding='utf-8', index=False)

### Augmentation (New-Text)

In [None]:
EcuDF = pd.DataFrame()
CosDF = pd.DataFrame()
JacDF = pd.DataFrame()
BleDF = pd.DataFrame()
cntr = 1

print('new text augmentation is started... ')
for index, row in df.iterrows():         
    tmpDF = { 'text': row[DATA_COLUMN], 'label': row[LABEL_COLUMN]}
    Ecu_value = row['ecu_similarity']
    Cos_value = row['cos_similarity']
    Jac_value = row['jacc_similarity']
    Bleu_value = row['bleu_sim_1']
    
    EcuDF = EcuDF.append(tmpDF, ignore_index = True)
    CosDF = CosDF.append(tmpDF, ignore_index = True)
    JacDF = JacDF.append(tmpDF, ignore_index = True)
    BleDF = BleDF.append(tmpDF, ignore_index=True)
    
    tmpDF = { 'text': row['new_text'], 'label': row[LABEL_COLUMN]}
    # Check similarity 
    if check_similarity_cofficient (Ecu_value, row[LABEL_COLUMN], 'ecu'):
        EcuDF = EcuDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Cos_value, row[LABEL_COLUMN], 'cos'):
        CosDF = CosDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Jac_value, row[LABEL_COLUMN], 'jac'):
        JacDF = JacDF.append(tmpDF, ignore_index = True)
    
    if check_similarity_cofficient (Bleu_value, row[LABEL_COLUMN], 'bleu'):
        BleDF = BleDF.append(tmpDF, ignore_index = True)
        
print('new text augmentation is finished ... ')



In [None]:
# Export dataset
EcuDF.to_excel( "Augmented-Dataset/New/"+datasetname+"-Augmented-ECU-New-Text-Final.xlsx", encoding='utf-8', index=False)
CosDF.to_excel( "Augmented-Dataset/New/"+datasetname+"-Augmented-COS-New-Text-Final.xlsx", encoding='utf-8', index=False)
JacDF.to_excel( "Augmented-Dataset/New/"+datasetname+"-Augmented-JAC-New-Text-Final.xlsx", encoding='utf-8', index=False)
BleDF.to_excel( "Augmented-Dataset/New/"+datasetname+"-Augmented-BLE-New-Text-Final.xlsx", encoding='utf-8', index=False)