In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score, f1_score
from nltk.corpus import stopwords



In [2]:
pip install num2words


Note: you may need to restart the kernel to use updated packages.


In [3]:
from num2words import num2words


In [4]:
# Folder holding the TSV splits, relative to this notebook.
data_folder_path = "../Data/"
# All three splits are tab-separated; read them in one pass.
train_df, validation_df, testing_df = (
    pd.read_csv(f"{data_folder_path}{file_name}", sep="\t")
    for file_name in ("train.tsv", "validation.tsv", "test_unlabaled.tsv")
)

In [5]:
# Remove Arabic stop words from every training article.
arabic_stopwords=stopwords.words('arabic')
# A set makes the per-word membership test constant-time; the list above stays
# available under its original name for the cells that follow.
stopword_lookup = set(arabic_stopwords)
train_df1 = train_df.copy()
# Every kept word is prefixed with a single space, which is the same text
# layout the other cleaning cells produce.
train_df1['Article'] = [
    "".join(" " + word for word in article.split() if word not in stopword_lookup)
    for article in train_df['Article']
]

In [6]:
# Drop every whitespace-separated token that begins with '<' or '-' from the
# training articles; the surviving tokens are re-joined, each preceded by a space.
train_df2 = train_df1.copy()
for row_label, text in enumerate(train_df2['Article']):
    kept_tokens = [
        " " + token
        for token in text.split()
        if not token.startswith(('<', '-'))
    ]
    train_df2.loc[row_label, 'Article'] = "".join(kept_tokens)

In [7]:
# Remove Arabic stop words from every validation article.
validation_df1 = validation_df.copy()
# Set lookup keeps the per-word membership test constant-time.
stopword_lookup = set(arabic_stopwords)
# Every kept word is prefixed with a single space, which is the same text
# layout the other cleaning cells produce.
validation_df1['Article'] = [
    "".join(" " + word for word in article.split() if word not in stopword_lookup)
    for article in validation_df['Article']
]

In [8]:
# Drop every whitespace-separated token that begins with '<' or '-' from the
# validation articles; the surviving tokens are re-joined, each preceded by a space.
validation_df2 = validation_df1.copy()
for row_label, text in enumerate(validation_df2['Article']):
    kept_tokens = [
        " " + token
        for token in text.split()
        if not token.startswith(('<', '-'))
    ]
    validation_df2.loc[row_label, 'Article'] = "".join(kept_tokens)

In [9]:
# Remove Arabic stop words from every test article.
testing_df1 = testing_df.copy()
# Set lookup keeps the per-word membership test constant-time.
stopword_lookup = set(arabic_stopwords)
# Every kept word is prefixed with a single space, which is the same text
# layout the other cleaning cells produce.
testing_df1['Article'] = [
    "".join(" " + word for word in article.split() if word not in stopword_lookup)
    for article in testing_df['Article']
]

In [10]:
# Drop every whitespace-separated token that begins with '<' or '-' from the
# test articles; the surviving tokens are re-joined, each preceded by a space.
testing_df2 = testing_df1.copy()
for row_label, text in enumerate(testing_df2['Article']):
    kept_tokens = [
        " " + token
        for token in text.split()
        if not token.startswith(('<', '-'))
    ]
    testing_df2.loc[row_label, 'Article'] = "".join(kept_tokens)

In [11]:
# Characters that get blanked out of the article text. The backslash is now
# written as "\\" (the old "\]" was an invalid escape sequence); the resulting
# string value is unchanged: a backslash followed by ']'.
symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
# Map every symbol to a space and apply the table once per article, instead of
# issuing one np.char.replace call per (symbol, row) pair.
symbol_to_space = str.maketrans(symbols, ' ' * len(symbols))
train_df2['Article'] = train_df2['Article'].str.translate(symbol_to_space)

In [12]:
# Replace every character listed in `symbols` with a space in the validation
# articles. A single translation pass per article replaces the previous
# one-call-per-(symbol, row) loop.
symbol_to_space = str.maketrans(symbols, ' ' * len(symbols))
validation_df2['Article'] = validation_df2['Article'].str.translate(symbol_to_space)

In [13]:
# Replace every character listed in `symbols` with a space in the test
# articles. A single translation pass per article replaces the previous
# one-call-per-(symbol, row) loop.
symbol_to_space = str.maketrans(symbols, ' ' * len(symbols))
testing_df2['Article'] = testing_df2['Article'].str.translate(symbol_to_space)

In [14]:
# Spell out purely numeric tokens of the training articles as words via num2words.
train_df3 = train_df2.copy()
spelled_articles = []
for article in train_df2['Article']:
    pieces = []
    for word in article.split():
        if word.isnumeric():
            try:
                # Only rebind `word` once the conversion has succeeded. Before,
                # `word` became a float first, so a failure inside num2words
                # made the fallback concatenate str + float and raise TypeError.
                word = num2words(float(word), lang='arabic')
            except Exception:
                # Best effort: keep the original token when it cannot be
                # converted. Unlike a bare `except`, KeyboardInterrupt and
                # SystemExit still propagate.
                pass
        pieces.append(" " + word)
    spelled_articles.append("".join(pieces))
train_df3['Article'] = spelled_articles

In [15]:
# Tokens that are exactly equal to one of these HTML tag names are discarded.
html_tags =["a","abbr","acronym","address","area","b","base","bdo","big","blockquote","body","br","button","caption","cite","code","col","colgroup","dd","del","dfn","div","dl","DOCTYPE","dt","em","fieldset","form","h1","h2","h3","h4","h5","h6","head","html","hr","i","img","input","ins","kbd","label","legend","li","link","map","meta","noscript","object","ol","optgroup","option","p","param","pre","q","samp","script","select","small","span","strong","style","sub","sup","table","tbody","td","textarea","tfoot","th","thead","title","tr","tt","ul","var"]
train_df4 = train_df3.copy()
for row_label, text in enumerate(train_df4['Article']):
    # Re-join the surviving tokens, each preceded by a single space.
    train_df4.loc[row_label, 'Article'] = "".join(
        " " + token for token in text.split() if token not in html_tags
    )

In [16]:
# Single-character tokens: a letter from this list is glued onto the preceding
# text without a space; any other single character is dropped.
non_connecting_letter = ["و" ,"ذ", "د" ,"ز" ,"ر","ا",'ه','م' ]
train_df5 = train_df4.copy()
# Iterate train_df4 (not train_df2, as before) so the number spelling and
# HTML-tag removal done in the previous steps are kept in train_df5, matching
# how the validation and test frames are processed.
# NOTE(review): the later cell that targets '،' in train_df4 runs after this
# copy is taken, so it cannot affect train_df5.
merged_articles = []
for article in train_df4['Article']:
    pieces = []
    for word in article.split():
        if len(word) != 1:
            pieces.append(' ' + word)
        elif word in non_connecting_letter:
            # `and`-style logic spelled out with plain branches; the old
            # `len(word)==1 & (...)` only worked because `&` binds tighter
            # than `==`.
            pieces.append(word)
        # Remaining case: a single character outside the list is discarded.
    merged_articles.append(''.join(pieces))
train_df5['Article'] = merged_articles

In [17]:
# Spell out purely numeric tokens of the validation articles as words via num2words.
validation_df3 = validation_df2.copy()
spelled_articles = []
for article in validation_df2['Article']:
    pieces = []
    for word in article.split():
        if word.isnumeric():
            try:
                # Only rebind `word` once the conversion has succeeded. Before,
                # `word` became a float first, so a failure inside num2words
                # made the fallback concatenate str + float and raise TypeError.
                word = num2words(float(word), lang='arabic')
            except Exception:
                # Best effort: keep the original token when it cannot be
                # converted. Unlike a bare `except`, KeyboardInterrupt and
                # SystemExit still propagate.
                pass
        pieces.append(" " + word)
    spelled_articles.append("".join(pieces))
validation_df3['Article'] = spelled_articles

In [18]:
# Spell out purely numeric tokens of the test articles as words via num2words.
testing_df3 = testing_df2.copy()
spelled_articles = []
for article in testing_df2['Article']:
    pieces = []
    for word in article.split():
        if word.isnumeric():
            try:
                # Only rebind `word` once the conversion has succeeded. Before,
                # `word` became a float first, so a failure inside num2words
                # made the fallback concatenate str + float and raise TypeError.
                word = num2words(float(word), lang='arabic')
            except Exception:
                # Best effort: keep the original token when it cannot be
                # converted. Unlike a bare `except`, KeyboardInterrupt and
                # SystemExit still propagate.
                pass
        pieces.append(" " + word)
    spelled_articles.append("".join(pieces))
testing_df3['Article'] = spelled_articles

In [19]:
# Tokens that are exactly equal to one of these HTML tag names are discarded.
html_tags =["a","abbr","acronym","address","area","b","base","bdo","big","blockquote","body","br","button","caption","cite","code","col","colgroup","dd","del","dfn","div","dl","DOCTYPE","dt","em","fieldset","form","h1","h2","h3","h4","h5","h6","head","html","hr","i","img","input","ins","kbd","label","legend","li","link","map","meta","noscript","object","ol","optgroup","option","p","param","pre","q","samp","script","select","small","span","strong","style","sub","sup","table","tbody","td","textarea","tfoot","th","thead","title","tr","tt","ul","var"]
validation_df4 = validation_df3.copy()
for row_label, text in enumerate(validation_df4['Article']):
    # Re-join the surviving tokens, each preceded by a single space.
    validation_df4.loc[row_label, 'Article'] = "".join(
        " " + token for token in text.split() if token not in html_tags
    )

In [20]:
# Tokens that are exactly equal to one of these HTML tag names are discarded.
html_tags =["a","abbr","acronym","address","area","b","base","bdo","big","blockquote","body","br","button","caption","cite","code","col","colgroup","dd","del","dfn","div","dl","DOCTYPE","dt","em","fieldset","form","h1","h2","h3","h4","h5","h6","head","html","hr","i","img","input","ins","kbd","label","legend","li","link","map","meta","noscript","object","ol","optgroup","option","p","param","pre","q","samp","script","select","small","span","strong","style","sub","sup","table","tbody","td","textarea","tfoot","th","thead","title","tr","tt","ul","var"]
testing_df4 = testing_df3.copy()
for row_label, text in enumerate(testing_df4['Article']):
    # Re-join the surviving tokens, each preceded by a single space.
    testing_df4.loc[row_label, 'Article'] = "".join(
        " " + token for token in text.split() if token not in html_tags
    )

In [21]:
# Delete the Arabic comma '،' from every training article.
# The previous loop compared the integer row index (not the character) with
# '،', so the condition was never true and no comma was ever removed.
# NOTE(review): train_df5 is copied from train_df4 in an earlier cell, so this
# cleanup does not reach the exported training CSV unless that cell runs after it.
train_df4['Article'] = train_df4['Article'].str.replace('،', '', regex=False)

In [22]:
# Delete the Arabic comma '،' from every validation article.
# The previous loop compared the integer row index (not the character) with
# '،', so the condition was never true and no comma was ever removed.
validation_df4['Article'] = validation_df4['Article'].str.replace('،', '', regex=False)

In [23]:
# Delete the Arabic comma '،' from every test article.
# The previous loop compared the integer row index (not the character) with
# '،', so the condition was never true and no comma was ever removed.
testing_df4['Article'] = testing_df4['Article'].str.replace('،', '', regex=False)

In [25]:
# Write the preprocessed training frame to CSV without the row index.
# NOTE(review): the destination is an absolute, machine-specific path.
train_csv_path = r'C:\Users\kgu96\Desktop\nlp\paragraph classifying\JUST-Mowjaz-Competition-main\train.csv'
train_df5.to_csv(train_csv_path, index=False)

In [27]:
# Single-character tokens: a letter from this list is glued onto the preceding
# text without a space; any other single character is dropped.
non_connecting_letter = ["و" ,"ذ", "د" ,"ز" ,"ر","ا",'ه','م' ]
validation_df5 = validation_df4.copy()
merged_articles = []
for article in validation_df4['Article']:
    pieces = []
    for word in article.split():
        if len(word) != 1:
            pieces.append(' ' + word)
        elif word in non_connecting_letter:
            # Plain branches replace the old `len(word)==1 & (...)` test, which
            # only gave the intended result because `&` binds tighter than `==`.
            pieces.append(word)
        # Remaining case: a single character outside the list is discarded.
    merged_articles.append(''.join(pieces))
validation_df5['Article'] = merged_articles

In [28]:
# Write the preprocessed validation frame to CSV without the row index.
# NOTE(review): the destination is an absolute, machine-specific path.
validation_csv_path = r'C:\Users\kgu96\Desktop\nlp\paragraph classifying\JUST-Mowjaz-Competition-main\validation.csv'
validation_df5.to_csv(validation_csv_path, index=False)

In [29]:
# Single-character tokens: a letter from this list is glued onto the preceding
# text without a space; any other single character is dropped.
non_connecting_letter = ["و" ,"ذ", "د" ,"ز" ,"ر","ا",'ه','م' ]
testing_df5 = testing_df4.copy()
merged_articles = []
for article in testing_df4['Article']:
    pieces = []
    for word in article.split():
        if len(word) != 1:
            pieces.append(' ' + word)
        elif word in non_connecting_letter:
            # Plain branches replace the old `len(word)==1 & (...)` test, which
            # only gave the intended result because `&` binds tighter than `==`.
            pieces.append(word)
        # Remaining case: a single character outside the list is discarded.
    merged_articles.append(''.join(pieces))
testing_df5['Article'] = merged_articles

In [30]:
# Write the preprocessed test frame to CSV without the row index.
# NOTE(review): the destination is an absolute, machine-specific path.
test_csv_path = r'C:\Users\kgu96\Desktop\nlp\paragraph classifying\JUST-Mowjaz-Competition-main\test.csv'
testing_df5.to_csv(test_csv_path, index=False)