## **Set up the environment**

In [1]:
import numpy as np 
import pandas as pd
import os, time, random

### **Access Dataset**

In [2]:
os.environ['KAGGLE_USERNAME'] = "kingalagbe"
os.environ['KAGGLE_KEY'] = "9d829fe67ac875f5eda3761e81b3811e"

!kaggle datasets download -d welcomehere/restoringdiacritics1

Downloading restoringdiacritics1.zip to /kaggle/working
 95%|██████████████████████████████████████▏ | 185M/194M [00:09<00:00, 25.8MB/s]
100%|████████████████████████████████████████| 194M/194M [00:09<00:00, 22.4MB/s]


### **Extract the dataset**

In [3]:
#Extraction process
# Unpacks the downloaded archive into the working directory and prints how
# long the extraction took.

#1.0
from zipfile import ZipFile

#2.0
file_name = "./restoringdiacritics1.zip"

#3.0
# The handle is called `archive` (not `zip`) so that the built-in zip() is not
# shadowed for every cell that runs after this one.
with ZipFile(file_name, 'r') as archive:
  #archive.printdir()
  lst = archive.namelist()  # names of all archive members, kept for inspection

  #3.1
  print('Extracting all the files now...')
  tick = time.time()
  archive.extractall()  # extracts into the current working directory
  tock = time.time()
  print(f'Done!\n{round((tock-tick),2)} seconds...')

Extracting all the files now...
Done!
11.06 seconds...


In [4]:
# Delete the downloaded archive; its contents were already extracted above.
os.remove('./restoringdiacritics1.zip')

In [5]:
# Directory that holds the extracted CSV files.
path = "./"

# Build both file locations first, then read them.
train_csv = path + 'train.csv'
test_csv = path + 'test.csv'

# train.csv is read as a zip-compressed file; test.csv is read as plain CSV.
train_df = pd.read_csv(train_csv, compression='zip')
test_df = pd.read_csv(test_csv)

### **Process Train Data**

In [6]:
# Number of rows in the training frame.
len(train_df)

1185126

In [7]:
# (rows, columns) of the training frame.
train_df.shape

(1185126, 1)

In [8]:
# Preview the first five rows, transposed so each sentence is shown as a column.
train_df.head().T

Unnamed: 0,0,1,2,3,4
labels,A kì í ṣíwájú ẹlẹ́èẹ́dẹ́.,Èpè-é pọ̀ ju ohun tó nù lọ; abẹ́rẹ́ sọnù a gbé...,"Bí a kò bá ṣe bí ẹlẹ́dẹ̀ lọ́nà Ìkòròdú, a ò lè...",A kì í fi idà pa ìgbín.,"Ọkùnrin kì í ké, akọ igi kì í ṣoje."


In [9]:
# Preview the last five rows, transposed so each sentence is shown as a column.
train_df.tail().T

Unnamed: 0,1185121,1185122,1185123,1185124,1185125
labels,"13. B'ó bá k'ẹ̀gbẹ́ sí ẹ, kò bọ́si rárá-o. Fac...","14. B'ó bá bẹ̀rẹ̀ mọ́'lẹ̀, kò bọ́si rárá. Face...","15. À mọ́ kò lọ rántí wípé, láti [r]ìbà'dí ẹ t...","16. Láti [r]ìbà'dí ẹ títí lọ dé òkè orí, k'ó m...","17. Synchro System, ẹ máa jó!"


In [10]:
# Spot-check a single raw sentence: first cell of the row at position 100,000.
train_df.iloc[100_000].values[0]

'Ọ̀gá kan wà níbẹ̀ tó fẹ́ràn kó máa fi kòbókò na àwọn ọkùnrin náà .'

In [11]:
#Clean data
#Keep only sentences that contain accented (diacritic) letters; plain-ASCII
#lines such as English text are dropped.

def _has_diacritic(text):
    """Return True when `text` contains at least one accented character.

    A character counts as accented when its code point lies strictly between
    127 and 300 (Latin-1 Supplement / start of Latin Extended-A) or strictly
    between 7680 and 7935 (Latin Extended Additional).

    Every character is examined, not just the one with the highest code
    point, so one stray symbol outside both ranges (e.g. a curly quote)
    no longer causes an otherwise valid sentence to be discarded. An empty
    string simply yields False instead of raising.
    """
    return any(
        (127 < ord(ch) < 300) or (7680 < ord(ch) < 7935)
        for ch in text.lower()
    )

# Non-string cells (e.g. NaN from blank CSV rows) are skipped rather than
# crashing on `.lower()`; surviving sentences are whitespace-trimmed.
cleaned_data = [
    value.strip()
    for value in train_df.labels.values
    if isinstance(value, str) and _has_diacritic(value)
]

In [12]:
# Eyeball five randomly chosen cleaned sentences (.T is a no-op on a Series).
pd.Series(cleaned_data).sample(5).T

471724     Bákan náà ni ó rí ní àkókò ti Lọti. Wọ́n ń jẹ,...
1090473    Ẹ máa yẹ́ gbogbo eniyan sí. Ẹ máa fẹ́ràn àwọn ...
1034013                                                 àìjà
719268     Dájúdájú , ìfẹ́ aládùúgbò wa jẹ́ ọ̀kan lára àm...
1051501    Nígbà tí o bá ń rìn, o kò ní rí ìdínà,nígbà tí...
dtype: object

In [13]:
# Number of sentences that passed the cleaning filter.
len(cleaned_data)

1185126

In [14]:
# Seed Python's built-in `random` module so the shuffle that follows is repeatable.
random.seed(42)

In [15]:
# Shuffle the cleaned sentences in place; the order depends on the current
# state of the `random` module, so re-running only this cell reshuffles again.
random.shuffle(cleaned_data)

In [16]:
# Write the training frame back to train.csv (no index column), then report
# its dtypes and deep memory usage.
# NOTE(review): this saves `train_df`, not `cleaned_data`, and it replaces the
# train.csv that was loaded with compression='zip' by an uncompressed file —
# confirm this overwrite is intended.
train_df.to_csv("train.csv", index = False)#, compression = 'zip')
train_df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185126 entries, 0 to 1185125
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   labels  1185126 non-null  object
dtypes: object(1)
memory usage: 275.4 MB


In [17]:
# Put the shuffled, cleaned sentences into a one-column frame named 'labels'
# and preview the first five rows.
df = pd.DataFrame({'labels': cleaned_data})
df.head(5)

Unnamed: 0,labels
0,"Lẹ́yìn tí wọ́n ṣègbéyàwó , aya náà sọ pé ẹ̀rí ..."
1,"wọ̀nyí ni nwọ́n níláti pamọ́ ní mímọ́, tí a sì..."
2,"Jòsáyà , tí í ṣe ọmọ Ámónì , ló wà lórí ìtẹ́ b..."
3,"Bí ọkùnrin méjì bá ń jà, tí ìyàwó ọ̀kan nínú w..."
4,"Lẹ́yìn èyí , Mósè gba àwọn wàláà òkúta méjì tí..."


In [18]:
# Column dtypes, non-null counts and deep memory usage of the cleaned frame.
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185126 entries, 0 to 1185125
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   labels  1185126 non-null  object
dtypes: object(1)
memory usage: 275.4 MB


In [19]:
# Lower-case every sentence in the 'labels' column.
label_text = df["labels"]
unstripped = label_text.str.lower()

In [20]:
#remove punctuation marks and digits
# One explicit regular expression replaces the former per-token loop:
#   \d+      -> any run of digits
#   [ ... ]  -> each listed punctuation mark, matched literally
# The pattern is a raw string and `regex=True` is passed explicitly, so the
# result no longer depends on the pandas version's default for `regex`
# (tokens such as '*', '?', '(' or '[' are not valid regexes on their own,
# and with regex=False the '\d+' token would be matched as literal text).
_PUNCT_AND_DIGITS = r'\d+|[/*.&#,?\-_()\[\]!%]'
unstripped = unstripped.str.replace(_PUNCT_AND_DIGITS, '', regex=True)

  
  


In [21]:
# Trim surrounding whitespace from each cleaned sentence, then wrap it with a
# leading '\t ' and a trailing ' \n'
# (presumably sequence start/end markers for the downstream model — confirm).
trimmed_labels = unstripped.astype(str).str.strip()
df['labels'] = '\t ' + trimmed_labels + ' \n'

In [22]:
#Strip Texts from Diacritics
from unidecode import unidecode

# Transliterate each cleaned sentence to its plain-ASCII form, keeping order.
stripped_letters = [unidecode(sentence) for sentence in unstripped]

In [23]:
#stripped_letters

In [24]:
# Store the diacritic-free sentences, whitespace-trimmed, as the 'feature'
# column; assignment aligns on the index of the freshly built Series.
ascii_sentences = pd.Series(stripped_letters)
df["feature"] = ascii_sentences.str.strip()

In [25]:
# Preview the paired columns: wrapped 'labels' next to the ASCII 'feature'.
df.head()

Unnamed: 0,labels,feature
0,\t lẹ́yìn tí wọ́n ṣègbéyàwó aya náà sọ pé ẹ̀r...,leyin ti won segbeyawo aya naa so pe eri okan...
1,\t wọ̀nyí ni nwọ́n níláti pamọ́ ní mímọ́ tí a ...,wonyi ni nwon nilati pamo ni mimo ti a si gbe ...
2,\t jòsáyà tí í ṣe ọmọ ámónì ló wà lórí ìtẹ́ ...,josaya ti i se omo amoni lo wa lori ite bayii
3,\t bí ọkùnrin méjì bá ń jà tí ìyàwó ọ̀kan nínú...,bi okunrin meji ba n ja ti iyawo okan ninu won...
4,\t lẹ́yìn èyí mósè gba àwọn wàláà òkúta méjì ...,leyin eyi mose gba awon walaa okuta meji ti o...


In [26]:
# Set the global git identity, then echo both settings back to verify them.
# NOTE(review): the e-mail value has no dot before "com" — looks like a typo; confirm.
!git config --global user.name "Crinmatic"
!git config --global user.email "Oluseunalagbee@gmailcom"
!git config --global user.name
!git config --global user.email

Crinmatic
Oluseunalagbee@gmailcom


In [27]:
# Persist the processed training pairs (labels + feature) without the index
# column, then report dtypes and deep memory usage of the frame.
df.to_csv("new_train.csv", index = False)#, compression = 'zip')
df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185126 entries, 0 to 1185125
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   labels   1185126 non-null  object
 1   feature  1185126 non-null  object
dtypes: object(2)
memory usage: 427.3 MB


In [28]:
# Long-format listing of everything in the working directory; sub-directories
# matched by `*` have their contents listed as well.
!ls -l *

----------  1 root root     50299 Jan 23 01:40 __notebook__.ipynb
-rw-r--r--  1 root root 232322485 Jan 23 01:40 new_train.csv
-rw-r--r--  1 root root    188284 Jan 23 01:37 test.csv
-rw-r--r--  1 root root 139519307 Jan 23 01:38 train.csv

yoruba-text:
total 176
drwxr-xr-x 2 root root  4096 Jan 23 01:37 Alabi_YorubaTwi_Embedding
drwxr-xr-x 2 root root  4096 Jan 23 01:37 Asubiaro_LangID
drwxr-xr-x 4 root root  4096 Jan 23 01:37 Bibeli_Mimo
drwxr-xr-x 3 root root  4096 Jan 23 01:37 Book_of_Mormon
drwxr-xr-x 2 root root  4096 Jan 23 01:37 Iroyin
drwxr-xr-x 2 root root  4096 Jan 23 01:37 JW300
-rw-r--r-- 1 root root 35149 Jan 23 01:37 LICENSE
drwxr-xr-x 3 root root  4096 Jan 23 01:37 LagosNWU
drwxr-xr-x 2 root root  4096 Jan 23 01:37 Lesika
drwxr-xr-x 4 root root  4096 Jan 23 01:37 OCR_Text
drwxr-xr-x 4 root root  4096 Jan 23 01:37 Owe
drwxr-xr-x 2 root root 57344 Jan 23 01:37 Quran_Mimo
-rw-r--r-- 1 root root  2747 Jan 23 01:37 README.md
drwxr-xr-x 2 root root  4096 J

### **Process Test Data**

In [29]:
# (rows, columns) of the test frame.
test_df.shape

(4334, 1)

In [30]:
# Preview the first five test rows, transposed so each sentence is shown as a column.
test_df.head().T

Unnamed: 0,0,1,2,3,4
labels,ṣùgbọ́n ọlọpáá alábòójútó ìhámọ...,ẹ̀bẹ̀ mo bẹ̀ ọ́ olúwa,áfíríkà nínú ìgbìmọ̀ ààbò àjọ,ó kà nítòrì nípa lílọ,jáde ilé ìwé àgbà yunifásítì


In [31]:
# Lower-case every sentence in the test 'labels' column.
test_label_text = test_df["labels"]
nonstripped = test_label_text.str.lower()

In [32]:
#remove punctuation marks and digits
# A single raw-string regular expression with `regex=True` given explicitly:
#   \d+      -> any run of digits
#   [ ... ]  -> each listed punctuation mark, matched literally
# This avoids relying on the pandas default for `regex` (which differs across
# versions) and on bare tokens like '*', '?', '(' or '[' that are not valid
# regular expressions by themselves.
_TEST_PUNCT_AND_DIGITS = r'\d+|[/*.&#,?\-_()\[\]!%]'
nonstripped = nonstripped.str.replace(_TEST_PUNCT_AND_DIGITS, '', regex=True)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [33]:
# Trim surrounding whitespace from each cleaned test sentence, then wrap it
# with a leading '\t ' and a trailing ' \n'
# (presumably sequence start/end markers for the downstream model — confirm).
trimmed_test_labels = nonstripped.astype(str).str.strip()
test_df['labels'] = '\t ' + trimmed_test_labels + ' \n'

In [34]:
#Strip Texts from Diacritics
from unidecode import unidecode

# Transliterate each cleaned test sentence to plain ASCII, preserving order.
stripped_chr = list(map(unidecode, nonstripped))

In [35]:
#stripped_chr

In [36]:
# Store the diacritic-free test sentences, whitespace-trimmed, as the
# 'feature' column; assignment aligns on the index of the new Series.
ascii_test_sentences = pd.Series(stripped_chr)
test_df["feature"] = ascii_test_sentences.str.strip()

In [37]:
# Preview the paired test columns: wrapped 'labels' next to the ASCII 'feature'.
test_df.head()

Unnamed: 0,labels,feature
0,\t ṣùgbọ́n ọlọpáá alábòójútó ìhá...,sugbon olopaa alaboojuto ihamo le
1,\t ẹ̀bẹ̀ mo bẹ̀ ọ́ olúwa \n,ebe mo be o oluwa
2,\t áfíríkà nínú ìgbìmọ̀ ààbò àjọ \n,afirika ninu igbimo aabo ajo
3,\t ó kà nítòrì nípa lílọ \n,o ka nitori nipa lilo
4,\t jáde ilé ìwé àgbà yunifásítì \n,jade ile iwe agba yunifasiti


In [38]:
# Persist the processed test pairs (labels + feature) without the index
# column, then report dtypes and deep memory usage of the frame.
test_df.to_csv("new_test.csv", index = False)
test_df.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4334 entries, 0 to 4333
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   labels   4334 non-null   object
 1   feature  4334 non-null   object
dtypes: object(2)
memory usage: 996.3 KB
