In [5]:
import os
import re
import pandas as pd
import seaborn as sns

In [6]:
folder = os.path.join(os.getcwd(),'..','data','korpus')
subjects = ['Academic','Culture','European','Law','News',
            'Opinion','Parliament','Religion','Sport','Test']

## Removing XML tags from <i>korpus</i>

All tags except <b>\<s\></b> will be removed from the korpus.

In [7]:
regex = re.compile(r'<(?![/]?s).*?>')
def clean(text):
    return regex.sub('', text)

In [8]:
for sbj in subjects:
    path = os.path.join(folder,sbj)
    
    for filename in os.listdir(path):
        filepath = os.path.join(path,filename)

        with open(filepath, 'r', encoding='utf-8') as f:
            print(filename)
            text = f.read()
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(clean(text))
    
    

Academic1.txt
Academic2.txt
Culture1.txt
Culture2.txt
European.txt
Law.txt
News.txt
Opinion1.txt
Parliament.csv
Parliament2.txt
Religion.csv
Religion2.txt
Sport.csv
Sport1.txt
Test.csv
Test.txt


## Turning text file into dataset

In [12]:
for sbj in subjects:
    data = []
    path = os.path.join(folder,sbj)
    
    for filename in os.listdir(path):
        filepath = os.path.join(path,filename)
        
        with open(filepath,'r', encoding='utf-8') as f:
            print(filename)
            lines = f.readlines()
            
            for l in lines:
                d = l.split('\t')
                
                #Start and End Token
                if len(d) == 1:
                    if re.search(r'<s id="[0-9]*">', d[0]): data.append(['<s>','START',None,None])
                    elif re.search(r'</s>', d[0]):          data.append(['</s>','END',None,None])
                
                elif len(d) > 1:
                    d[-1] = d[-1][:-1]
                    data.append(d)
                    
    df = pd.DataFrame(data, columns=['Word','POS','Lemma','Root'])
    df.to_csv(os.path.join(path,sbj+'.csv'), index=False)

Academic1.txt
Academic2.txt
Culture1.txt
Culture2.txt
European.txt
Law.txt
News.txt
Opinion1.txt
Parliament2.txt
Religion2.txt
Sport1.txt
Test.txt


## Joining individual csv files

We will create 2 versions of korpus.csv. One will simply be all the csv files together while the other will read an equal number of bytes from each file. This is so the frequency counts won't be biased based on the subject of the texts.

Note that we do not include the text file in the training korpus

In [13]:
subjects = ['Academic','Culture','European','Law','News',
            'Opinion','Parliament','Religion','Sport',]

with open(os.path.join(folder,'korpus.csv'), 'w', encoding='utf-8') as f1:
    for sbj in subjects:
        with open(os.path.join(folder,sbj,sbj+'.csv'), 'r', encoding='utf-8') as f2:
            f1.write('\n')
            f1.write(f2.read())
            print(f'{sbj} Finished')

print('All Finished\n')


min_size = min([os.path.getsize(os.path.join(folder,sbj,sbj+'.csv')) for sbj in subjects])

#Only read min_length lines from each csv file. 

with open(os.path.join(folder,'norm_korpus.csv'), 'w', encoding='utf-8') as f1:
    for sbj in subjects:
        with open(os.path.join(folder,sbj,sbj+'.csv'), 'r', encoding='utf-8') as f2:    
            f1.write('\n')
            f1.write(f2.read(min_size))
            print(f'{sbj} Finished')

print('All Finished')


Academic Finished
Culture Finished
European Finished
Law Finished
News Finished
Opinion Finished
Parliament Finished
Religion Finished
Sport Finished
All Finished

Academic Finished
Culture Finished
European Finished
Law Finished
News Finished
Opinion Finished
Parliament Finished
Religion Finished
Sport Finished
All Finished


## Exploring Dataset

In [14]:
%%time

#70 million rows
df = pd.read_csv(os.path.join(folder,'korpus.csv'),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"},
                 nrows=70_000_000)

df_norm = pd.read_csv(os.path.join(folder,'norm_korpus.csv'),
                      usecols=["Word","POS"],
                      dtype={"Word": "U","POS": "S"})

df.head(10)

Wall time: 27.8 s


Unnamed: 0,Word,POS
0,<s>,START
1,L-,DEF
2,għan,NOUN
3,prinċipali,ADJ
4,ta',GEN
5,Conectando,NOUN-PROP
6,Mundos,NOUN-PROP
7,(,X-PUN
8,Malta,NOUN-PROP
9,),X-PUN


### Cleaning

In [15]:
df['POS'].unique()

array(['START', 'DEF', 'NOUN', 'ADJ', 'GEN', 'NOUN-PROP', 'X-PUN',
       'X-ABV', 'PREP', 'CONJ-CORD', 'PART-PASS', 'PREP-DEF', 'PRON-PERS',
       'COMP', 'VERB', 'END', 'LIL-DEF', 'CONJ-SUB', 'KIEN', 'GEN-PRON',
       'ADV', 'VERB-PSEU', 'NEG', 'GEN-DEF', 'QUAN', 'PRON-DEM', 'X-DIG',
       'PRON-INT', 'FOC', 'PREP-PRON', 'NUM-WHD', 'LIL', 'NUM-CRD',
       'X-ENG', 'X-FOR', 'PROG', 'INT', 'X-BOR', 'PRON-PERS-NEG',
       'LIL-PRON', 'PRON-INDEF', 'PRON-DEM-DEF', 'NUM-ORD', 'HEMM',
       'PRON-REF', 'PART-ACT', 'FUT', 'NUM-FRC', 'PRON-REC', 'POS'],
      dtype=object)

In [19]:
%%time
#Maltese Tagset: https://mlrs.research.um.edu.mt/resources/malti03/tagset30.html

df = df.drop(df[df['POS']=='X-PUN'].index) #Punctuation
# df = df.drop(df[df['POS']=='X-DIG'].index) #Digits
# df = df.drop(df[df['POS']=='X-ENG'].index) #English
# df = df.drop(df[df['POS']=='X-FOR'].index) #Foreign
# df = df.drop(df[df['POS']=='X-ABV'].index) #Abbreviations
df = df.drop(df[df['POS']=='X-BOR'].index) #Gibberish
df = df.drop(df[df['POS']=='INT'].index)   #Interjections


df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-PUN'].index) #Punctuation
# df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-DIG'].index) #Digits
# df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-ENG'].index) #English
# df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-FOR'].index) #Foreign
# df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-ABV'].index) #Abbreviations
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-BOR'].index) #Gibberish
df_norm = df_norm.drop(df_norm[df_norm['POS']=='INT'].index)   #Interjections


### Removing semi-colons

Semi colons will be used as delimters in ngrams later on. Having semicolons at the beginning or end of a word disrupts the delimiting process.

In [17]:
df['Word']

0                  <s>
1                   L-
2                 għan
3           prinċipali
4                  ta'
               ...    
69999995          żewġ
69999996        kwarti
69999997          tas-
69999998          sena
69999999          wara
Name: Word, Length: 62811891, dtype: object

In [20]:
df['Word'] = df['Word'].apply(lambda s: ''.join(str(s).split(';')), 1)
df_norm['Word'] = df_norm['Word'].apply(lambda s: ''.join(str(s).split(';')), 1)

In [21]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Word,Word,0.000726
POS,POS,0.0


In [22]:
df = df.dropna(subset=['Word'])
df = df.drop(df[df["Word"]=='"'].index)
df = df.drop(df[df["Word"]=='&lt'].index)
df = df.drop(df[df["Word"]=='&gt'].index)
df = df.drop(df[df["Word"]=='&amp'].index)

df_norm = df_norm.dropna(subset=['Word'])
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='"'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&lt'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&gt'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&amp'].index)

In [24]:
df.to_csv(os.path.join(folder,'korpus_clean.csv'), index=False)
df_norm.to_csv(os.path.join(folder,'norm_korpus_clean.csv'), index=False)

## Frequency Counts

In [25]:
df = pd.read_csv(os.path.join(folder, "korpus_clean.csv"),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"})

df_norm = pd.read_csv(os.path.join(folder, "norm_korpus_clean.csv"),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"})


In [26]:
#Get all unique words
df_frequency = df.value_counts().to_frame()[0].reset_index()
df_frequency.columns = ['Word','POS','Frequency']


df_normal_frequency = df_norm.value_counts().to_frame()[0].reset_index()
df_normal_frequency.columns = ['Word','POS','Frequency']

df_frequency.to_csv(os.path.join(folder,'korpus_frequency.csv'), index=False)
df_normal_frequency.to_csv(os.path.join(folder,'norm_korpus_frequency.csv'), index=False)