In [16]:
import os
import re
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

In [17]:
# Root of the corpus, addressed relative to the directory the notebook is run from.
folder = os.path.join(os.getcwd(), *('..', 'data', 'korpus'))

# One sub-folder of `folder` per subject area.
subjects = 'Academic Culture European Law News Opinion Parliament Religion Sport'.split()

## Removing XML tags from <i>korpus</i>

All XML tags except the sentence markers `<s …>` and `</s>` are removed from the korpus files. The files are overwritten in place.

In [18]:
# Matches every tag except the sentence markers <s ...> and </s>.
# The lookahead requires the tag name to be exactly "s" (followed by whitespace,
# "/" or ">"), so tags that merely start with "s" (<span>, <source>, ...) are
# still stripped instead of being mistaken for sentence markers.
regex = re.compile(r'<(?!/?s[\s/>]).*?>')

def clean(text):
    """Return `text` with every tag removed except the <s ...> / </s> markers.

    Parameters
    ----------
    text : str
        Raw text that may contain XML-style tags.

    Returns
    -------
    str
        The same text with all matching tags replaced by the empty string.
    """
    return regex.sub('', text)

In [19]:
# Strip the unwanted tags from every corpus text file, overwriting each file in place.
for sbj in subjects:
    path = os.path.join(folder,sbj)

    # Sorted so the processing (and printed) order does not depend on the file system.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path,filename)

        # Leave sub-directories alone, and skip the <subject>.csv that a later cell
        # writes into this same folder: it is derived output, not corpus text.
        if filename.endswith('.csv') or not os.path.isfile(filepath):
            continue

        print(filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()

        cleaned = clean(text)

        # Only rewrite when something was removed, so re-running the cell on an
        # already-clean corpus does not truncate and rewrite every file again.
        if cleaned != text:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(cleaned)
    
    

malti03.academic.1.txt
malti03.culture.1.txt
malti03.law.txt
malti03.news.1.txt
malti03.opinion.1.txt
malti03.religion.2.txt
malti03.sport.1.txt


## Turning text file into dataset

In [20]:
# NOTE(review): this rebinds the notebook-wide `subjects` list to a single entry, so
# every later cell that loops over `subjects` only sees 'Academic'. It looks like a
# debugging leftover - confirm before building the full corpus.
subjects = ['Academic']

COLUMNS = ['Word','POS','Lemma','Root']

# Sentence boundary markers that survive the tag-stripping step.
start_tag = re.compile(r'<s id="[0-9]*">')
end_tag = re.compile(r'</s>')

for sbj in subjects:
    data = []
    path = os.path.join(folder,sbj)

    # Sorted so the row order of the csv does not depend on the file system.
    for filename in sorted(os.listdir(path)):
        # The csv written at the end of this loop lives in the same folder. Parsing
        # it on a re-run would add a bogus END row for every '</s>' line it contains.
        if filename.endswith('.csv'):
            continue

        filepath = os.path.join(path,filename)

        with open(filepath,'r', encoding='utf-8') as f:
            print(filename)

            # Stream the file line by line instead of loading it whole.
            for line_no, line in enumerate(f, start=1):
                # rstrip rather than [:-1]: a final line without a trailing newline
                # must not lose its last real character.
                d = line.rstrip('\n').split('\t')

                #Start and End Token
                if len(d) == 1:
                    if start_tag.search(d[0]): data.append(['<s>','START',None,None])
                    elif end_tag.search(d[0]): data.append(['</s>','END',None,None])
                    continue

                # Fail here, with the location, rather than later inside the
                # DataFrame constructor with an error that names no file or line.
                if len(d) > len(COLUMNS):
                    raise ValueError(
                        f'{filepath}:{line_no}: expected at most {len(COLUMNS)} '
                        f'tab-separated fields, found {len(d)}'
                    )

                # Pad short rows so every record has one value per column.
                data.append(d + [None] * (len(COLUMNS) - len(d)))

    df = pd.DataFrame(data, columns=COLUMNS)
    df.to_csv(os.path.join(path,sbj+'.csv'), index=False)

malti03.academic.1.txt
malti03.culture.1.txt
malti03.law.txt
malti03.news.1.txt
malti03.opinion.1.txt
malti03.religion.2.txt
malti03.sport.1.txt


## Joining individual csv files

We will create two versions of the joined corpus. `korpus.csv` simply concatenates the csv files of every subject, while `norm_korpus.csv` takes an equal number of bytes from each file, so that the frequency counts are not biased towards the subjects that have more text.

In [21]:
def _append_csv(dst, src_path, write_header, byte_limit=None):
    """Append the rows of one subject csv to an already open binary file.

    Parameters
    ----------
    dst : binary file object
        Destination opened in 'wb' mode.
    src_path : str
        Path of the csv to copy from. Its first line is treated as the header.
    write_header : bool
        Copy the header line too. Pass True for the first file only, otherwise
        the header text shows up as a data row in the middle of the joined file.
    byte_limit : int or None
        When given, stop before the row that would take the number of bytes
        consumed from this file (header included) past the limit. Only whole
        rows are copied, so the joined file never contains a truncated row.
    """
    # Binary mode: the budget comes from os.path.getsize and is therefore in bytes,
    # whereas a text-mode read(n) counts characters and multi-byte UTF-8 letters
    # would make the amount taken differ from file to file.
    with open(src_path, 'rb') as src:
        header = src.readline()
        consumed = len(header)

        if write_header and header:
            dst.write(header if header.endswith(b'\n') else header + b'\n')

        for row in src:
            if byte_limit is not None and consumed + len(row) > byte_limit:
                break
            consumed += len(row)
            # Guarantee a row terminator so the next file starts on a fresh line.
            dst.write(row if row.endswith(b'\n') else row + b'\n')


def _subject_csv(sbj):
    """Return the path of the csv generated for subject `sbj`."""
    return os.path.join(folder,sbj,sbj+'.csv')


# Version 1: every row of every subject.
with open(os.path.join(folder,'korpus.csv'), 'wb') as f1:
    for position, sbj in enumerate(subjects):
        _append_csv(f1, _subject_csv(sbj), write_header=(position == 0))
        print(f'{sbj} Finished')

print('All Finished\n')


# Size in bytes of the smallest subject csv.
min_size = min(os.path.getsize(_subject_csv(sbj)) for sbj in subjects)

# Version 2: at most min_size bytes (whole rows only) from each csv file.
with open(os.path.join(folder,'norm_korpus.csv'), 'wb') as f1:
    for position, sbj in enumerate(subjects):
        _append_csv(f1, _subject_csv(sbj), write_header=(position == 0), byte_limit=min_size)
        print(f'{sbj} Finished')

print('All Finished')


Academic Finished
All Finished

Academic Finished
All Finished


## Exploring Dataset

In [34]:
%%time

# Load only the token and tag columns of both joined corpora
# (original note: 40 million rows).
df, df_norm = (
    pd.read_csv(os.path.join(folder, csv_name),
                usecols=["Word","POS"],
                dtype={"Word": "U","POS": "S"})
    for csv_name in ('korpus.csv', 'norm_korpus.csv')
)

df.head(10)

Wall time: 155 ms


Unnamed: 0,Word,POS
0,</s>,END
1,</s>,END
2,</s>,END
3,</s>,END
4,</s>,END
5,</s>,END
6,</s>,END
7,</s>,END
8,</s>,END
9,</s>,END


### Cleaning

In [23]:
# Distinct part-of-speech tags present in the full corpus, in order of first appearance.
pd.unique(df['POS'])

array(['END'], dtype=object)

In [24]:
%%time
# Maltese Tagset: https://mlrs.research.um.edu.mt/resources/malti03/tagset30.html
# Tags whose rows are discarded; the labels are the ones given by the notebook author.
# The commented-out entries are tags that are deliberately kept for now.
unwanted_tags = [
    'X-PUN',    # Punctuation
    # 'X-DIG',  # Digits
    # 'X-ENG',  # English
    # 'X-FOR',  # Foreign
    # 'X-ABV',  # Abbreviations
    'X-BOR',    # Gibberish
    'INT',      # Interjections
]

# Keep every row whose tag is not in the list; the surviving rows keep their index labels.
df = df[~df['POS'].isin(unwanted_tags)]
df_norm = df_norm[~df_norm['POS'].isin(unwanted_tags)]


Wall time: 212 ms


In [25]:
# Share of missing values per column, as a percentage of the row count.
percent_missing = df.isna().sum().mul(100).div(len(df))

missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)

missing_value_df

Unnamed: 0,column_name,percent_missing
Word,Word,0.0
POS,POS,0.0


In [26]:
# Tokens that carry no lexical content: a bare double quote and the
# '&lt' / '&gt' / '&amp' entity remnants.
junk_tokens = ['"', '&lt', '&gt', '&amp']

# Keep rows that have a word and whose word is not one of the junk tokens.
df = df[df['Word'].notna() & ~df['Word'].isin(junk_tokens)]
df_norm = df_norm[df_norm['Word'].notna() & ~df_norm['Word'].isin(junk_tokens)]

In [27]:
# Re-check the share of missing values per column now that the junk rows are gone.
null_counts = df.isna().sum()
percent_missing = null_counts * 100 / len(df)

missing_value_df = pd.DataFrame(
    dict(column_name=df.columns, percent_missing=percent_missing)
)

missing_value_df

Unnamed: 0,column_name,percent_missing
Word,Word,0.0
POS,POS,0.0


In [28]:
%%time
# Persist both cleaned frames next to the joined files; the row index is not written.
for frame, out_name in ((df, 'korpus_clean.csv'),
                        (df_norm, 'norm_korpus_clean.csv')):
    frame.to_csv(os.path.join(folder, out_name), index=False)

Wall time: 1.03 s


## Frequency Counts

In [29]:
# Reload the cleaned corpora from disk so the counts below start from the saved files.
df, df_norm = (
    pd.read_csv(os.path.join(folder, csv_name),
                usecols=["Word","POS"],
                dtype={"Word": "U","POS": "S"})
    for csv_name in ("korpus_clean.csv", "norm_korpus_clean.csv")
)


In [30]:
def _frequency_table(frame):
    """Count how often each (Word, POS) pair occurs in `frame`.

    Returns a DataFrame with the columns Word, POS and Frequency, ordered from
    the most to the least frequent pair. Rows with a missing Word or POS are
    not counted (the default behaviour of value_counts).
    """
    # reset_index(name=...) names the count column directly. Indexing the result
    # of to_frame() with [0] only works on pandas < 2.0, because newer versions
    # name the counts 'count' and the lookup raises KeyError.
    return frame[['Word','POS']].value_counts().reset_index(name='Frequency')


#Get all unique words
df_frequency = _frequency_table(df)
df_normal_frequency = _frequency_table(df_norm)

df_frequency.to_csv(os.path.join(folder,'korpus_frequency.csv'), index=False)
df_normal_frequency.to_csv(os.path.join(folder,'norm_korpus_frequency.csv'), index=False)

df_frequency

Unnamed: 0,Word,POS,Frequency
0,</s>,END,373680


### Viewing most common nouns, adjectives or verbs.

In [31]:
# Rows of the full-corpus frequency table tagged as noun, adjective or verb.
df_frequency[df_frequency["POS"].isin(["NOUN", "ADJ", "VERB"])]

Unnamed: 0,Word,POS,Frequency


In [32]:
# Same selection on the size-normalised corpus: nouns, adjectives and verbs only.
df_normal_frequency[df_normal_frequency["POS"].isin(["NOUN", "ADJ", "VERB"])]

Unnamed: 0,Word,POS,Frequency
