In [1]:
import os
import re
import pandas as pd
import seaborn as sns

In [2]:
folder = os.path.join(os.getcwd(),'..','data','korpus')
subjects = ['Academic','Culture','European','Law','News',
            'Opinion','Parliament','Religion','Sport']

## Removing HTML tags from <i>korpus</i>

In [None]:
expr = re.compile('<.*?>') 

def clean(html):
  html_clean = re.sub(expr, '', html)
  return html_clean

In [None]:
for sbj in subjects:
    path = os.path.join(folder,sbj)
    
    for filename in os.listdir(path):
        filepath = os.path.join(path,filename)
        
        with open(filepath,'r', encoding='utf-8') as f:
            print(filename)
            text = f.read()
        
        with open(filepath,'w', encoding='utf-8') as f:
            f.write(clean(text))
    
    

## Turning text file into dataset

In [3]:
for sbj in subjects:
    data = []
    path = os.path.join(folder,sbj)
    
    for filename in os.listdir(path):
        filepath = os.path.join(path,filename)
        
        with open(filepath,'r', encoding='utf-8') as f:
            print(filename)
            lines = f.readlines()
            
            for l in lines:
                d = l.split('\t')
                
                #Start and End Token
                if len(d) == 1:
                    if re.search(r'<s id="[0-9]*">', d[0]): data.append(['<s>','START',None,None])
                    elif re.search(r'</s>', d[0]):          data.append(['</s>','END',None,None])
                
                elif len(d) > 1:
                    d[-1] = d[-1][:-1]
                    data.append(d)
                    
    df = pd.DataFrame(data, columns=['Word','POS','Lemma','Root'])
    df.to_csv(os.path.join(path,sbj+'.csv'), index=False)

News2.txt


## Joining individual csv files

We will create 2 versions of korpus.csv. One will simply be all the csv files together while the other will read an equal number of bytes from each file. This is so the frequency counts won't be biased based on the subject of the texts.

In [6]:
with open(os.path.join(folder,'korpus.csv'), 'w', encoding='utf-8') as f1:
    for sbj in subjects:
        with open(os.path.join(folder,sbj,sbj+'.csv'), 'r', encoding='utf-8') as f2:
            f1.write('\n')
            f1.write(f2.read())
            print(f'{sbj} Finished')

print('All Finished\n')


min_size = min([os.path.getsize(os.path.join(folder,sbj,sbj+'.csv')) for sbj in subjects])

#Only read min_length lines from each csv file. 

with open(os.path.join(folder,'norm_korpus.csv'), 'w', encoding='utf-8') as f1:
    for sbj in subjects:
        with open(os.path.join(folder,sbj,sbj+'.csv'), 'r', encoding='utf-8') as f2:    
            f1.write('\n')
            f1.write(f2.read(min_size))
            print(f'{sbj} Finished')

print('All Finished')


Academic Finished
Culture Finished
European Finished
Law Finished
News Finished
Opinion Finished
Parliament Finished
Religion Finished
Sport Finished
All Finished

Academic Finished
Culture Finished
European Finished
Law Finished
News Finished
Opinion Finished
Parliament Finished
Religion Finished
Sport Finished
All Finished


## Exploring Dataset

In [3]:
%%time

#50 Million rows
df = pd.read_csv(os.path.join(folder,'korpus.csv'),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"},
                 nrows=50_000_000)

df_norm = pd.read_csv(os.path.join(folder,'norm_korpus.csv'),
                      usecols=["Word","POS"],
                      dtype={"Word": "U","POS": "S"})

df.head(10)

Wall time: 17.6 s


Unnamed: 0,Word,POS
0,L-,DEF
1,għan,NOUN
2,prinċipali,ADJ
3,ta',GEN
4,Conectando,NOUN-PROP
5,Mundos,NOUN-PROP
6,(,X-PUN
7,Malta,NOUN-PROP
8,),X-PUN
9,(,X-PUN


### Cleaning

In [4]:
df['POS'].unique()

array(['DEF', 'NOUN', 'ADJ', 'GEN', 'NOUN-PROP', 'X-PUN', 'X-ABV', 'PREP',
       'CONJ-CORD', 'PART-PASS', 'PREP-DEF', 'PRON-PERS', 'COMP', 'VERB',
       'LIL-DEF', 'CONJ-SUB', 'KIEN', 'GEN-PRON', 'ADV', 'VERB-PSEU',
       'NEG', 'GEN-DEF', 'QUAN', 'PRON-DEM', 'X-DIG', 'PRON-INT', 'FOC',
       'PREP-PRON', 'NUM-WHD', 'LIL', 'NUM-CRD', 'X-ENG', 'X-FOR', 'PROG',
       'INT', 'X-BOR', 'PRON-PERS-NEG', 'LIL-PRON', 'PRON-INDEF',
       'PRON-DEM-DEF', 'NUM-ORD', 'HEMM', 'PRON-REF', 'PART-ACT', 'FUT',
       'NUM-FRC', 'PRON-REC', 'POS'], dtype=object)

In [5]:
%%time
#Maltese Tagset: https://mlrs.research.um.edu.mt/resources/malti03/tagset30.html

df = df.drop(df[df['POS']=='X-PUN'].index) #Punctuation
df = df.drop(df[df['POS']=='X-DIG'].index) #Digits
df = df.drop(df[df['POS']=='X-ENG'].index) #English
df = df.drop(df[df['POS']=='X-FOR'].index) #Foreign
df = df.drop(df[df['POS']=='X-ABV'].index) #Abbreviations
df = df.drop(df[df['POS']=='X-BOR'].index) #Gibberish
df = df.drop(df[df['POS']=='INT'].index)   #Interjections


df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-PUN'].index) #Punctuation
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-DIG'].index) #Digits
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-ENG'].index) #English
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-FOR'].index) #Foreign
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-ABV'].index) #Abbreviations
df_norm = df_norm.drop(df_norm[df_norm['POS']=='X-BOR'].index) #Gibberish
df_norm = df_norm.drop(df_norm[df_norm['POS']=='INT'].index)   #Interjections


Wall time: 49.6 s


In [6]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Word,Word,0.000675
POS,POS,0.0


In [7]:
df = df.dropna(subset=['Word'])
df = df.drop(df[df["Word"]=='"'].index)
df = df.drop(df[df["Word"]=='&lt'].index)
df = df.drop(df[df["Word"]=='&gt'].index)
df = df.drop(df[df["Word"]=='&amp'].index)

df_norm = df_norm.dropna(subset=['Word'])
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='"'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&lt'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&gt'].index)
df_norm = df_norm.drop(df_norm[df_norm["Word"]=='&amp'].index)

In [8]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
Word,Word,0.0
POS,POS,0.0


In [9]:
%%time
df.to_csv(os.path.join(folder,'korpus_clean.csv'), index=False)
df_norm.to_csv(os.path.join(folder,'norm_korpus_clean.csv'), index=False)

Wall time: 55.9 s


## Frequency Counts

In [10]:
df = pd.read_csv(os.path.join(folder, "korpus_clean.csv"),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"})

df_norm = pd.read_csv(os.path.join(folder, "norm_korpus_clean.csv"),
                 usecols=["Word","POS"],
                 dtype={"Word": "U","POS": "S"})


In [11]:
#Get all unique words
df_frequency = df.value_counts().to_frame()[0].reset_index()
df_frequency.columns = ['Word','POS','Frequency']


df_normal_frequency = df_norm.value_counts().to_frame()[0].reset_index()
df_normal_frequency.columns = ['Word','POS','Frequency']

df_frequency.to_csv(os.path.join(folder,'korpus_frequency.csv'), index=False)
df_normal_frequency.to_csv(os.path.join(folder,'norm_korpus_frequency.csv'), index=False)

df_frequency

Unnamed: 0,Word,POS,Frequency
0,l-,DEF,1696176
1,li,COMP,1552199
2,ta',GEN,1418081
3,u,CONJ-CORD,1041506
4,tal-,GEN-DEF,919326
...,...,...,...
452797,kulju,NOUN,1
452798,Deleitosa,NOUN-PROP,1
452799,kuljun,NOUN,1
452800,kuljunar,ADV,1


### Viewing most common nouns, adjectives or verbs.

In [12]:
df_frequency[(df_frequency["POS"] == "NOUN")|
             (df_frequency["POS"] == "ADJ") |
             (df_frequency["POS"] == "VERB")]

Unnamed: 0,Word,POS,Frequency
45,Regolament,NOUN,86538
46,oħra,ADJ,85898
49,Kummissjoni,NOUN,82379
54,Artikolu,NOUN,80165
62,sena,NOUN,74874
...,...,...,...
452790,kuljunar,NOUN,1
452792,aprovazzjonijiet,NOUN,1
452796,kuljom,NOUN,1
452797,kulju,NOUN,1


In [13]:
df_normal_frequency[(df_normal_frequency["POS"] == "NOUN")|
                    (df_normal_frequency["POS"] == "ADJ") |
                    (df_normal_frequency["POS"] == "VERB")]

Unnamed: 0,Word,POS,Frequency
10,ta',VERB,19076
30,sena,NOUN,9759
37,f',NOUN,8457
41,oħra,ADJ,8097
66,Ministru,NOUN,5912
...,...,...,...
123496,jilliberaħ,VERB,1
123497,jillikwifikaw,VERB,1
123498,jillima,VERB,1
123501,jillimitax,VERB,1
