In [None]:
# Check which spell-checking providers and language dictionaries enchant can
# see on this machine (an Italian dictionary is requested further down via
# Dict('it')).
import enchant
broker = enchant.Broker()
# A cell only echoes its last expression, so the provider list has to be
# printed explicitly or its result is silently dropped.
print(broker.describe())
broker.list_languages()

In [None]:
import csv
import os

from enchant import Dict
from enchant.checker import SpellChecker
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import EnglishStemmer, ItalianStemmer
from nltk.tokenize import word_tokenize
from pathlib import Path

In [None]:
# Fetch the NLTK resources used below: the stop word lists and the Punkt
# sentence tokenizer models that word_tokenize relies on.
download('stopwords')
download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' models; fetching both keeps the notebook runnable on either.
download('punkt_tab')

In [None]:
# File names (not full paths) of the documents in each corpus directory.
# - sorted: directory listing order is arbitrary, and it determines the row
#   order of the generated output files
# - is_file(): skips sub-directories (e.g. .ipynb_checkpoints) that would make
#   read_text() fail in the cleaning cells
cc_files = sorted(entry.name for entry in Path('data/cc/').iterdir() if entry.is_file())
uc_files = sorted(entry.name for entry in Path('data/uc/').iterdir() if entry.is_file())
d = Dict('it')   # enchant dictionary used to decide whether a token is Italian
ps = PorterStemmer()
ss_eng = EnglishStemmer() # Snowball
ss_ita = ItalianStemmer() # Snowball

In [None]:
'''
- clean contents of all cc files then save in dictionary for easy concatenation
- using the cc filenames as dict keys
'''

cc_dict_ps = {} # filename -> Porter stemmed text
cc_dict_ss = {} # filename -> Snowball stemmed text

# stopwords.words() with no argument returns the stop words of every language
# in the corpus and re-reads them on each call, so build the lookup once
# (as a set) instead of once per token.
cc_stop_words = set(stopwords.words())

for cc in cc_files:
    # convert contents of cc file to a string
    cc_text = Path('data/cc/'+cc).read_text(encoding='latin-1')

    # tokenize, then drop stop words and tokens that are punctuation or
    # purely numeric
    cc_data = [
        word for word in word_tokenize(cc_text)
        if word not in cc_stop_words and word.isalnum() and not word.isnumeric()
    ]

    # Snowball: pick the stemmer per token; d.check() is True when the token
    # is found in the Italian dictionary, otherwise fall back to English
    cc_data_ss = [
        (ss_ita if d.check(token) else ss_eng).stem(token)
        for token in cc_data
    ]
    # Porter: the same stemmer is applied to every token regardless of language
    cc_data_ps = [ps.stem(token) for token in cc_data]

    # convert cleaned data list to string and add to dict using filename as key
    cc_dict_ps[cc] = ' '.join(cc_data_ps)
    cc_dict_ss[cc] = ' '.join(cc_data_ss)

In [None]:
'''
- clean contents of all uc files then save in dictionary for easy concatenation
- using the uc filenames as dict keys
'''

uc_dict_ps = {} # filename -> Porter stemmed text
uc_dict_ss = {} # filename -> Snowball stemmed text

# stopwords.words() with no argument returns the stop words of every language
# in the corpus and re-reads them on each call, so build the lookup once
# (as a set) instead of once per token.
uc_stop_words = set(stopwords.words())

for uc in uc_files:
    # convert contents of uc file to a string
    uc_text = Path('data/uc/'+uc).read_text(encoding='latin-1')

    # tokenize, then drop stop words and tokens that are punctuation or
    # purely numeric
    uc_data = [
        word for word in word_tokenize(uc_text)
        if word not in uc_stop_words and word.isalnum() and not word.isnumeric()
    ]

    # Snowball: pick the stemmer per token; d.check() is True when the token
    # is found in the Italian dictionary, otherwise fall back to English
    uc_data_ss = [
        (ss_ita if d.check(token) else ss_eng).stem(token)
        for token in uc_data
    ]
    # Porter: the same stemmer is applied to every token regardless of language
    uc_data_ps = [ps.stem(token) for token in uc_data]

    # convert cleaned data list to string and add to dict using filename as key
    uc_dict_ps[uc] = ' '.join(uc_data_ps)
    uc_dict_ss[uc] = ' '.join(uc_data_ss)

In [None]:
# Build the labeled data set from the oracle file.
# For every oracle row, each cc file named in that row is paired with every
# uc file; the pair is labeled 1 when the uc file is also named in the row
# and 0 otherwise.
# NOTE(review): a cc file that appears in several oracle rows is paired with
# all uc files once per row - confirm that matches the oracle format.

labeled_list_ps = [] # (Porter stemmed cc text + uc text, label)
labeled_list_ss = [] # (Snowball stemmed cc text + uc text, label)

with open('data/smos_oracle.txt', newline='') as oraclefile:
    for raw_row in csv.reader(oraclefile, delimiter=','):
        # values after a comma carry leading whitespace; strip it before matching
        row = [value.lstrip() for value in raw_row]

        # the *_ps and *_ss dicts share their key sets, so iterating the
        # Porter dicts covers both
        for cc_key, cc_text_ps in cc_dict_ps.items():
            # skip cc files (name w/o extension) not mentioned in this row
            if cc_key.replace('.txt', '') not in row:
                continue

            for uc_key, uc_text_ps in uc_dict_ps.items():
                # 1 when the uc file (name w/o extension) is linked in this row
                label = 1 if uc_key.replace('.txt', '') in row else 0

                # joined cc + uc text with its label, one tuple per pair
                labeled_list_ps.append((cc_text_ps + ' ' + uc_text_ps, label))
                labeled_list_ss.append((cc_dict_ss[cc_key] + ' ' + uc_dict_ss[uc_key], label))

In [None]:
# Write the Porter stemmed pairs as CSV: one row per pair, text then label
with open('data/all_links_porter.txt', 'w', newline='') as outfile:
    porter_writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    porter_writer.writerows([text, label] for text, label in labeled_list_ps)

In [None]:
# Write the Snowball stemmed pairs as CSV: one row per pair, text then label
with open('data/all_links_snowball.txt', 'w', newline='') as outfile:
    snowball_writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    snowball_writer.writerows([text, label] for text, label in labeled_list_ss)