In [None]:
import csv
import os

from pathlib import Path

from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used below: the stop word lists and the tokenizer models.
download('stopwords')
# 'punkt' holds the pickled tokenizer models used by older NLTK releases.
download('punkt')
# Recent NLTK releases load 'punkt_tab' in word_tokenize instead of 'punkt';
# without it a fresh run fails with LookupError at the first tokenize call.
download('punkt_tab')

In [None]:
# os.listdir returns entries in arbitrary order; sorting keeps the order in which
# files are processed (and therefore the order of the generated rows) reproducible.
cc_files = sorted(os.listdir('data/cc/'))
uc_files = sorted(os.listdir('data/uc/'))

In [None]:
# Clean the contents of all cc files and save them in a dictionary for easy
# concatenation, using the cc filenames as dict keys.

# stopwords.words() builds a fresh list on every call, so calling it once per token
# (as a list membership test) is extremely slow. Build the lookup set a single time.
# No language is passed, so the set covers every language in the corpus, as before.
cc_stop_words = set(stopwords.words())

cc_dict = {}

for cc in cc_files:
    # read the whole cc file as one string
    cc_data = Path('data/cc/' + cc).read_text(encoding='latin-1')
    # tokenize file contents
    cc_data = word_tokenize(cc_data)
    # drop stop words, then drop punctuation (any token that is not purely alphanumeric)
    cc_data = [
        word for word in cc_data
        if word not in cc_stop_words and word.isalnum()
    ]

    # convert the cleaned token list back to a string and store it under the filename
    cc_dict[cc] = ' '.join(cc_data)

In [None]:
# Clean the contents of all uc files and save them in a dictionary for easy
# concatenation, using the uc filenames as dict keys.

# stopwords.words() builds a fresh list on every call, so calling it once per token
# (as a list membership test) is extremely slow. Build the lookup set a single time.
# No language is passed, so the set covers every language in the corpus, as before.
uc_stop_words = set(stopwords.words())

uc_dict = {}

for uc in uc_files:
    # read the whole uc file as one string
    uc_data = Path('data/uc/' + uc).read_text(encoding='latin-1')
    # tokenize file contents
    uc_data = word_tokenize(uc_data)
    # drop stop words, then drop punctuation (any token that is not purely alphanumeric)
    uc_data = [
        word for word in uc_data
        if word not in uc_stop_words and word.isalnum()
    ]

    # convert the cleaned token list back to a string and store it under the filename
    uc_dict[uc] = ' '.join(uc_data)

In [None]:
# Build the labeled (text, label) pairs from smos_oracle.
# For every oracle row, each cc file named in that row is paired with every uc file;
# the pair gets label 1 when the uc file is also named in the row, otherwise 0.

labeled_list = []

# filename -> filename with '.txt' removed, the form that is looked up in oracle rows
cc_names = {key: key.replace('.txt', '') for key in cc_dict}
uc_names = {key: key.replace('.txt', '') for key in uc_dict}

with open('data/smos_oracle.txt', newline='') as oraclefile:
    oracle_reader = csv.reader(oraclefile, delimiter=',')

    for raw_row in oracle_reader:
        # strip leading whitespace from each comma separated value
        row = [value.lstrip() for value in raw_row]

        for cc_key, cc_name in cc_names.items():
            # only cc files mentioned in this row produce pairs
            if cc_name not in row:
                continue

            for uc_key, uc_name in uc_names.items():
                label = 1 if uc_name in row else 0
                # joined cc + uc text with its label, kept as a tuple for stemming later
                joined_text = cc_dict[cc_key] + ' ' + uc_dict[uc_key]
                labeled_list.append((joined_text, label))

In [None]:
# Write every labeled link to a CSV file with its text Porter-stemmed word by word.
with open('data/all_links_porter.txt', 'w', newline='') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    ps = PorterStemmer()

    for text, label in labeled_list:
        # stem each whitespace-separated word and rebuild the text
        stemmed_text = ' '.join(map(ps.stem, text.split()))
        writer.writerow([stemmed_text, label])

In [None]:
# Write every labeled link to a CSV file with its text stemmed by the English Snowball stemmer.
with open('data/all_links_snowball.txt', 'w', newline='') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    ss = SnowballStemmer('english')

    # one output row per labeled link: [stemmed text, label]
    writer.writerows(
        [' '.join(ss.stem(word) for word in text.split()), label]
        for text, label in labeled_list
    )