## Assignment 1

In [1]:
#importing libraries
import os
import spacy
import pandas as pd
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Setup
Loading the language model and defining functions. <br>  
Specifically, this section includes two functions: the first removes any markup enclosed in '<>' from the text, and the second processes the text according to the assignment requirements.


In [2]:
#loading the spaCy pipeline once at the top, so that process_text can reuse the same model for every file
nlp = spacy.load("en_core_web_md")

In [3]:
def cleaning_text(text):
    """Return `text` with every '<...>' span removed.

    A span runs from a '<' to the nearest following '>' (both included).
    Because '.' does not match a newline here, a span is only removed
    when it opens and closes on the same line.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

In [4]:
#following logic from cds-viz class of making a function that processes a single file at a time,
#instead of loading everything in at once
def process_text(file_path):
    """Extract part-of-speech frequencies and unique entity counts from one text file.

    The file is read as 'latin1', '<...>' markup is stripped with `cleaning_text`,
    and the remaining text is run through the loaded spaCy pipeline `nlp`.

    Parameters
    ----------
    file_path : str
        Path of the text file to process.

    Returns
    -------
    dict
        'RelFreq NOUN', 'RelFreq VERB', 'RelFreq ADJ', 'RelFreq ADV':
            number of tokens with that part-of-speech tag per 10,000 alphabetic
            tokens; 0.0 when the text contains no alphabetic tokens.
        'Unique PER', 'Unique LOC', 'Unique ORG':
            number of distinct entity texts labelled PERSON, GPE and ORG.
    """
    #opening and reading files using 'latin1' as instructed in class
    with open(file_path, 'r', encoding='latin1') as f:
        text = cleaning_text(f.read())

    #creating a doc object
    doc = nlp(text)

    #dictionaries to keep track/counting
    pos_counts = {'NOUN': 0, 'VERB': 0, 'ADJ': 0, 'ADV': 0}
    unique_entities = {'PERSON': set(), 'GPE': set(), 'ORG': set()}

    #counting only the tokens whose part-of-speech tag is one of the four tracked tags
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1
    #collecting entity texts in sets, so repeated mentions are only counted once
    for ent in doc.ents:
        if ent.label_ in unique_entities:
            unique_entities[ent.label_].add(ent.text)

    #calculating total number of tokens/words (only alphabetic tokens counted)
    total_words = sum(1 for token in doc if token.is_alpha)

    #calculating relative frequencies per 10,000 words; an empty (or markup-only) file has
    #no words, so the frequency is reported as 0.0 instead of dividing by zero
    relative_freq = {
        pos: ((count / total_words) * 10000) if total_words else 0.0
        for pos, count in pos_counts.items()
    }

    #arranging data for saving
    data = {
        'RelFreq NOUN': relative_freq['NOUN'],
        'RelFreq VERB': relative_freq['VERB'],
        'RelFreq ADJ': relative_freq['ADJ'],
        'RelFreq ADV': relative_freq['ADV'],
        'Unique PER': len(unique_entities['PERSON']),
        'Unique LOC': len(unique_entities['GPE']),
        'Unique ORG': len(unique_entities['ORG']),
    }
    return data

### Running the script
Defining the different data paths. <br>  
A for loop applies the functions above to every file, extracts the needed information, and writes one CSV file per subfolder.

In [5]:
#getting the current working directory (shown as the cell output) to check where the notebook is running from
os.getcwd()

'/work/EmilieMunchAndreasen#4014/cds-la-assignments/Assignment 1/src'

In [6]:
#setting the data_path and output_path relative to the working directory instead of hard-coding
#an absolute, user-specific path; the notebook is run from the 'src' folder, so the project
#root (which holds 'data' and 'out') is its parent folder
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, "data", "USEcorpus")
output_path = os.path.join(project_root, "out")
#sorted, so the subfolders are always processed in the same order
dirs = sorted(os.listdir(data_path))

In [7]:
#making sure the output folder exists, otherwise writing the first csv fails
os.makedirs(output_path, exist_ok=True)

#for loop for iterating over all files in every subfolder
for directory in dirs:
    subfolder = os.path.join(data_path, directory) #constructing paths
    #skipping anything in the corpus folder that is not a subfolder (e.g. hidden system files),
    #since listing its contents would raise an error
    if not os.path.isdir(subfolder):
        continue
    filenames = sorted(os.listdir(subfolder))

    #list to store results in (one dictionary per file)
    results = []
    #loops over each file
    for text_file in filenames:
        file_path = os.path.join(subfolder, text_file)
        #only regular files can be opened and processed
        if not os.path.isfile(file_path):
            continue
        file_data = process_text(file_path)
        results.append({'Filename': text_file, **file_data}) #appends results with filename
    #creating pandas df from all collected rows at once
    df = pd.DataFrame(results)
    #one csv per subfolder, saved without row indices
    df.to_csv(os.path.join(output_path, f"{directory}_linguistic_features.csv"), index=False)