# Assignment 1 - Analyzing a Swedish high school English corpus across different grades

In [3]:
# Loading necessary packages
import pandas as pd
import spacy
import os
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Download the medium-sized English spaCy pipeline (en_core_web_md); this is the
# model name that is later passed to spacy.load().
# NOTE(review): `!python` runs whichever interpreter is first on the shell PATH, which
# may differ from the kernel's interpreter - confirm the model is installed where the kernel can import it.
!python -m spacy download en_core_web_md 

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [7]:
# Input and output locations, relative to the notebook's working directory
main_folder = "../in/USEcorpus"
output_folder = "../out/"

# Names of the entries found inside the USEcorpus folder, in sorted order
subfolders = os.listdir(main_folder)
subfolders.sort()

# spaCy pipeline (medium-sized English model) used for the NLP analysis
nlp = spacy.load("en_core_web_md")

In [8]:
# Pre-processing function
def preprocess_text(text):
    r"""
    Clean-up function that lowercases the string and removes punctuation and the characters in between <>.

    Line breaks (\n), tabs (\t) and any other run of whitespace are collapsed into a
    single space instead of being deleted, so that the last word of one line is not
    glued to the first word of the next one. Leading and trailing whitespace is dropped.

    Args:
    - text (string): A string containing a text file

    Returns:
    - A text (string): Returns the same string but pre-processed
    """
    # punctuation characters to delete; a raw string keeps the backslash a literal
    # character instead of starting an (invalid) "\," escape sequence
    punc = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    text = text.lower()  # lowercasing the text
    text = re.sub(r"<[^>]*>", "", text)  # remove metadata text contained within "<>"

    # delete every punctuation character in a single pass over the text
    text = text.translate(str.maketrans("", "", punc))

    # split on any whitespace (\n, \t, repeated spaces) and re-join with single spaces
    text = " ".join(text.split())
    return text

In [9]:
def _relative_frequency(count, total):
    """Return `count` per 10,000 `total` items, rounded to one decimal (0.0 when `total` is 0)."""
    if total == 0:
        # an empty document has no tokens to divide by; report 0.0 instead of raising ZeroDivisionError
        return 0.0
    return round(count / total * 10000, 1)


def calculate_nlp_metrics(texts):
    """
    Calculate NLP metrics for a list of texts.

    Every text is run through the module-level spaCy pipeline `nlp`. For each text the
    result holds the relative frequency (per 10,000 tokens) of nouns, verbs, adjectives
    and adverbs, and the number of unique named entities labelled PERSON, LOC and ORG.
    A text that produces no tokens gets 0.0 for all relative frequencies.

    Args:
    - texts (list): A list of strings representing text documents.

    Returns:
    - DataFrame: A DataFrame containing the calculated metrics, one row per text in
      input order. The "Filename" column is a positional label ("file1.txt",
      "file2.txt", ...), not the name of the file the text was read from.
    """
    nlp_metrics = []  # one row of metrics per text

    for i, text in enumerate(texts, 1):
        # convert the individual text into a doc (using the medium-sized model)
        doc = nlp(text)

        # len(doc) is the number of tokens spaCy produced for the text (used as the denominator)
        num_tokens = len(doc)
        # dictionary mapping each part-of-speech ID to how many tokens carry it
        pos_counts = doc.count_by(spacy.attrs.POS)

        rel_freq_noun = _relative_frequency(pos_counts.get(spacy.parts_of_speech.NOUN, 0), num_tokens)
        rel_freq_verb = _relative_frequency(pos_counts.get(spacy.parts_of_speech.VERB, 0), num_tokens)
        rel_freq_adj = _relative_frequency(pos_counts.get(spacy.parts_of_speech.ADJ, 0), num_tokens)
        rel_freq_adv = _relative_frequency(pos_counts.get(spacy.parts_of_speech.ADV, 0), num_tokens)

        # the (text, label) pairs of the recognized entities; the set keeps each pair only once
        unique_entities = {(ent.text, ent.label_) for ent in doc.ents}

        # counting the number of unique entities in regards to persons, locations and organizations
        # NOTE(review): spaCy's English models usually label countries and cities as "GPE",
        # so "LOC" alone may undercount locations - confirm this is the intended label
        unique_per = sum(1 for _, label in unique_entities if label == "PERSON")
        unique_loc = sum(1 for _, label in unique_entities if label == "LOC")
        unique_org = sum(1 for _, label in unique_entities if label == "ORG")

        # append the metrics to nlp_metrics
        nlp_metrics.append([f"file{i}.txt", rel_freq_noun, rel_freq_verb, rel_freq_adj, rel_freq_adv, unique_per, unique_loc, unique_org])

    # Convert nlp_metrics to DataFrame
    columns = ["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV", "Unique PER", "Unique LOC", "Unique ORG"]
    return pd.DataFrame(nlp_metrics, columns=columns)

In [10]:
# dictionary mapping each subfolder name to the list of its pre-processed texts (one string per file)
subfolder_texts = {}

for subfolder in subfolders:
    # getting the path of the specific subfolder
    subfolder_path = os.path.join(main_folder, subfolder)

    # skip hidden entries and stray files lying next to the subfolders (e.g. .DS_Store,
    # .ipynb_checkpoints); os.listdir() below would fail on anything that is not a folder
    if subfolder.startswith(".") or not os.path.isdir(subfolder_path):
        continue

    print(f"Loading and pre-processing text files in {subfolder}...")
    # making a list to store the text from the files in the subfolder
    texts = []

    # read the files of the subfolder in sorted order, so the position in `texts` is reproducible
    for filename in sorted(os.listdir(subfolder_path)):
        file_path = os.path.join(subfolder_path, filename)

        # skip hidden entries and nested folders; only regular files can be opened and read
        if filename.startswith(".") or not os.path.isfile(file_path):
            continue

        # using ISO-8859-1 which ensures that all of the files can be loaded (some of them contain characters which otherwise cannot be loaded)
        with open(file_path, "r", encoding="ISO-8859-1") as text_file:
            # read the contents of the file and pre-process them before storing
            texts.append(preprocess_text(text_file.read()))

    # storing the texts from each folder in a dictionary
    subfolder_texts[subfolder] = texts

Processing text files in a1...
Processing text files in a2...
Processing text files in a3...
Processing text files in a4...
Processing text files in a5...
Processing text files in b1...
Processing text files in b2...
Processing text files in b3...
Processing text files in b4...
Processing text files in b5...
Processing text files in b6...
Processing text files in b7...
Processing text files in b8...
Processing text files in c1...


In [11]:
# dictionary mapping each subfolder name to the DataFrame with the metrics of its texts
subfolder_metrics = {}

# calculate the NLP metrics for one subfolder at a time and store the result
for subfolder, texts in subfolder_texts.items():
    print(f"Performing analysis for {subfolder}...")

    # calculate NLP metrics for the current subfolder's texts
    subfolder_df = calculate_nlp_metrics(texts)

    # store the dataframe in the dictionary
    subfolder_metrics[subfolder] = subfolder_df


# make sure the output folder exists; to_csv does not create missing directories
os.makedirs(output_folder, exist_ok=True)

# create one CSV file per subfolder in the output folder
for subfolder, df in subfolder_metrics.items():
    df.to_csv(os.path.join(output_folder, f"nlp_analysis_results_{subfolder}.csv"), index=False)

Performing analysis for a1...
Performing analysis for a2...
Performing analysis for a3...
Performing analysis for a4...
Performing analysis for a5...
Performing analysis for b1...
Performing analysis for b2...
Performing analysis for b3...
Performing analysis for b4...
Performing analysis for b5...
Performing analysis for b6...
Performing analysis for b7...
Performing analysis for b8...
Performing analysis for c1...
