<a href="https://colab.research.google.com/github/00alba00/Exercise3/blob/main/Exercise3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preprocessing of frequencies from corpus ELEXIS Spanish Web 2020

In [4]:
import pandas as pd
import re

# Load the CSV file into a DataFrame
df = pd.read_csv('wordlist.csv')

# Remove special characters from the 'item' column
df['Item'] = df['Item'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Calculate the total frequency
total_frequency = df['Frequency'].sum()

# Calculate the relative frequency
df['relative_frequency'] = df['Frequency'] / total_frequency

# Create a dictionary with item as key and relative frequency as value
item_relative_frequency = dict(zip(df['Item'], df['relative_frequency']))

# Print out the processed data
print("Processed data:")
print(df)

# Save the processed data to a variable
processed_data = df[['Item', 'relative_frequency']].set_index('Item').to_dict()['relative_frequency']

Processed data:
         Item  Frequency  relative_frequency
0          de   71143538            0.087946
1               57568049            0.071164
2          la   38911836            0.048102
3               38462549            0.047546
4         que   30267030            0.037415
..        ...        ...                 ...
995  paciente     104611            0.000129
996  animales     104494            0.000129
997    figura     104471            0.000129
998  completa     104403            0.000129
999  contrato     104387            0.000129

[1000 rows x 3 columns]


Analysis of model performance based on word frequency categories

In [2]:
import pandas as pd

# Step 1: Load processed word frequency data
# Convert the dictionary to a DataFrame
word_frequency_df = pd.DataFrame(processed_data.items(), columns=['Item', 'relative_frequency'])

# Step 2: Function to categorize word frequency
def categorize_frequency(relative_frequency):
    if relative_frequency >= 0.01:
        return 'High Frequency'
    elif relative_frequency >= 0.001:
        return 'Medium Frequency'
    else:
        return 'Low Frequency'

word_frequency_df['Frequency Level'] = word_frequency_df['relative_frequency'].apply(categorize_frequency)

# Step 3: Load model performance data
model_performance_df = pd.read_csv('EsCoLA.csv')  # Replace 'model_performance_data.csv' with your file name

# Step 4: Analyze model performance for each frequency category
result = {}
for freq_level in ['High Frequency', 'Medium Frequency', 'Low Frequency']:
    sentences_with_freq_level = model_performance_df[model_performance_df['Sentence'].str.contains('|'.join(word_frequency_df[word_frequency_df['Frequency Level'] == freq_level]['Item']))]
    total_sentences = len(sentences_with_freq_level)
    error_sentences = len(sentences_with_freq_level[sentences_with_freq_level['Label'] != sentences_with_freq_level['Human_annotation_median']])
    error_percentage = (error_sentences / total_sentences) * 100 if total_sentences != 0 else 0
    result[freq_level] = error_percentage

# Step 5: Output the results
output_df = pd.DataFrame(result.items(), columns=['Frequencies', 'Error percentage'])

# Step 6: Print the output
print("Output:")
print(output_df)

Output:
        Frequencies  Error percentage
0    High Frequency          8.301486
1  Medium Frequency          8.330700
2     Low Frequency          8.320185


In [3]:
import pandas as pd

# Step 1: Load processed word frequency data
word_frequency_df = pd.DataFrame(processed_data.items(), columns=['Item', 'relative_frequency'])

# Step 2: Calculate frequency levels based on relative frequency
def calculate_frequency_level(relative_frequency):
    if relative_frequency >= 0.01:
        return 'High frequency'
    elif relative_frequency >= 0.001:
        return 'Medium frequency'
    else:
        return 'Low frequency'

word_frequency_df['Frequency_level'] = word_frequency_df['relative_frequency'].apply(calculate_frequency_level)

# Step 3: Load model performance data
model_performance_df = pd.read_csv('EsCoLA.csv')  # Replace 'model_performance_data.csv' with your CSV file


# Step 4: Analyze relationship between word frequency and model's performance
def analyze_performance_by_frequency_level(sentence):
    # Calculate frequency level for each word in the sentence
    words = sentence.split()
    frequency_levels = []
    for word in words:
        # Check if the word is present in the frequency data
        if word in word_frequency_df['Item'].values:
            # Retrieve the frequency level for the word
            frequency_level = word_frequency_df.loc[word_frequency_df['Item'] == word, 'Frequency_level'].values[0]
            frequency_levels.append(frequency_level)
        else:
            # If the word is not found in the frequency data, consider it as low frequency
            frequency_levels.append('Low frequency')

    # Assign the minimum frequency level of all words in the sentence as the frequency level of the sentence
    sentence_frequency_level = min(frequency_levels, key=lambda x: ['Low frequency', 'Medium frequency', 'High frequency'].index(x))

    # Count the number of errors for the given frequency level
    errors = model_performance_df[(model_performance_df['Sentence'] == sentence) & (model_performance_df['Label'] != model_performance_df['Human_annotation_median'])]
    error_count = len(errors)

    return sentence_frequency_level, error_count

# Apply the function to each sentence and create a DataFrame
sentences = model_performance_df['Sentence'].unique()
frequency_error_data = [analyze_performance_by_frequency_level(sentence) for sentence in sentences]
frequency_error_df = pd.DataFrame(frequency_error_data, columns=['Frequencies', 'Error Number'])


# Step 5: Output the table
print(frequency_error_df)

        Frequencies  Error Number
0     Low frequency             0
1     Low frequency             0
2     Low frequency             1
3     Low frequency             1
4     Low frequency             0
...             ...           ...
9460  Low frequency             0
9461  Low frequency             0
9462  Low frequency             0
9463  Low frequency             0
9464  Low frequency             0

[9465 rows x 2 columns]
