In [2]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
%matplotlib inline

In [7]:
# Define the path to the file
metadata_file = 'data/primary_data/merlin-metadata-v1.2/metadata_ratings_indicators.csv'

# Try loading as CSV (comma-separated)
try:
    df_meta = pd.read_csv(metadata_file)
except:
    # If the file fails to load with the default comma delimiter, try a different one (like tab or semicolon)
    # The MERLIN data often uses TSV (tab-separated)
    try:
        df_meta = pd.read_csv(metadata_file, sep='\t')
        print("Successfully loaded metadata using tab separator.")
    except Exception as e:
        print(f"Could not load file. Check the delimiter in the README: {e}")
        # Stop and advise checking the README
        exit()

# Display the first few rows and column info to confirm successful loading
print("\nFirst 5 rows of the metadata:")
print(df_meta.head())
print("\nColumns and Data Types:")
print(df_meta.info())


First 5 rows of the metadata:
  _author_id _author _author_L1   _author_age _author_gender  \
0       0601    0601     German  not reported         female   
1       0602    0602     German  not reported           male   
2       0603    0603     German  not reported           male   
3       0604    0604     German  not reported           male   
4       0605    0605     German  not reported         female   

  _rating_coherence _rating_coherence2 _rating_fair_cefr  \
0                B1                NaN               A2+   
1                B1                NaN               A2+   
2                B1                NaN                B1   
3                B1                NaN                B1   
4                B1                NaN                B1   

  _rating_fair_cefr_rough _rating_general_linguistic_range  ...  \
0                      A2                               B1  ...   
1                      A2                               B1  ...   
2                     

  df_meta = pd.read_csv(metadata_file)


## Loaded data and analysed their informations, structure, dimensions

In [8]:
# Assuming the full DataFrame is named df_meta

# 1. Filter out native German speakers (who are in the first 5 rows)
df_l2_learners = df_meta[df_meta['_author_L1'] != 'German'].copy()

# 2. Convert CEFR to a categorical type for proper sorting (e.g., A1, A2, B1...)
cefr_order = ['A1', 'A2', 'A2+', 'B1', 'B1+', 'B2', 'C1', 'C2']
df_l2_learners['_rating_fair_cefr'] = pd.Categorical(
    df_l2_learners['_rating_fair_cefr'], categories=cefr_order, ordered=True
)

print(f"Total L2 German Learner Texts: {len(df_l2_learners)}")
print(f"Top 5 L1s: \n{df_l2_learners['_author_L1'].value_counts().head()}")

Total L2 German Learner Texts: 1700
Top 5 L1s: 
_author_L1
not reported    284
Russian         254
Polish          237
Other           213
Hungarian       178
Name: count, dtype: int64


In [9]:
# Distribution of proficiency levels
cefr_distribution = df_l2_learners['_rating_fair_cefr'].value_counts(normalize=True).sort_index()
print("\nCEFR Level Distribution (L2 Texts Only):")
print(cefr_distribution)


CEFR Level Distribution (L2 Texts Only):
_rating_fair_cefr
A1     0.044665
A2     0.225806
A2+    0.112283
B1     0.275434
B1+    0.133375
B2     0.178040
C1     0.028536
C2     0.001861
Name: proportion, dtype: float64


In [10]:
# Count texts for Alex's (English) and Jia's (Chinese) L1s
l1_counts = df_l2_learners['_author_L1'].value_counts()
print("\nTarget L1 Counts:")
print(f"English L1 (Alex): {l1_counts.get('English', 0)} texts")
print(f"Chinese L1 (Jia): {l1_counts.get('Chinese', 0)} texts")


Target L1 Counts:
English L1 (Alex): 69 texts
Chinese L1 (Jia): 10 texts


## The number of texts to support my 2nd persona (Jia) is definitely insufficient 

In [11]:
# Assuming df_l2_learners is the DataFrame filtered to exclude native German speakers
# We need to see the distribution of proficiency levels.

# 1. Filter out native German speakers (who were in the first 5 rows)
df_l2_learners = df_meta[df_meta['_author_L1'] != 'German'].copy()

# 2. Convert CEFR to a categorical type for proper sorting (e.g., A1, A2, B1...)
cefr_order = ['A1', 'A2', 'A2+', 'B1', 'B1+', 'B2', 'C1', 'C2']
df_l2_learners['_rating_fair_cefr'] = pd.Categorical(
    df_l2_learners['_rating_fair_cefr'], categories=cefr_order, ordered=True
)

# Distribution of proficiency levels
cefr_distribution = df_l2_learners['_rating_fair_cefr'].value_counts(normalize=False).sort_index()

print("\nCEFR Level Distribution (L2 Texts Only):")
print(cefr_distribution)


CEFR Level Distribution (L2 Texts Only):
_rating_fair_cefr
A1      72
A2     364
A2+    181
B1     444
B1+    215
B2     287
C1      46
C2       3
Name: count, dtype: int64


In [12]:
# Extracting columns that relate to linguistic complexity and proficiency
key_columns = ['_author_L1', '_author_gender', '_rating_fair_cefr', '_task_id']

# We need to find columns that quantify error rates.
# Look for columns containing "error" or "incorrect" if they exist in the metadata.
error_rate_columns = [col for col in df_l2_learners.columns if 'error' in col.lower() or 'incorrect' in col.lower()]

# We'll also select a few representative linguistic metrics (the NaN values suggest these columns need cleaning)
linguistic_metrics = [
    'lex_lexicalDensity', # How rich the vocabulary is
    'syn_sentenceComplexityRatio', # How complex the sentences are
    'syn_passiveVoiceToSentenceRatio' # Use of passive voice (more advanced structure)
]

print("\nIdentified Error Rate Columns in Metadata (If any):")
print(error_rate_columns)


Identified Error Rate Columns in Metadata (If any):
['ind_errorfree_sentence', 'ind_errorfree_sentence_C_Con_accur', 'ind_errorfree_sentence_G', 'ind_errorfree_sentence_G_Art', 'ind_errorfree_sentence_G_Clit', 'ind_errorfree_sentence_G_Conj', 'ind_errorfree_sentence_G_Inflect_Inexist', 'ind_errorfree_sentence_G_Morphol_Wrong', 'ind_errorfree_sentence_G_Neg', 'ind_errorfree_sentence_G_Prep', 'ind_errorfree_sentence_G_Refl', 'ind_errorfree_sentence_G_Valency', 'ind_errorfree_sentence_G_Verb', 'ind_errorfree_sentence_G_Verb_compl', 'ind_errorfree_sentence_G_Verb_main', 'ind_errorfree_sentence_G_Wo', 'ind_errorfree_sentence_O', 'ind_errorfree_sentence_O_Apostr', 'ind_errorfree_sentence_O_Graph', 'ind_errorfree_sentence_O_Punct', 'ind_errorfree_sentence_O_Wordbd', 'ind_errorfree_sentence_S_Form', 'ind_errorfree_sentence_S_Var', 'ind_errorfree_sentence_V', 'ind_errorfree_sentence_V_Wordform', 'ind_errorfree_sentence_V_form_word_fs_nonexist', 'ind_errorfree_sentence_V_semconn_at_word_fs', 'i

In [13]:
# Assuming df_l2_learners is your DataFrame filtered for L2 texts with the CEFR column set as categorical.

error_cols = [
    '_rating_fair_cefr', # Key for grouping
    'ind_errorfree_sentence_G',
    'ind_errorfree_sentence_G_Art',
    'ind_errorfree_sentence_G_Wo',
    'ind_errorfree_sentence_G_Verb',
    'ind_errorfree_sentence_G_Prep'
]

# 1. Select relevant columns and group by CEFR level, calculating the mean error-free rate
error_free_rates = df_l2_learners[error_cols].groupby('_rating_fair_cefr').mean()

# 2. Convert error-free rate (0 to 1) to error rate (0 to 1) for easier interpretation
# A higher number means a higher error rate.
average_error_rates = 1 - error_free_rates

# 3. Round and display the results
print("\nAverage Error Rate (1 - Error Free Rate) by CEFR Level:")
print(average_error_rates.round(3))


Average Error Rate (1 - Error Free Rate) by CEFR Level:
                   ind_errorfree_sentence_G  ind_errorfree_sentence_G_Art  \
_rating_fair_cefr                                                           
A1                                    0.239                         0.148   
A2                                    0.152                         0.081   
A2+                                   0.104                         0.053   
B1                                    0.072                         0.047   
B1+                                   0.052                         0.028   
B2                                    0.058                         0.034   
C1                                    0.021                         0.010   
C2                                    0.000                         0.000   

                   ind_errorfree_sentence_G_Wo  ind_errorfree_sentence_G_Verb  \
_rating_fair_cefr                                                               
A1        

  error_free_rates = df_l2_learners[error_cols].groupby('_rating_fair_cefr').mean()
