In [1]:
import pandas as pd

# Load the local file (much faster than downloading via script)
# If you downloaded 'train.parquet', use this:
df = pd.read_parquet('train.parquet')

# View the first 5 rows
print(df.head())

# To make it look like a CSV you're used to:
text_sample = df.iloc[0]['text']
entities_sample = df.iloc[0]['entities']

print(f"\n--- Clinical Text ---\n{text_sample}")
print(f"\n--- Annotated Entities ---\n{entities_sample}")

      docid                                              words  \
0   6803733  [Insuffisance, gonadotrope, associée, à, l', h...   
1    284306  [Hygiène, dentaire, et, éducation, sanitaire, ...   
2   2147750  [Quels, sont, les, vrais, agents, cancérogènes...   
3   3592496  [Typologie, des, accidents, liés, à, l', éduca...   
4  17028560  [La, maladie, d', Alzheimer, et, la, mémoire, ...   

                                            ner_tags  
0               [1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 3, 0]  
1         [1, 1, 0, 2, 0, 0, 0, 4, 0, 0, 8, 0, 8, 0]  
2                              [0, 0, 0, 0, 5, 5, 0]  
3  [0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...  
4                        [1, 1, 1, 1, 0, 0, 6, 4, 0]  


KeyError: 'text'

In [2]:
import pandas as pd

# Define the mapping for visualization
id2label = {0: "O", 1: "DISO", 2: "PROC", 3: "ANAT", 4: "LIVB", 5: "CHEM", 6: "PHYS", 8: "GEOG"}

# 1. Reconstruct the 'text' column
df['text'] = df['words'].apply(lambda x: " ".join(x))

# 2. Create a human-readable tag list
df['labels'] = df['ner_tags'].apply(lambda tags: [id2label.get(t, "OTHER") for t in tags])

# 3. View a specific example correctly
row = df.iloc[4] # The Alzheimer example in your screenshot
print(f"TEXT: {row['text']}")
print("\nWORD-BY-WORD LABELS:")
for word, label in zip(row['words'], row['labels']):
    if label != "O": # Only print the important medical terms
        print(f"[{word}] -> {label}")

TEXT: La maladie d' Alzheimer et la mémoire humaine .

WORD-BY-WORD LABELS:
[La] -> DISO
[maladie] -> DISO
[d'] -> DISO
[Alzheimer] -> DISO
[mémoire] -> PHYS
[humaine] -> LIVB


In [7]:
df.head(2650)

Unnamed: 0,docid,words,ner_tags,text,labels
0,6803733,"[Insuffisance, gonadotrope, associée, à, l', h...","[1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 3, 0]",Insuffisance gonadotrope associée à l' hypopla...,"[DISO, DISO, O, O, O, DISO, DISO, DISO, O, O, ..."
1,284306,"[Hygiène, dentaire, et, éducation, sanitaire, ...","[1, 1, 0, 2, 0, 0, 0, 4, 0, 0, 8, 0, 8, 0]",Hygiène dentaire et éducation sanitaire chez l...,"[DISO, DISO, O, PROC, O, O, O, LIVB, O, O, GEO..."
2,2147750,"[Quels, sont, les, vrais, agents, cancérogènes...","[0, 0, 0, 0, 5, 5, 0]",Quels sont les vrais agents cancérogènes ?,"[O, O, O, O, CHEM, CHEM, O]"
3,3592496,"[Typologie, des, accidents, liés, à, l', éduca...","[0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, ...",Typologie des accidents liés à l' éducation ph...,"[O, O, DISO, O, O, O, PROC, PROC, O, O, O, O, ..."
4,17028560,"[La, maladie, d', Alzheimer, et, la, mémoire, ...","[1, 1, 1, 1, 0, 0, 6, 4, 0]",La maladie d' Alzheimer et la mémoire humaine .,"[DISO, DISO, DISO, DISO, O, O, PHYS, LIVB, O]"
...,...,...,...,...,...
2645,118_2_22,"[La, probabilité, d, ’, interactions, métaboli...","[0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0, 6, 0, 0, 0, ...",La probabilité d ’ interactions métaboliques e...,"[O, O, O, O, PHYS, PHYS, O, O, O, O, O, PHYS, ..."
2646,118_2_23,"[L, ’, administration, de, triméthoprime, (, 1...","[0, 0, 2, 0, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, ...",L ’ administration de triméthoprime ( 160 mg )...,"[O, O, PROC, O, CHEM, O, O, O, O, O, O, CHEM, ..."
2647,118_2_24,"[Il, n, ’, est, cependant, pas, nécessaire, d,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, ...",Il n ’ est cependant pas nécessaire d ’ adapte...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, CHE..."
2648,118_2_25,"[2, ), .]","[0, 0, 0]",2 ) .,"[O, O, O]"
