In [5]:
import pandas as pd
import csv

file_path = r"C:\Users\jawri\Data course\Capstone\CORDIS data\Projects\project.csv"

project_df = pd.read_csv(
    file_path,
    sep=';',
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
    on_bad_lines='skip'  # skip problematic lines if any
)

# Show structure and head
print("✅ Loaded project.csv successfully")
print(project_df.info())
print(project_df.head())
print(project_df.describe())


✅ Loaded project.csv successfully
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35105 entries, 0 to 35104
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  35105 non-null  int64 
 1   acronym             35105 non-null  object
 2   status              35105 non-null  object
 3   title               35105 non-null  object
 4   startDate           35094 non-null  object
 5   endDate             35094 non-null  object
 6   totalCost           35105 non-null  object
 7   ecMaxContribution   35105 non-null  object
 8   legalBasis          35105 non-null  object
 9   topics              35105 non-null  object
 10  ecSignatureDate     35105 non-null  object
 11  frameworkProgramme  35105 non-null  object
 12  masterCall          35105 non-null  object
 13  subCall             35105 non-null  object
 14  fundingScheme       35104 non-null  object
 15  nature              593 non-null    

In [7]:
from IPython.display import display, HTML

# Force DataFrame to scroll horizontally in Jupyter
display(HTML(project_df.head().to_html(notebook=True)))

Unnamed: 0,id,acronym,status,title,startDate,endDate,totalCost,ecMaxContribution,legalBasis,topics,ecSignatureDate,frameworkProgramme,masterCall,subCall,fundingScheme,nature,objective,contentUpdateDate,rcn,grantDoi
0,672641,ADHESION,CLOSED,Development of new biotech AdvanceD materials ...,2015-05-01,2015-10-31,71429,50000,H2020-EU.2.1.4.,BIOTEC-5a-2014-1,2015-04-17,H2020,H2020-SMEInst-2014-2015,H2020-SMEINST-1-2014,SME-1,,The aim of the ADHESION Project is to strength...,2024-09-06 18:16:22,196463,10.3030/672641
1,750248,GlycoNanoPep,CLOSED,Glyco and Nano Peptide Conjugates for Selectiv...,2017-04-01,2019-03-31,1581216,1581216,H2020-EU.1.3.,MSCA-IF-2016,2017-02-27,H2020,H2020-MSCA-IF-2016,H2020-MSCA-IF-2016,MSCA-IF-EF-ST,,The next generation therapeutic agents such as...,2022-08-16 00:01:34,208681,10.3030/750248
2,855867,NANOFLOWSIZER,CLOSED,Developing a disruptive and revolutionary tool...,2019-03-01,2019-06-30,71429,50000,H2020-EU.2.3.,EIC-SMEInst-2018-2020,2019-02-05,H2020,H2020-EIC-SMEInst-2018-2020,H2020-SMEInst-2018-2020-1,SME-1,,Nanotechnology is rapidly growing in the pharm...,2024-09-06 17:42:42,221149,10.3030/855867
3,778157,CanBioSe,SIGNED,Novel 1D photonic metal oxide nanostructures f...,2018-01-01,2023-12-31,1116000,1116000,H2020-EU.1.3.,MSCA-RISE-2017,2017-11-22,H2020,H2020-MSCA-RISE-2017,H2020-MSCA-RISE-2017,MSCA-RISE,,The project CanBioSe targeted to strengthen in...,2025-02-19 12:25:10,213008,10.3030/778157
4,798830,MAGNET-CELLPATCH,CLOSED,Multimodal magnetic cellular-patches with syne...,2019-09-01,2021-08-31,1581216,1581216,H2020-EU.1.3.,MSCA-IF-2017,2018-03-19,H2020,H2020-MSCA-IF-2017,H2020-MSCA-IF-2017,MSCA-IF-EF-ST,,The key goals of the MAGNET-CELLPATCH project ...,2022-08-24 00:16:07,215681,10.3030/798830


In [11]:
for col in project_df.columns:
    print(f"\n📌 {col}")

    # Count missing
    missing = project_df[col].isna().sum()
    print(f"   Missing: {missing}")

    # Count unique values
    unique_count = project_df[col].nunique()
    print(f"   Unique: {unique_count}")

    # Show most frequent values (top 5)
    top_values = project_df[col].value_counts(dropna=False).head(5)
    print("   Most frequent values:")
    print(top_values)

    # Optional: show types if column is mixed (object type)
    if project_df[col].dtype == 'object':
        types = project_df[col].map(type).value_counts()
        if len(types) > 1:
            print("   ⚠️ Mixed types:", types.to_dict())



📌 id
   Missing: 0
   Unique: 35105
   Most frequent values:
id
672641       1
101005330    1
101006249    1
641342       1
786641       1
Name: count, dtype: int64

📌 acronym
   Missing: 0
   Unique: 32731
   Most frequent values:
acronym
SMILE      15
INSPIRE    15
IMPACT     14
EPIC       12
COSMOS     12
Name: count, dtype: int64

📌 status
   Missing: 0
   Unique: 3
   Most frequent values:
status
CLOSED        27594
SIGNED         6045
TERMINATED     1466
Name: count, dtype: int64

📌 title
   Missing: 0
   Unique: 34830
   Most frequent values:
title
Establishing services enhancing the innovation management capacity of SME's in the Enterprise Europe Network     6
Science in the City                                                                                              5
Illes Balears INNOvation Management Services                                                                     5
Establishing services enhancing the innovation management capacity of SME´s in Hamburg and S

In [19]:
project_df['ecMaxContribution'] = pd.to_numeric(project_df['ecMaxContribution'], errors='coerce')
project_df['totalCost'] = pd.to_numeric(project_df['totalCost'], errors='coerce')

# Describe funding columns with renamed index
summary = (
    project_df[['ecMaxContribution', 'totalCost']]
    .describe()
    .rename(index={
        'count': 'Non-null count',
        'mean': 'Mean',
        'std': 'Std Dev',
        'min': 'Min',
        '25%': '25th percentile',
        '50%': 'Median',
        '75%': '75th percentile',
        'max': 'Max'
    })
)

# Display summary
print(summary)

                 ecMaxContribution     totalCost
Non-null count        1.840800e+04  1.781100e+04
Mean                  1.686962e+06  1.741858e+06
Std Dev               6.165377e+06  4.978828e+06
Min                   3.150000e+03  0.000000e+00
25th percentile       1.000000e+05  7.142900e+04
Median                1.350688e+06  1.375869e+06
75th percentile       1.999973e+06  2.000000e+06
Max                   6.788000e+08  2.353209e+08


In [5]:
import pandas as pd
from IPython.display import display, HTML
import os
import csv

# Folder path
data_path = r"C:\Users\jawri\Data course\Capstone\CORDIS data\Projects"

# File list (excluding project.csv)
filenames = [
    "euroSciVoc.csv",
    "legalBasis.csv",
    "organization.csv",
    "projectDeliverables.csv",
    "projectIrps_h2020.csv",
    "projectPublications.csv",
    "reportSummaries.csv",
    "topics.csv",
    "webItem.csv",
    "webLink.csv"
]

# Display nice scrollable head for each
for file in filenames:
    print(f"\n🔹 {file}")
    try:
        df = pd.read_csv(os.path.join(data_path, file), sep=';', encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')
        display(HTML(df.head(50).to_html(notebook=True)))
    except Exception as e:
        print(f"❌ Failed to load {file}: {e}")



🔹 euroSciVoc.csv


Unnamed: 0,projectID,euroSciVocCode,euroSciVocPath,euroSciVocTitle,euroSciVocDescription
0,672641,/25/75/469,/engineering and technology/mechanical enginee...,manufacturing engineering,
1,672641,/21/39/223,/medical and health sciences/clinical medicine...,surgery,
2,672641,/23/53/365/72505310,/natural sciences/chemical sciences/inorganic ...,transition metals,
3,672641,/25/69/435,/engineering and technology/nanotechnology/nan...,nano-materials,
4,672641,/21/39/221,/medical and health sciences/clinical medicine...,cardiology,
5,816332,/25/67/425/1163,/engineering and technology/environmental engi...,liquid fuels,
6,816332,/25/67/425/1165/1681,/engineering and technology/environmental engi...,petroleum,
7,816332,/25/69/435,/engineering and technology/nanotechnology/nan...,nano-materials,
8,750248,/23/49/315/997/1613,/natural sciences/biological sciences/biochemi...,proteins,
9,750248,/23/49/313,/natural sciences/biological sciences/cell bio...,cell biology,



🔹 legalBasis.csv


Unnamed: 0,projectID,legalBasis,title,uniqueProgrammePart
0,672641,H2020-EU.2.1.4.,INDUSTRIAL LEADERSHIP - Leadership in enabling...,True
1,672641,H2020-EU.2.3.1.,"Mainstreaming SME support, especially through...",
2,816332,H2020-EU.3.,PRIORITY 'Societal challenges,
3,816332,H2020-EU.2.3.,INDUSTRIAL LEADERSHIP - Innovation In SMEs,True
4,816332,H2020-EU.2.1.,INDUSTRIAL LEADERSHIP - Leadership in enabling...,
5,750248,H2020-EU.1.3.,EXCELLENT SCIENCE - Marie Skłodowska-Curie Act...,True
6,750248,H2020-EU.1.3.2.,Nurturing excellence by means of cross-border ...,
7,855867,H2020-EU.3.,PRIORITY 'Societal challenges,
8,855867,H2020-EU.2.3.,INDUSTRIAL LEADERSHIP - Innovation In SMEs,True
9,855867,H2020-EU.2.1.,INDUSTRIAL LEADERSHIP - Leadership in enabling...,



🔹 organization.csv


Unnamed: 0,projectID,projectAcronym,organisationID,vatNumber,name,shortName,SME,activityType,street,postCode,city,country,nutsCode,geolocation,organizationURL,contactForm,contentUpdateDate,rcn,order,role,ecContribution,netEcContribution,totalCost,endOfParticipation,active
0,672641,ADHESION,969027576,IT01949700221,NOVAGENIT SRL,NOVAGENIT SRL,True,PRC,VIALE TRENTO 115/117,38017,MEZZOLOMBARDO,IT,ITH20,"46.203852,11.0976153",,https://ec.europa.eu/info/funding-tenders/oppo...,2024-09-06 18:16:22,1936887,1,coordinator,50000.0,50000.0,71429,False,
1,816332,OILCS,927067995,,FIRMA INSTYTUT ECOLOGII LIUDYNY-INEKO,FIRM INSTITUTE OF HUMAN ECOLOGY-INEKO,True,PRC,82 ALISHER NAVOI AVE APT 15 DNIPROVSKYI DISTRICT,02125,KYIV,UA,UA,"50.4501071,30.5240501",,https://ec.europa.eu/info/funding-tenders/oppo...,2022-08-12 11:39:44,2470604,1,coordinator,50000.0,50000.0,71429,False,
2,750248,GlycoNanoPep,999829635,ESQ1518001A,UNIVERSIDAD DE SANTIAGO DE COMPOSTELA,UNIVERSIDADE DE SANTIAGO DE COMPOSTELA,False,HES,COLEXIO DE SAN XEROME PRAZA DO OBRADOIRO S/N,15782,Santiago De Compostela,ES,ES111,"42.8804219,-8.5458607",http://www.usc.es,https://ec.europa.eu/info/funding-tenders/oppo...,2022-08-16 00:01:34,1907764,1,coordinator,158121.6,158121.6,1581216,False,
3,855867,NANOFLOWSIZER,918624727,NL854200484B01,INPROCESS-LSP BV,,True,PRC,KLOOSTERSTRAAT 9,5349 AB,Oss,NL,NL416,"51.8232382,5.56651",,https://ec.europa.eu/info/funding-tenders/oppo...,2024-09-06 17:42:42,1956205,1,coordinator,50000.0,50000.0,71429,False,
4,778157,CanBioSe,952695492,,NANOPHARMA AS,NANOPHARMA AS,True,PRC,NOVA 306,530 09,Pardubice,CZ,CZ053,"50.0511228,15.7591047",https://www.nanopharma.cz,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,2200531,8,participant,9000.0,9000.0,9000,False,
5,778157,CanBioSe,999886865,PL7770006350,UNIWERSYTET IM. ADAMA MICKIEWICZA WPOZNANIU,ADAM MICKIEWICZ UNIVERSITYAMU,False,HES,ULICA HENRYKA WIENIAWSKIEGO 1,61 712,Poznan,PL,PL415,"52.4078848,16.9159902",http://www.amu.edu.pl,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,1907016,4,participant,265500.0,265500.0,265500,False,
6,778157,CanBioSe,890330603,LT100013933613,MB SENSOGRAFA,MB SENSOGRAFA,True,PRC,KIPARISU 29,LT 14226,Vilnius,LT,LT011,"54.78142765,25.2643835",,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,1978647,14,participant,27000.0,27000.0,27000,False,
7,778157,CanBioSe,998587550,FR12193401122,ECOLE NATIONALE SUPERIEURE DE CHIMIE DE MONTPE...,ENSCM,False,HES,240 AV DU PROF EMILE JEANBRAU,34296,Montpellier Cedex 5,FR,FRJ13,"43.6112422,3.8767337",https://www.enscm.fr,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,1992226,3,participant,112500.0,112500.0,112500,False,
8,778157,CanBioSe,999893170,LT119508113,VILNIAUS UNIVERSITETAS,Vilniaus universitetas,False,HES,UNIVERSITETO G. 3,01513,Vilnius,LT,LT011,"54.682708,25.2866715",http://www.vu.lt,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,1906326,2,participant,193500.0,193500.0,193500,False,
9,778157,CanBioSe,999625450,IT02003000227,FONDAZIONE BRUNO KESSLER,FBK,False,REC,VIA SANTA CROCE 77,38122,Trento,IT,ITH20,"46.0627414,11.124081763669",http://www.fbk.eu,https://ec.europa.eu/info/funding-tenders/oppo...,2025-02-19 12:25:10,1909535,5,participant,31500.0,31500.0,31500,False,



🔹 projectDeliverables.csv


Unnamed: 0,id,title,deliverableType,description,projectID,projectAcronym,url,collection,contentUpdateDate,rcn
0,884229_20_DELIV,Socioeconomic evaluation and policy recommenda...,"Documents, reports",Report on the potential social and economic im...,884229,HYFLEXPOWER,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-22 16:42:03,1253499
1,101036480_30_DELIV,Dissemination and Communication Report,"Documents, reports",An assessment of the impact of the communicati...,101036480,GreenSCENT,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-14 15:59:25,1279507
2,899293_9_DELIV,Characterization report on WP2 samples,"Documents, reports",Several MOF single crystals for morphological ...,899293,SPARTE,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-14 15:59:20,1279502
3,818144_45_DELIV,Assessment of the value of genetic profiles an...,"Documents, reports",This relates to task 3.3 and a protocol will b...,818144,InnoVar,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-14 13:53:02,1279383
4,101006964_33_DELIV,Report on assessment and quantification of dri...,"Documents, reports",Report on further assessment and quantificatio...,101006964,OptiDrill,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-14 13:51:53,1279268
5,862221_49_DELIV,7.4: EUFGIS training workshop report,"Documents, reports",Workshop reports – these will include particip...,862221,FORGENIUS,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-23 16:24:21,1289123
6,861079_18_DELIV,Summary report on innovation in heat exchangers,"Documents, reports",Innovation in heat exchanger design for MGTs i...,861079,NextMGT,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-23 16:24:21,1289120
7,847552_32_DELIV,Report on the evaluation for nuclear structure...,"Documents, reports",Report on the evaluation for nuclear structure...,847552,SANDA,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-23 16:24:21,1289117
8,861079_16_DELIV,Report of cost optimisation methods,"Documents, reports",Requirements for optimum operation and cost,861079,NextMGT,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-23 16:24:19,1289108
9,847552_30_DELIV,Report on new nuclear reaction data evaluation,"Documents, reports",Report on new nuclear reaction data evaluation,847552,SANDA,https://ec.europa.eu/research/participants/doc...,Project deliverable,2025-04-23 16:24:19,1289105



🔹 projectIrps_h2020.csv


Unnamed: 0,"projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier"
0,"641518,PATENT,Method for the acquisition and p..."
1,"635734,PATENT,KETOREDUCTASES,986597283.0,C-LEC..."
2,"653203,PATENT,Mecanisme de pivotement de dossi..."
3,"653203,PATENT,Ouvrant en élytre vers l'arrière..."
4,"653203,PATENT,Système de fixation et de réglag..."
5,"653203,PATENT,Suspension Hydropneumatique sur ..."
6,"653203,PATENT,Gestion du roulis pour véhicules..."
7,"653203,PATENT,Montage de roue AR d'un bras osc..."
8,"653203,PATENT,Groupe hydraulique simplifié - ..."
9,"653203,PATENT,Contrôle semi-actif des véhicule..."



🔹 projectPublications.csv


  df = pd.read_csv(os.path.join(data_path, file), sep=';', encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')


Unnamed: 0,id,title,isPublishedAs,authors,journalTitle,journalNumber,publishedYear,publishedPages,issn,isbn,doi,projectID,projectAcronym,collection,contentUpdateDate,rcn
0,951475_2590757_PUBLI,A multiplex protein panel assay for severity p...,Peer reviewed articles,"Ziyue Wang, Adam Cryar, Oliver Lemke, Pinkus T...",eClinicalMedicine,49.0,2023.0,101495,2589-5370,,10.1016/j.eclinm.2022.101495,951475,FungalTolerance,Project publication,2025-04-22 19:52:40,1286170
1,825664_2581201_PUBLI,The global Alzheimer's Association round robin...,Peer reviewed articles,"Josef Pannee, Leslie M. Shaw, Magdalena Koreck...","Alzheimer's & Dementia: Diagnosis, Assessment ...",13.0,2023.0,,2352-8729,,10.1002/dad2.12242,825664,JPCOFUND2,Project publication,2025-04-22 19:49:42,1285553
2,825664_2581050_PUBLI,Head-to-Head Comparison of 8 Plasma Amyloid-β ...,Peer reviewed articles,"Shorena Janelidze, Charlotte E. Teunissen, Hen...",JAMA Neurology,78.0,2023.0,1375,2168-6149,,10.1001/jamaneurol.2021.3180,825664,JPCOFUND2,Project publication,2025-04-22 19:49:40,1285545
3,825664_2580833_PUBLI,Human Small Heat Shock Protein B8 Inhibits Pro...,Peer reviewed articles,"Dhawal Choudhary, Laura Mediani, Mario J. Avel...",Journal of the American Chemical Society,145.0,2023.0,15188-15196,0002-7863,,10.1021/jacs.3c02022,825664,JPCOFUND2,Project publication,2025-04-22 19:49:33,1285517
4,825546_2551530_PUBLI,Diet and exercise in frailty and sarcopenia. M...,Peer reviewed articles,"Fernando Millan-Domingo, Esther Garcia-Domingu...",Molecular Aspects of Medicine,100.0,2024.0,101322,0098-2997,,10.1016/j.mam.2024.101322,825546,DIABFRAIL-LATAM,Project publication,2025-04-22 19:49:31,1285508
5,956454_2575447_PUBLI,EU-compliant wastewater recycled phosphorus: H...,Peer reviewed articles,"Juan Serrano-Gomez, Geneviève S. Metson, Tina-...",Journal of Cleaner Production,429.0,2023.0,139482,0959-6526,,10.1016/j.jclepro.2023.139482,956454,RecaP,Project publication,2025-04-22 19:52:36,1286137
6,956439_2550634_PUBLI,Economically competitive Organic Acid‐Base mix...,Peer reviewed articles,"Flore Kilens, Ane Olazabal, Daniele Mantione, ...",ChemCatChem,16.0,2025.0,,1867-3880,,10.1002/cctc.202400215,956439,NATURE,Project publication,2025-04-22 19:52:34,1286131
7,899546_2600994_PUBLI,Biological invasions are a population‐level ra...,Peer reviewed articles,"Phillip J. Haubrock, Ismael Soto, Danish A. Ah...",Global Change Biology,30.0,2024.0,,1354-1013,,10.1111/gcb.17312,899546,BIENVENUE,Project publication,2025-04-23 10:18:35,1288735
8,890284_1737644_PUBLI,The role of individual variability on the pred...,Peer reviewed articles,"Marianna Chimienti, Akiko Kato, Olivia Hicks, ...",Scientific Reports,,2022.0,,2045-2322,,10.1038/s41598-022-22258-1,890284,MuFFIN,Project publication,2025-04-23 10:18:25,1288665
9,852187_2606421_PUBLI,"Active droplets through enzyme-free, dynamic p...",Peer reviewed articles,"Simone M. Poprawa, Michele Stasi, Brigitte A. ...",Nature Communications,15.0,2024.0,,2041-1723,,10.1038/s41467-024-48571-z,852187,ActiDrops,Project publication,2025-04-22 19:05:11,1283772



🔹 reportSummaries.csv


Unnamed: 0,id,title,projectID,projectAcronym,attachment,contentUpdateDate,rcn
0,101032582_PS,Periodic Reporting for period 1 - AML-SynergyX...,101032582,AML-SynergyX,/docs/results/h2020/101032/101032582_PS/flt3-d...,2025-04-08 15:36:00,1278826
1,101002584_PS,Periodic Reporting for period 2 - PlasticityOf...,101002584,PlasticityOfMind,,2025-04-22 13:51:28,1281783
2,101026382_PS,Periodic Reporting for period 1 - AMPLIFI (Dev...,101026382,AMPLIFI,/docs/results/h2020/101026/101026382_PS/graphi...,2025-02-24 17:20:00,1257164
3,101018645_PS,Periodic Reporting for period 2 - MINiTEXTS (M...,101018645,MINiTEXTS,,2025-04-22 13:51:28,1281780
4,803096_PS,Periodic Reporting for period 4 - SPEC (Secure...,803096,SPEC,/docs/results/h2020/803/803096_PS/mpc-web.jpg,2025-03-13 17:28:33,803113
5,772286_PS,Periodic Reporting for period 4 - SOLWET (Elec...,772286,SOLWET,,2025-04-22 17:08:44,863559
6,848137_PS,Periodic Reporting for period 3 - MENTUPP (Men...,848137,MENTUPP,/docs/results/h2020/848/848137_PS/mentupp-hub-...,2025-04-08 15:36:00,1278904
7,101001081_PS,Periodic Reporting for period 2 - UniTED (Unra...,101001081,UniTED,,2025-04-22 13:52:11,1056501
8,101000216_PS,Periodic Reporting for period 2 - Code Re-farm...,101000216,Code Re-farm,/docs/results/h2020/101000/101000216_PS/04-saa...,2025-02-24 09:58:48,1255560
9,101002629_PS,Periodic Reporting for period 2 - CrossOver (M...,101002629,CrossOver,,2025-04-22 13:51:28,1281781



🔹 topics.csv


Unnamed: 0,projectID,topic,title
0,672641,BIOTEC-5a-2014-1,SME boosting biotechnology-based industrial pr...
1,816332,EIC-SMEInst-2018-2020,SME instrument
2,750248,MSCA-IF-2016,Individual Fellowships
3,855867,EIC-SMEInst-2018-2020,SME instrument
4,778157,MSCA-RISE-2017,Research and Innovation Staff Exchange
5,798830,MSCA-IF-2017,Individual Fellowships
6,821431,CE-SC5-01-2018,Methods to remove hazardous substances and con...
7,832787,MSCA-IF-2018,Individual Fellowships
8,831756,MSCA-IF-2018,Individual Fellowships
9,699794,MSCA-IF-2015-GF,Marie Skłodowska-Curie Individual Fellowships ...



🔹 webItem.csv


Unnamed: 0,language,availableLanguages,uri,title,type,source,represents
0,en,"en,any",/docs/projects/files/956/956874/232820.pdf,Optimised Usage of Hybrid HPC Infrastructures,relatedPrint,editorial,
1,en,"en,any",/docs/projects/files/792/792037/215035.jpg,,relatedImage,editorial,projectLogo
2,en,"en,any",/docs/projects/files/770/770045/dialls-black.jpg,,relatedImage,editorial,projectLogo
3,en,en,/docs/projects/files/896/896651/227621.png,,relatedImage,editorial,projectLogo
4,en,"en,any",/docs/projects/files/812/812753/220825.png,,relatedImage,editorial,projectLogo
5,en,"en,any",/docs/projects/files/955/955646/230763.png,,relatedImage,editorial,projectLogo
6,en,"en,any",/docs/projects/files/642/642154/fissac-logo-co...,,relatedImage,editorial,projectLogo
7,en,"en,any",/docs/projects/files/101/101000236/236283.pdf,1st issue of the GEroNIMO newsletter,relatedPrint,editorial,
8,en,"en,any",/docs/projects/files/723/723311/c-mobile-logo-...,,relatedImage,editorial,projectLogo
9,en,"en,any",/docs/projects/files/654/654408/s2l-flyer-prin...,"Unlimited renewable fuel supply from H2O, CO2 ...",relatedPrint,editorial,



🔹 webLink.csv


Unnamed: 0,projectID,physUrl,id,availableLanguages,status,archivedDate,type,source,represents
0,816332,https://inecoinstitute.com/dev/,f6bfb66eb366297bee858fa3d903867f,en,,,relatedWebsite,corda,project
1,750248,http://www.javiermontenegrochemistry.com/http_...,5f281d32198034ca942fe4191c495692,en,,,relatedWebsite,corda,project
2,855867,https://nanoflowsizer.com/products/modules/,284a6c9f1a8fc7520e20148f8313903b,en,,,relatedWebsite,corda,project
3,778157,https://ec.europa.eu/research/participants/doc...,43aee9169fb44c80b5ab726273238815,en,,,projectDeliverable,corda,project
4,778157,https://ec.europa.eu/research/participants/doc...,46817d80dd484174ae81b2fa089b89ec,en,,,projectDeliverable,corda,project
5,778157,https://ec.europa.eu/research/participants/doc...,c0a9d56af8b1dd9b99c3d4fe99f5228e,en,,,projectDeliverable,corda,
6,778157,https://ec.europa.eu/research/participants/doc...,91043e70c0f1f11fd146a677f07cd5d7,en,,,projectDeliverable,corda,project
7,778157,https://ec.europa.eu/research/participants/doc...,c8a150ba2241b5b7f718fbbd4bd3848f,en,,,projectDeliverable,corda,
8,778157,https://ec.europa.eu/research/participants/doc...,1fc127e064d5d0deb1de7fb221d36e49,en,,,projectDeliverable,corda,project
9,778157,https://ec.europa.eu/research/participants/doc...,9078b3eaab19679578247f001d8b247c,en,,,projectDeliverable,corda,project


In [7]:
import pandas as pd
from IPython.display import display, HTML
import os
import csv

data_path = r"C:\Users\jawri\Data course\Capstone\CORDIS data\Projects"

filenames = [
    "euroSciVoc.csv",
    "legalBasis.csv",
    "organization.csv",
    "projectDeliverables.csv",
    "projectIrps_h2020.csv",
    "projectPublications.csv",
    "reportSummaries.csv",
    "topics.csv",
    "webItem.csv",
    "webLink.csv"
]

# Describe numeric stats in each file
for file in filenames:
    print(f"\n🔹 {file}")
    try:
        df = pd.read_csv(os.path.join(data_path, file), sep=';', encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')
        df.head()
        desc = df.describe()
        display(HTML(desc.to_html()))
    except Exception as e:
        print(f"❌ Failed to describe {file}: {e}")



🔹 euroSciVoc.csv


Unnamed: 0,projectID,euroSciVocDescription
count,112005.0,0.0
mean,8678214.0,
std,27031170.0,
min,115797.0,
25%,700540.0,
50%,769513.0,
75%,854157.0,
max,101052000.0,



🔹 legalBasis.csv


Unnamed: 0,projectID
count,65799.0
mean,10638870.0
std,29847300.0
min,115797.0
25%,722956.0
50%,798157.0
75%,873084.0
max,101052000.0



🔹 organization.csv


Unnamed: 0,projectID,organisationID,rcn,order,ecContribution,netEcContribution
count,178015.0,178015.0,178015.0,178015.0,170246.0,177978.0
mean,11680060.0,974183700.0,2059347.0,14.585411,400615.1,383204.6
std,31212740.0,35624270.0,283360.6,74.819269,1073181.0,1006503.0
min,115797.0,873615300.0,1905548.0,1.0,0.0,0.0
25%,717861.5,949716500.0,1906589.0,2.0,79750.0,73576.25
50%,787054.0,998776700.0,1919454.0,6.0,212933.8,201165.2
75%,871525.0,999902400.0,2001494.0,12.0,440000.0,417034.7
max,101052000.0,999997900.0,3146864.0,1001.0,178169200.0,178169200.0



🔹 projectDeliverables.csv


Unnamed: 0,projectID,rcn
count,184832.0,184832.0
mean,9705678.0,658737.6
std,28566400.0,313461.3
min,115797.0,203948.0
25%,693382.0,335946.8
50%,776691.0,662224.5
75%,862848.0,912029.2
max,101046700.0,1290566.0



🔹 projectIrps_h2020.csv


Unnamed: 0,"projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier"
count,2060
unique,1980
top,",923089734.0,AM TECHNOLOGY LIMITED,2015-03-25,PL,15720434,T,https://worldwide.espacenet.com/patent/search?q=PL15720434T,2014-03-26,BACKGROUND,2019-01-31,PL,3122695,T3,https://worldwide.espacenet.com/patent/search?q=PL3122695T3,50819834.0\r\n738719,PATENT,Cement-based photocatalytic composition, and use thereof for obtaining water paints, in particular for outdoor applications"
freq,19



🔹 projectPublications.csv


  df = pd.read_csv(os.path.join(data_path, file), sep=';', encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')


Unnamed: 0,publishedYear,projectID,rcn
count,444889.0,444942.0,444942.0
mean,2020.197047,7078492.0,793795.4
std,2.31565,24367630.0,221338.9
min,2001.0,115797.0,482511.0
25%,2018.0,688156.0,595884.2
50%,2020.0,760173.0,768083.5
75%,2022.0,837793.8,960153.8
max,2047.0,101046700.0,1290601.0



🔹 reportSummaries.csv


Unnamed: 0,projectID,rcn
count,34227.0,34227.0
mean,9876284.0,580730.6
std,28799600.0,321907.5
min,115797.0,184713.0
25%,709132.5,259735.5
50%,790155.0,478099.0
75%,867469.0,863826.5
max,101052000.0,1283565.0



🔹 topics.csv


Unnamed: 0,projectID
count,35389.0
mean,10793390.0
std,30062470.0
min,115797.0
25%,711332.0
50%,793159.0
75%,870697.0
max,101052000.0



🔹 webItem.csv


Unnamed: 0,language,availableLanguages,uri,title,type,source,represents
count,10,10,10,3,10,10,7
unique,1,2,10,3,2,1,1
top,en,"en,any",/docs/projects/files/956/956874/232820.pdf,Optimised Usage of Hybrid HPC Infrastructures,relatedImage,editorial,projectLogo
freq,10,9,1,1,7,10,7



🔹 webLink.csv


Unnamed: 0,projectID
count,217704.0
mean,9803608.0
std,28706310.0
min,115797.0
25%,695996.0
50%,777111.0
75%,862957.0
max,101052000.0


In [31]:
import pandas as pd
from IPython.display import display, HTML
import os
import csv

data_path = r"C:\Users\jawri\Data course\Capstone\CORDIS data\Projects"

filenames = [
    "euroSciVoc.csv",
    "legalBasis.csv",
    "organization.csv",
    "projectDeliverables.csv",
    "projectIrps_h2020.csv",
    "projectPublications.csv",
    "reportSummaries.csv",
    "topics.csv",
    "webItem.csv",
    "webLink.csv"
]

for file in filenames:
    print(f"\n🔍 Inspecting {file}")
    try:
        df = pd.read_csv(
            os.path.join(data_path, file),
            sep=';',
            encoding='utf-8',
            quoting=csv.QUOTE_ALL,
            on_bad_lines='skip'
        )
        
        summary = []
        for col in df.columns:
            non_null = df[col].notna().sum()
            nulls = df[col].isna().sum()
            unique = df[col].nunique(dropna=True)
            most_common = df[col].mode().iloc[0] if not df[col].mode().empty else "N/A"
            freq = df[col].value_counts(dropna=True).iloc[0] if not df[col].value_counts().empty else "N/A"
            summary.append([col, non_null, nulls, unique, most_common, freq])

        summary_df = pd.DataFrame(summary, columns=[
            "Column", "Non-null", "Null", "Unique", "Most Frequent", "Frequency"
        ])
        display(HTML(summary_df.to_html(index=False)))
    
    except Exception as e:
        print(f"❌ Failed to process {file}: {e}")



🔍 Inspecting euroSciVoc.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,112005,0,32149,115843,5.0
euroSciVocCode,112005,0,1020,/23/47/307,1795.0
euroSciVocPath,112005,0,1020,/natural sciences/computer and information sciences/software,1795.0
euroSciVocTitle,112005,0,1020,software,1795.0
euroSciVocDescription,0,112005,0,,



🔍 Inspecting legalBasis.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,65799,0,35389,818116,10
legalBasis,65799,0,219,H2020-EU.1.3.,11808
title,65799,0,219,EXCELLENT SCIENCE - Marie Skłodowska-Curie Actions,11808
uniqueProgrammePart,35400,30399,1,True,35400



🔍 Inspecting organization.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,178015,0,35386,785219,210
projectAcronym,178015,0,32975,GrapheneCore2,210
organisationID,178015,0,41732,999997930,1875
vatNumber,154105,23910,33585,FR40180089013,1875
name,178015,0,41656,CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE CNRS,1875
shortName,137820,40195,27202,CNRS,1877
SME,177499,516,2,False,142361
activityType,177069,946,5,HES,59470
street,176783,1232,38378,RUE MICHEL ANGE 3,1875
postCode,176029,1986,19046,1000,2533



🔍 Inspecting projectDeliverables.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
id,184832,0,184832,101000002_10_DELIV,1
title,184831,1,164617,Data Management Plan,1686
deliverableType,184832,0,7,"Documents, reports",132170
description,183649,1183,169902,Migration,438
projectID,184832,0,12561,662287,207
projectAcronym,184832,0,11843,CONCERT,249
url,184828,4,184828,https://ec.europa.eu/research/participants/documents/downloadPublic?documentIds=080166e50000835d&appId=PPGMS,1
collection,184832,0,1,Project deliverable,184832
contentUpdateDate,184832,0,73227,2020-08-07 14:23:42,5123
rcn,184832,0,184832,203948,1



🔍 Inspecting projectIrps_h2020.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
"projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier",2060,0,1980,",923089734.0,AM TECHNOLOGY LIMITED,2015-03-25,PL,15720434,T,https://worldwide.espacenet.com/patent/search?q=PL15720434T,2014-03-26,BACKGROUND,2019-01-31,PL,3122695,T3,https://worldwide.espacenet.com/patent/search?q=PL3122695T3,50819834.0\r\n738719,PATENT,Cement-based photocatalytic composition, and use thereof for obtaining water paints, in particular for outdoor applications",19



🔍 Inspecting projectPublications.csv


  df = pd.read_csv(


Column,Non-null,Null,Unique,Most Frequent,Frequency
id,444942,0,444942,101000014_967109_PUBLI,1
title,444941,1,418860,Introduction,109
isPublishedAs,444941,1,7,Peer reviewed articles,283673
authors,444930,12,393331,Ayelet Peer,305
journalTitle,412720,32222,105550,Nature Communications,5110
journalNumber,273380,171562,51903,1,13105
publishedYear,444889,53,21,2021.0,69728
publishedPages,223635,221307,111405,1-6,2297
issn,295039,149903,24098,2041-1723,5548
isbn,43801,401141,20941,978-1-5386-4862-9,80



🔍 Inspecting reportSummaries.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
id,34227,0,34227,101000002_PS,1
title,34227,0,34168,Periodic Reporting for period 1 - ERBSN 4 H2020 (Eastern Romanian Business Support Network support for innovative SMEs),4
projectID,34227,0,34227,115797,1
projectAcronym,34227,0,31920,INSPIRE,15
attachment,31023,3204,31023,/docs/results/h2020/101/101000002_PS/a2eic.jpg,1
contentUpdateDate,34227,0,17522,2020-09-24 09:59:54,106
rcn,34227,0,34227,184713,1



🔍 Inspecting topics.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,35389,0,35389,115797,1
topic,35389,0,3489,EIC-SMEInst-2018-2020,2482
title,35389,0,3267,Individual Fellowships,7150



🔍 Inspecting webItem.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
language,10,0,1,en,10
availableLanguages,10,0,2,"en,any",9
uri,10,0,10,/docs/projects/files/101/101000236/236283.pdf,1
title,3,7,3,1st issue of the GEroNIMO newsletter,1
type,10,0,2,relatedImage,7
source,10,0,1,editorial,10
represents,7,3,1,projectLogo,7



🔍 Inspecting webLink.csv


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,217704,0,31412,662287,208
physUrl,217704,0,216403,https://www.kuleuven.be/onderzoek/portaal/,24
id,217702,2,216410,00f06fb6fd07ae8571862285df62ed3a,24
availableLanguages,217704,0,19,en,217511
status,326,217378,3,invalid,138
archivedDate,325,217379,155,2022-02-18 00:00:00,68
type,217704,0,7,projectDeliverable,184469
source,217704,0,3,corda,215359
represents,194183,23521,11,project,193453


In [33]:
with open(os.path.join(data_path, "projectIrps_h2020.csv"), 'r', encoding='utf-8') as f:
    lines = f.readlines()
print(f"Lines in file (including header): {len(lines)}")

Lines in file (including header): 2490


In [35]:
file = "projectIrps_h2020.csv"
path = os.path.join(data_path, file)

# Attempt with robust settings
df = pd.read_csv(
    path,
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    on_bad_lines='skip',
    engine='python'  # More forgiving with line breaks
)

# Show result
print(f"✅ Loaded {file} — shape: {df.shape}")
display(HTML(df.head(10).to_html(notebook=True)))

✅ Loaded projectIrps_h2020.csv — shape: (1968, 1)


Unnamed: 0,"projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier"
0,"641518,PATENT,Method for the acquisition and p..."
1,"635734,PATENT,KETOREDUCTASES,986597283.0,C-LEC..."
2,"653203,PATENT,Mecanisme de pivotement de dossi..."
3,"653203,PATENT,Ouvrant en élytre vers l'arrière..."
4,"653203,PATENT,Système de fixation et de réglag..."
5,"653203,PATENT,Suspension Hydropneumatique sur ..."
6,"653203,PATENT,Gestion du roulis pour véhicules..."
7,"653203,PATENT,Montage de roue AR d'un bras osc..."
8,"653203,PATENT,Groupe hydraulique simplifié - ..."
9,"653203,PATENT,Contrôle semi-actif des véhicule..."


In [37]:
file = "projectIrps_h2020.csv"
path = os.path.join(data_path, file)

# Show first 5 lines raw
print("🔍 First 5 raw lines:")
with open(path, 'r', encoding='utf-8') as f:
    for _ in range(5):
        print(f.readline())

# Try loading with both delimiters
print("\n✅ Trying delimiter: COMMA")
try:
    df_comma = pd.read_csv(path, sep=',', encoding='utf-8', engine='python', on_bad_lines='skip')
    print(f"Shape: {df_comma.shape}")
    print(df_comma.head())
except Exception as e:
    print(f"❌ Failed with comma: {e}")

print("\n✅ Trying delimiter: SEMICOLON")
try:
    df_semi = pd.read_csv(path, sep=';', encoding='utf-8', engine='python', on_bad_lines='skip')
    print(f"Shape: {df_semi.shape}")
    print(df_semi.head())
except Exception as e:
    print(f"❌ Failed with semicolon: {e}")

🔍 First 5 raw lines:
projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier

641518,PATENT,Method for the acquisition and processing of geographical information of a path,996169922.0,GEONUMERICS SL,2013-07-22,US,201314417435,A,https://worldwide.espacenet.com/patent/search?q=US201314417435A,2012-07-26,BACKGROUND,2015-07-09,US,2015192928,A1,https://worldwide.espacenet.com/patent/search?q=US2015192928A1,47528706.0

635734,PATENT,KETOREDUCTASES,986597283.0,C-LECTA GMBH,2017-07-14,EP,2018069058,W,https://worldwide.espacenet.com/patent/search?q=EP2018069058W,,BACKGROUND,2019-01-17,WO,2019012095,A1,https://worldwide.espacenet.com/patent/search?q=WO2019012095A1,62837946.0

653203,PATENT,Mecanisme de pivotement de dossier de siege,999420877.0,PSA AUTOMOBILES SA,2015-04-08,FR,1553008,A,https://worldwide.espacenet.com

In [39]:
file = "projectIrps_h2020.csv"
path = os.path.join(data_path, file)

# Show first 5 raw lines
print("🔍 First 5 raw lines:")
with open(path, 'r', encoding='utf-8') as f:
    for _ in range(5):
        print(f.readline())

# Try COMMA delimiter
print("\n✅ Trying delimiter: COMMA")
try:
    df_comma = pd.read_csv(path, sep=',', encoding='utf-8', engine='python', on_bad_lines='skip')
    print(f"Shape: {df_comma.shape}")
    display(HTML(df_comma.head(10).to_html(notebook=True)))
except Exception as e:
    print(f"❌ Failed with comma: {e}")

# Try SEMICOLON delimiter
print("\n✅ Trying delimiter: SEMICOLON")
try:
    df_semi = pd.read_csv(path, sep=';', encoding='utf-8', engine='python', on_bad_lines='skip')
    print(f"Shape: {df_semi.shape}")
    display(HTML(df_semi.head(10).to_html(notebook=True)))
except Exception as e:
    print(f"❌ Failed with semicolon: {e}")

🔍 First 5 raw lines:
projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier

641518,PATENT,Method for the acquisition and processing of geographical information of a path,996169922.0,GEONUMERICS SL,2013-07-22,US,201314417435,A,https://worldwide.espacenet.com/patent/search?q=US201314417435A,2012-07-26,BACKGROUND,2015-07-09,US,2015192928,A1,https://worldwide.espacenet.com/patent/search?q=US2015192928A1,47528706.0

635734,PATENT,KETOREDUCTASES,986597283.0,C-LECTA GMBH,2017-07-14,EP,2018069058,W,https://worldwide.espacenet.com/patent/search?q=EP2018069058W,,BACKGROUND,2019-01-17,WO,2019012095,A1,https://worldwide.espacenet.com/patent/search?q=WO2019012095A1,62837946.0

653203,PATENT,Mecanisme de pivotement de dossier de siege,999420877.0,PSA AUTOMOBILES SA,2015-04-08,FR,1553008,A,https://worldwide.espacenet.com

Unnamed: 0,projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier
0,641518,PATENT,Method for the acquisition and processing of g...,996169922.0,GEONUMERICS SL,2013-07-22,US,201314417435,A,https://worldwide.espacenet.com/patent/search?...,2012-07-26,BACKGROUND,2015-07-09,US,2015192928,A1,https://worldwide.espacenet.com/patent/search?...,47528706.0
1,635734,PATENT,KETOREDUCTASES,986597283.0,C-LECTA GMBH,2017-07-14,EP,2018069058,W,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2019-01-17,WO,2019012095,A1,https://worldwide.espacenet.com/patent/search?...,62837946.0
2,653203,PATENT,Mecanisme de pivotement de dossier de siege,999420877.0,PSA AUTOMOBILES SA,2015-04-08,FR,1553008,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2018-10-12,FR,3034721,B1,https://worldwide.espacenet.com/patent/search?...,53514331.0
3,653203,PATENT,Ouvrant en élytre vers l'arrière,999420877.0,PSA AUTOMOBILES SA,2016-06-26,FR,1651536,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2018-02-23,FR,3048202,B1,https://worldwide.espacenet.com/patent/search?...,56117860.0
4,653203,PATENT,Système de fixation et de réglage du dossier p...,999420877.0,PSA AUTOMOBILES SA,2015-05-21,FR,1554570,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2017-06-02,FR,3036365,B1,https://worldwide.espacenet.com/patent/search?...,53758419.0
5,653203,PATENT,Suspension Hydropneumatique sur véhicule incli...,999420877.0,PSA AUTOMOBILES SA,2015-06-15,FR,1555448,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2017-07-07,FR,3037279,B1,https://worldwide.espacenet.com/patent/search?...,54291421.0
6,653203,PATENT,Gestion du roulis pour véhicules pendulaires,999420877.0,PSA AUTOMOBILES SA,2016-01-11,FR,1650180,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2018-01-26,FR,3046571,B1,https://worldwide.espacenet.com/patent/search?...,55650523.0
7,653203,PATENT,Montage de roue AR d'un bras oscillant sur un ...,999420877.0,PSA AUTOMOBILES SA,2015-07-02,FR,1556240,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2017-07-21,FR,3038291,B1,https://worldwide.espacenet.com/patent/search?...,54066089.0
8,653203,PATENT,Groupe hydraulique simplifié - Seringue motor...,999420877.0,PSA AUTOMOBILES SA,2016-03-23,FR,1652467,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2019-04-26,FR,3049230,B1,https://worldwide.espacenet.com/patent/search?...,56119553.0
9,653203,PATENT,Contrôle semi-actif des véhicules librement in...,999420877.0,PSA AUTOMOBILES SA,2016-10-29,FR,1660502,A,https://worldwide.espacenet.com/patent/search?...,,BACKGROUND,2018-11-30,FR,3058093,B1,https://worldwide.espacenet.com/patent/search?...,57861045.0



✅ Trying delimiter: SEMICOLON
Shape: (1968, 1)


Unnamed: 0,"projectID,type,title,organisationID,applicantName,applicationDate,applicationPrefix,applicationIdentifier,applicationKind,epoAppUrl,priorityDate,patentType,awardDate,awardPrefix,awardIdentifier,awardKind,epoPubUrl,patentFamilyIdentifier"
0,"641518,PATENT,Method for the acquisition and p..."
1,"635734,PATENT,KETOREDUCTASES,986597283.0,C-LEC..."
2,"653203,PATENT,Mecanisme de pivotement de dossi..."
3,"653203,PATENT,Ouvrant en élytre vers l'arrière..."
4,"653203,PATENT,Système de fixation et de réglag..."
5,"653203,PATENT,Suspension Hydropneumatique sur ..."
6,"653203,PATENT,Gestion du roulis pour véhicules..."
7,"653203,PATENT,Montage de roue AR d'un bras osc..."
8,"653203,PATENT,Groupe hydraulique simplifié - ..."
9,"653203,PATENT,Contrôle semi-actif des véhicule..."


In [41]:
summary = []

for col in df_comma.columns:
    non_null = df_comma[col].notna().sum()
    nulls = df_comma[col].isna().sum()
    unique = df_comma[col].nunique(dropna=True)
    most_common = df_comma[col].mode().iloc[0] if not df_comma[col].mode().empty else "N/A"
    freq = df_comma[col].value_counts(dropna=True).iloc[0] if not df_comma[col].value_counts().empty else "N/A"
    summary.append([col, non_null, nulls, unique, most_common, freq])

summary_df = pd.DataFrame(summary, columns=[
    "Column", "Non-null", "Null", "Unique", "Most Frequent", "Frequency"
])

from IPython.display import display, HTML
display(HTML(summary_df.to_html(index=False)))


Column,Non-null,Null,Unique,Most Frequent,Frequency
projectID,2324,0,879,738719,80
type,2324,0,1,PATENT,2324
title,2324,0,1838,"Cement-based photocatalytic composition, and use thereof for obtaining water paints, in particular for outdoor applications\n",36
organisationID,2323,1,767,923089734.0,80
applicantName,2324,0,778,AM TECHNOLOGY LIMITED,80
applicationDate,2321,3,1139,2015-03-25,36
applicationPrefix,2324,0,48,EP,1141
applicationIdentifier,2324,0,2003,15720434,25
applicationKind,2324,0,4,A,1597
epoAppUrl,2324,0,2007,https://worldwide.espacenet.com/patent/search?q=PL15720434T,25


In [45]:
# Load euroSciVoc.csv (use sep=';' if you haven’t already loaded it)
euro_df = pd.read_csv(
    os.path.join(data_path, "euroSciVoc.csv"),
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
    on_bad_lines='skip'
)

# Split the euroSciVocPath into hierarchical levels
hierarchy_split = euro_df['euroSciVocPath'].str.strip('/').str.split('/', expand=True)

# Rename columns for clarity
hierarchy_split.columns = [f'Level_{i+1}' for i in hierarchy_split.columns]

# Combine with original title for context (optional)
hierarchy_split['Title'] = euro_df['euroSciVocTitle']

# Drop duplicates to show unique topic paths
unique_hierarchy = hierarchy_split.drop_duplicates().reset_index(drop=True)

# Display the hierarchy nicely
from IPython.display import display, HTML
display(HTML(unique_hierarchy.head(100).to_html(index=False)))


Level_1,Level_2,Level_3,Level_4,Level_5,Level_6,Level_7,Title
engineering and technology,mechanical engineering,manufacturing engineering,,,,,manufacturing engineering
medical and health sciences,clinical medicine,surgery,,,,,surgery
natural sciences,chemical sciences,inorganic chemistry,transition metals,,,,transition metals
engineering and technology,nanotechnology,nano-materials,,,,,nano-materials
medical and health sciences,clinical medicine,cardiology,,,,,cardiology
engineering and technology,environmental engineering,energy and fuels,liquid fuels,,,,liquid fuels
engineering and technology,environmental engineering,energy and fuels,fossil energy,petroleum,,,petroleum
natural sciences,biological sciences,biochemistry,biomolecules,proteins,,,proteins
natural sciences,biological sciences,cell biology,,,,,cell biology
natural sciences,biological sciences,genetics,nucleotides,,,,nucleotides


In [47]:
# Split the hierarchy path into levels
hierarchy_split = euro_df['euroSciVocPath'].str.strip('/').str.split('/', expand=True)

# Rename for readability
hierarchy_split.columns = [f'Level_{i+1}' for i in hierarchy_split.columns]

# Count unique values at each level
unique_counts = hierarchy_split.nunique().reset_index()
unique_counts.columns = ['Level', 'Unique Values']

# Display as a nice table
from IPython.display import display, HTML
display(HTML(unique_counts.to_html(index=False)))


Level,Unique Values
Level_1,6
Level_2,40
Level_3,253
Level_4,447
Level_5,231
Level_6,37
Level_7,9


In [49]:
summary = []

for col in hierarchy_split.columns:
    non_blank = hierarchy_split[col].dropna().astype(str).str.strip()
    unique = non_blank[non_blank != 'None'].nunique()
    blanks = hierarchy_split[col].isna().sum() + (hierarchy_split[col] == 'None').sum()
    summary.append([col, unique, blanks])

summary_df = pd.DataFrame(summary, columns=['Level', 'Unique Values', 'Blank Count'])

from IPython.display import display, HTML
display(HTML(summary_df.to_html(index=False)))


Level,Unique Values,Blank Count
Level_1,6,0
Level_2,40,98
Level_3,253,2056
Level_4,447,34904
Level_5,231,87854
Level_6,37,108443
Level_7,9,111579


In [9]:
# Show unique types and their counts
type_counts = projectIrps['type'].value_counts(dropna=False).reset_index()
type_counts.columns = ['type', 'count']
print(type_counts)


NameError: name 'projectIrps' is not defined