# Fetching the Relevant Companies

So far we have fetched roughly 80 technologies and 6 000 papers. Now we need to fetch companies. Our papers dataset has the property **institutions**: these are the institutions affiliated with the authors of each paper. We now explore how useful this data is.

In [None]:
# How many papers have institutions assigned to them?
import json 

def compute_institutions_coverage(file_path):
    """Count how many papers in a JSONL file list at least one institution.

    Each non-blank line of the file is parsed as one JSON document. Lines
    that are blank, are not valid JSON, or do not decode to a JSON object
    are skipped and are not counted as papers.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file, one paper per line.

    Returns:
        A tuple ``(with_institutions_count, total_count, percentage)``:
        the number of papers whose ``institutions`` value is a non-empty
        list, the number of papers read, and the former as a percentage of
        the latter (``0`` when no papers were read).

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    total_count = 0
    with_institutions_count = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                paper = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (e.g. `null`, `[]`, `42`) has
            # no `.get`; skip it like a malformed line instead of crashing.
            if not isinstance(paper, dict):
                continue

            total_count += 1
            institutions = paper.get('institutions')
            if isinstance(institutions, list) and institutions:
                with_institutions_count += 1

    # Guard against division by zero for empty or fully unparsable files.
    percentage = (with_institutions_count / total_count) * 100 if total_count > 0 else 0
    return with_institutions_count, total_count, percentage

# Report what share of the cleaned papers carry a non-empty institutions list.
data_file = "../data/papers-data/cleaned_papers.jsonl"
(count, total, pct) = compute_institutions_coverage(file_path=data_file)
print(
    f"{count} out of {total} papers have institutions assigned to them ({pct:.2f}%)"
)


4409 out of 6000 papers have institutions assigned to them (73.48%)


In [8]:
# Create a simple dataframe of the institutions and how often they show up
import pandas as pd
from collections import Counter

def compute_institutions_frequency(file_path):
    """Build a table of institution names and how often they occur.

    Each non-blank line of the file is parsed as one JSON document. Lines
    that are blank, are not valid JSON, or do not decode to a JSON object
    are skipped. For every remaining paper whose ``institutions`` value is a
    list, each dict entry with a non-blank string ``display_name`` adds one
    to that name's tally (names are stripped of surrounding whitespace).
    Every occurrence counts, so a name listed twice on one paper is counted
    twice.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file, one paper per line.

    Returns:
        A ``pandas.DataFrame`` with columns ``institution`` and ``count``,
        sorted by ``count`` in descending order with a fresh 0-based index.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                paper = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (e.g. `null`, `[]`, `42`) has
            # no `.get`; skip it like a malformed line instead of crashing.
            if not isinstance(paper, dict):
                continue

            institutions = paper.get('institutions')
            if not isinstance(institutions, list):
                continue
            for inst in institutions:
                if not isinstance(inst, dict):
                    continue
                name = inst.get("display_name")
                if isinstance(name, str) and name.strip():
                    counter[name.strip()] += 1

    df = pd.DataFrame(counter.items(), columns=['institution', 'count'])
    df = df.sort_values(by='count', ascending=False).reset_index(drop=True)
    return df

df_counts = compute_institutions_frequency(data_file)
# Persist the full frequency table as CSV (without the index column).
output_file = "../data/company-data/institutions_counts.csv"
df_counts.to_csv(output_file, index=False)
print(f"Total number of institutions: {len(df_counts)}")


# Create a second dataframe with only the institutions that do not have
# "university" (case-insensitive) in their name. Comparing its size with the
# full table gives a rough sense of how many universities are in the data.
# NOTE: this is a name-based heuristic; universities whose name lacks the
# English word "university" remain in this subset.
df_non_universities = df_counts[~df_counts['institution'].str.contains("university", case=False, na=False)]
output_file_non_universities = "../data/company-data/institutions_counts_non_universities.csv"
df_non_universities.to_csv(output_file_non_universities, index=False)
# Label this count distinctly so the two printed totals can be told apart.
print(f"Total number of non-university institutions: {len(df_non_universities)}")

Total number of institutions: 7649
Total number of institutions: 5420
