[Reference](https://khuyentran1476.medium.com/visualize-similarities-between-companies-with-graph-database-212af872fbf6)

In [1]:
import gdown

url = "https://drive.google.com/uc?id=1owa_NIBnj-Q5uwDyM2xBJeZPipjmkJsH"
output = "artificial_intelligence.json"

gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1owa_NIBnj-Q5uwDyM2xBJeZPipjmkJsH
To: /content/artificial_intelligence.json
100%|██████████| 30.3M/30.3M [00:00<00:00, 98.9MB/s]


'artificial_intelligence.json'

In [2]:
import json

with open("artificial_intelligence.json") as f:
    output = json.load(f)
    
data = output["data"]

# View the keys of the first company
data[0].keys()

dict_keys(['twitterUri', 'nbActiveEmployeeEdges', 'type', 'allNames', 'revenue', 'yearlyRevenues', 'logo', 'id', 'stock', 'nbOrigins', 'sicClassification', 'foundingDate', 'image', 'images', 'wikipediaUri', 'irsEmployerIdentificationNumbers', 'diffbotUri', 'nbIncomingEdges', 'nbEmployeesMin', 'ipo', 'parentCompany', 'angellistUri', 'name', 'motto', 'nbEmployeesMax', 'totalInvestment', 'allOriginHashes', 'linkedInUri', 'naicsClassification', 'nbEmployees', 'githubUri', 'isDissolved', 'importance', 'origin', 'description', 'homepageUri', 'founders', 'ceo', 'investments', 'blogUri', 'descriptors', 'isNonProfit', 'origins', 'isPublic', 'categories', 'crawlTimestamp', 'nbUniqueInvestors', 'facebookUri', 'secCentralIndexKeys', 'summary', 'types', 'boardMembers', 'allUris', 'nbLocations', 'crunchbaseUri', 'industries', 'allDescriptions', 'location', 'locations', 'subsidiaries'])

In [4]:
from datetime import datetime
import pandas as pd

def get_founding_year(company: dict):
    date_str = company.get("foundingDate", {}).get("str")
    if date_str:
        return date_str.split("-")[0][1:]
    return date_str
    
df = pd.DataFrame(
    {
        "company": [company.get("name") for company in data],
        "revenue": [company.get("revenue", {}).get("value") for company in data],
        "nbEmployees": [company.get("nbEmployees", None) for company in data],
        "founding_year": [get_founding_year(company) for company in data],
        "locations": [
            company.get("location", {}).get("country", {}).get("name")
            for company in data
        ],
        "isPublic": [company.get("isPublic") for company in data],
        "industries": [company.get("industries") for company in data],
    }
)

In [5]:
df = df[df.notnull().all(axis=1)]

df["founding_year"] = df["founding_year"].astype(int)

# Get number of years from now
df["year_from_now"] = df["founding_year"].apply(lambda row: datetime.now().year - row)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
df.head(10)

Unnamed: 0,company,revenue,nbEmployees,founding_year,locations,isPublic,industries,year_from_now
0,Google,66001000000.0,150000,1998,United States of America,True,"[Manufacturing Companies, Electronic Products ...",24
1,ABB,34312000000.0,60000,1988,Switzerland,True,"[Software Companies, Artificial Intelligence C...",34
2,Google,490000000.0,35000,1998,United States of America,False,"[Software Companies, Artificial Intelligence C...",24
3,Rockwell Automation Inc.,6666000000.0,23500,1903,United States of America,True,"[Manufacturing Companies, Software Companies, ...",119
4,Keyence,4958000000.0,7500,1974,Japan,True,"[Manufacturing Companies, Electronic Products ...",48
5,IBM INDIA PRIVATE LIMITED,320000000.0,7500,1992,India,False,"[Software Companies, Artificial Intelligence C...",30
6,Toyota Motor,27234520000000.0,350000,1937,Japan,True,"[Manufacturing Companies, Engine Manufacturers...",85
7,Visteon Corporation,2548000000.0,10000,2000,United States of America,True,"[Manufacturing Companies, Electronic Products ...",22
9,"EXLSERVICE HOLDINGS, INC.",958434000.0,35000,1999,United States of America,True,"[Software Companies, Artificial Intelligence C...",23
10,gA,29000000.0,3000,1992,United States of America,False,"[Software Companies, Artificial Intelligence C...",30


In [7]:
df.to_csv("artificial_intelligence.csv", index=False)