In [266]:
import pandas as pd
import requests
import json
from pandas import json_normalize
from requests.auth import HTTPBasicAuth

pd.options.display.max_seq_items = 20000
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)

### 0.1 Notes and Thoughts

Hard Filters that are needed for the Dataset are:
- B2C
- Europe 
- (Later industry, first iteration without industry specificat)

All Available fields:
id
name
type
deleted
path
tagline
about
url,website_url,twitter_url,facebook_url,linkedin_url,google_url,crunchbase_url,angellist_url
playmarket_app_id,appstore_app_id
images
employees,employees_latest
industries,sub_industries,corporate_industries,service_industries
technologies
income_streams
growth_stage
traffic_summary
hq_locations
client_focus
revenues
tags
ownerships
delivery_method
launch_year,launch_month
has_promising_founder,has_strong_founder,has_super_founder
total_funding,total_funding_source,last_funding,last_funding_source
company_status,last_updated,last_updated_utc,created_utc
facebook_likes_chart,twitter_tweets_chart,twitter_followers_chart,twitter_favorites_chart
employees_chart
similarweb_3_months_growth_unique,similarweb_3_months_growth_percentile,similarweb_3_months_growth_relative,similarweb_3_months_growth_delta,similarweb_6_months_growth_unique,similarweb_6_months_growth_percentile,similarweb_6_months_growth_relative,similarweb_6_months_growth_delta,similarweb_12_months_growth_unique,similarweb_12_months_growth_percentile,similarweb_12_months_growth_relative,similarweb_12_months_growth_delta

app_3_months_growth_unique,app_3_months_growth_percentile,app_3_months_growth_relative,app_6_months_growth_unique,app_6_months_growth_percentile,app_6_months_growth_relative,app_12_months_growth_unique,app_12_months_growth_percentile,app_12_months_growth_relative,employee_3_months_growth_unique

employee_3_months_growth_percentile,employee_3_months_growth_relative,employee_3_months_growth_delta,employee_6_months_growth_unique,employee_6_months_growth_percentile,employee_6_months_growth_relative,employee_6_months_growth_delta,employee_12_months_growth_unique,employee_12_months_growth_percentile,employee_12_months_growth_relative,employee_12_months_growth_delta
kpi_summary
team
investors
fundings
traffic
similarweb_chart
job_openings
exits
trading_multiple
app_downloads_ios_chart,app_downloads_android_chart,app_downloads_ios_incremental_chart,app_downloads_android_incremental_chart
tech_stack_predictleads
sustainable_development_goals
core_side_value
data_type
pic_number
patents_count


### 0.2 Getting Available Filters for the Post Requests

In [39]:

api_url="https://api.dealroom.co/api/v1/companies/filters"
API_KEY=""
auth=HTTPBasicAuth(API_KEY, '')

headers = {'Content-Type': 'application/json'}
data={"fields":"hq_locations",
              "limit": 1,
              "offset": 0
             }
response = requests.post(api_url,data=json.dumps(data),auth=auth,headers=headers)
data=response.json()



# 1.1 Building the Core DF

1. Variables for the DF:

1.1: Independent Variables
- Team
- patents (patents_count)
- Round (fundings)
- Investors
- employees (latest, chart)
- Num. Investors (investors)
- (Industry Category) (industries)
- Description Length (about)
- Has Domain (website_url)
- Has Facebook (facebook_url)
- Has Twitter (twitter_url)
- Has LinkedIn (linkedin_url)
- Geographical Region (country, city/-code) (hq_locations)
- Total Funding (total_funding) (in mio)

1.2 Dependent Variables:
- Acquisition (company_status)
- Failure (company_status)
- Public (ownerships)

### Importing the Json file through according API Call:

In [269]:
fields="id,name,about,tagline,investors,industries,website_url,facebook_url,twitter_url,linkedin_url,hq_locations,total_funding,company_status,ownerships,team,fundings,patents_count,employees_latest,employees_chart"
api_url="https://api.dealroom.co/api/v1/companies/bulk"
API_KEY=""
auth=HTTPBasicAuth(API_KEY, '')
headers = {'Content-Type': 'application/json'}

json_list=[]
next_page_id = ''

while True:
    data = {
        'form_data': {'must':{'hq_locations': ['Europe'], 'client_focus':['business']}},
        'fields': fields,
        'next_page_id': next_page_id,
        'limit': 100,
        }

    headers = {'Content-Type': 'application/json'}

    response = requests.post(api_url,data=json.dumps(data),auth=auth,headers=headers)

    res=json.loads(response.text)
    json_list.append(res)

    next_page_id = res['next_page_id']
    if next_page_id == None:
        break

ConnectionError: HTTPSConnectionPool(host='api.dealroom.co', port=443): Max retries exceeded with url: /api/v1/companies/bulk (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x15b1bd2d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

0

In [83]:
with open("first_json.json", "w") as fp:
        json.dump(json_list, fp)

### 1.2 Converting Json to Pandas DF

In [None]:
df_list=[]
for i in range(len(json_list)):
    df_1=pd.json_normalize(json_list[i]["items"],sep="_").drop(columns=["hq_locations","industries"])
    df_1[["country_name","city_name"]]=pd.json_normalize(json_list[0]["items"],"hq_locations")[["country.name","city.name"]]
    df_1["industry_name"]=pd.json_normalize(json_list[0]["items"],"industries")["name"]
    df_list.append(df_1)

In [None]:
df=pd.concat(df_list)
df.to_csv("Raw_Dataset.csv",index=False) #saves the Data to file


## 1.3 DataCleaning

In [241]:
x=df.isna().sum()
y=df.isna().sum()/len(df)*100
missing_values=pd.concat([x, y], axis=1)
missing_values.columns=["Missing Values","In percent"]
print(missing_values)


                 Missing Values  In percent
id                            0    0.000000
name                          0    0.000000
about                     57585   25.960121
website_url                 611    0.275447
facebook_url             175456   79.098011
twitter_url              112630   50.775175
linkedin_url              50617   22.818849
total_funding                 0    0.000000
company_status               85    0.038319
ownerships                    0    0.000000
investors_items               0    0.000000
investors_total               0    0.000000
country_name                  0    0.000000
city_name                 55456   25.000338
industry_name              6654    2.999716


In [242]:
df.dropna(subset=["website_url"],inplace=True) #companies without website will be deleted to avoid ghost companies
df.dropna(subset=["company_status"],inplace=True) #companies without status (only 85 so can be dismissed)

In [270]:
fields="id,name,about,tagline,investors,industries,website_url,facebook_url,twitter_url,linkedin_url,hq_locations,total_funding,company_status,ownerships,team,fundings,patents_count,employees_latest,employees_chart"
api_url="https://api.dealroom.co/api/v1/companies/bulk"
API_KEY=""
auth=HTTPBasicAuth(API_KEY, '')
headers = {'Content-Type': 'application/json'}

json_list=[]
next_page_id = ''

data = {
        'form_data': {'must':{'hq_locations': ['Europe'], 'client_focus':['business']}},
        'fields': fields,
        'next_page_id': next_page_id,
        'limit': 100,
        }

headers = {'Content-Type': 'application/json'}

response = requests.post(api_url,data=json.dumps(data),auth=auth,headers=headers)

res=json.loads(response.text)


ConnectionError: HTTPSConnectionPool(host='api.dealroom.co', port=443): Max retries exceeded with url: /api/v1/companies/bulk (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x19395ff40>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [267]:
pd.json_normalize(res["items"],sep="_")

Unnamed: 0,id,name,about,tagline,industries,website_url,facebook_url,twitter_url,linkedin_url,hq_locations,total_funding,company_status,ownerships,patents_count,employees_latest,employees_chart,investors_items,investors_total,team_items,team_total,fundings_items,fundings_total
0,2519508,Lockspot,"Simple, hassle-free self storage for personal,...",Lockspot | Business and Domestic Self Storage ...,"[{'id': 100111, 'name': 'transportation'}]",http://lockspot.co.uk,,https://twitter.com/lockspotuk,https://www.linkedin.com/company/lockspot/,"[{'id': 2184529, 'address': 'Bynea Business Pa...",0.0,operational,[],0,1.0,"[{'date': '2021-07-05', 'value': 1}, {'date': ...",[],0,[],0,[],0
1,2013268,Loftus Bradford,Loftus Bradford works with the talent and bran...,Boutique Executive Search Firm | Loftus Bradford,"[{'id': 100108, 'name': 'jobs recruitment'}]",https://loftusbradford.com,,https://twitter.com/loftusbradford,https://www.linkedin.com/company/loftus-bradford/,"[{'id': 2574264, 'address': '8001 Barcelona, C...",0.0,operational,[],0,21.0,"[{'date': '2021-07-05', 'value': 16}, {'date':...",[],0,[],0,[],0
2,1988049,LOGITIO,"Entreprise innovante, Logitio propose une suit...",Offers you software solutions to optimize the ...,"[{'id': 100108, 'name': 'jobs recruitment'}]",https://www.logitio.com/,,https://twitter.com/logitiorh,https://www.linkedin.com/company/logitio,"[{'id': 1995453, 'address': '9 Place Kléber, 6...",0.0,operational,[],0,4.0,"[{'date': '2020-09-21', 'value': 3}, {'date': ...",[],0,[],0,[],0
3,3340541,Logic Tours,Trusted by Leading Organisations\nOnline marke...,,"[{'id': 100129, 'name': 'marketing'}]",http://logic.tours,https://www.facebook.com/logicgroupltd,,https://www.linkedin.com/company/logic-tours,"[{'id': 3003048, 'address': 'Rugby, Warwickshi...",0.0,operational,[],0,4.0,"[{'date': '2021-11-14', 'value': 4}, {'date': ...",[],0,[],0,[],0
4,1706767,Inrate,Inrate is a leading independent sustainability...,Independent swiss sustainability rating agency,"[{'id': 1264, 'name': 'fintech'}]",http://inrate.com,,,https://www.linkedin.com/company/inrate,"[{'id': 1683548, 'address': '8001 Zurich, Cant...",0.0,operational,[],0,25.0,"[{'date': '2021-02-03', 'value': 25}]",[],0,[],0,[],0
5,2486018,Lindeas,Working on GNU/Linux ideas\n- System administr...,Lindeas – Working on GNU/Linux ideas,"[{'id': 1262, 'name': 'security'}]",http://lindeas.com,,https://twitter.com/lindeascom,https://www.linkedin.com/company/lindeas/,"[{'id': 2737892, 'address': 'Bulgaria', 'stree...",0.0,operational,[],0,2.0,"[{'date': '2022-01-10', 'value': 2}, {'date': ...",[],0,[],0,[],0
6,2119216,LRC & Co. KG,We are focused on liquidity risk since more th...,LRC - LRC focuses on Liquidity Ris for more th...,"[{'id': 100092, 'name': 'robotics'}]",http://liqrisk.com,,,https://www.linkedin.com/company/liquidity-ris...,"[{'id': 2057040, 'address': 'Wetzbach 26A, 646...",0.0,operational,[],0,2.0,"[{'date': '2021-07-05', 'value': 2}, {'date': ...",[],0,[],0,[],0
7,2119487,Exact Systems,Global Quality Leader. Exact Systems has been ...,,"[{'id': 100111, 'name': 'transportation'}]",http://exactsystems.com,,,https://www.linkedin.com/company/exactsystems,"[{'id': 2057410, 'address': 'Am Klinikum 7, 02...",0.0,operational,[bootstrapped],0,640.0,"[{'date': '2018-12-01', 'value': 471}, {'date'...",[],0,"[{'id': 1864745, 'name': 'Bryan M.', 'path': '...",1,[],0
8,2134608,LIS Logistische Informationssysteme AG,Unsere Logistiksoftware WinSped ist das führen...,LIS - Get in touch with the leading Transport-...,"[{'id': 100108, 'name': 'jobs recruitment'}]",http://lis.eu,,,https://www.linkedin.com/company/lis-logistisc...,"[{'id': 2080578, 'address': 'Hansaring 27, 482...",0.0,operational,[],0,121.0,"[{'date': '2019-07-01', 'value': 75}, {'date':...",[],0,[],0,[],0
9,2480853,Live Soft,Live Soft is a rapidly growing software develo...,,"[{'id': 100039, 'name': 'hosting'}, {'id': 100...",http://livesoft-bg.com,,,https://www.linkedin.com/company/live-soft-ltd-/,"[{'id': 2300231, 'address': 'Sofia, Drochia, M...",0.0,operational,[],0,8.0,"[{'date': '2020-01-21', 'value': 9}, {'date': ...",[],0,[],0,[],0
