This notebook shows the different steps from a list of linkedin URL from dealroom to generating features helping to discriminate between deeptechs and non-deeptechs.

These features can then be added to dealroom extracted features to generate a predict based on the trained model

In [10]:
#List used for the demo
List_already_in_train_set = ["https://www.linkedin.com/company/airmems",
        "https://www.linkedin.com/company/paygenius",
       "https://www.linkedin.com/company/aledia",
        "https://www.linkedin.com/company/datategy",
        "https://www.linkedin.com/company/cornis",
        "https://www.linkedin.com/company/10574009",
        "https://www.linkedin.com/company/bioentech",
        "https://www.linkedin.com/company/mobeye-app",
        "https://www.linkedin.com/company/hubtobee/",
        "https://www.linkedin.com/company/v-motech"
        ]

In [20]:
#List with "new" French companies
List_not_in_train = [
    "https://www.linkedin.com/company/verkor/",
    "https://www.linkedin.com/company/angell/people/",
    "https://www.linkedin.com/company/carester/",
    "https://www.linkedin.com/company/carbios/",
    "https://www.linkedin.com/company/mastergrid/",
    "https://www.linkedin.com/company/epigene-labs/",
    "https://www.linkedin.com/company/spacesense-ai/",
    "https://www.linkedin.com/company/kraaft-co/",
    "https://www.linkedin.com/company/pasqal/",
    "https://www.linkedin.com/company/gourmey/"
        ]

## Script for scraping companies

In [21]:
import pandas as pd
from bpideep.scraping_scripting import make_script_company_scraping

In [22]:
#For this demo, we create a dataframe from the list of url (but in real life, pass the dataframe from the dealroom api call)
df = pd.DataFrame({'linkedin_url':List_not_in_train})

Scripts are generated and printed. To use this script, download the webscraper extension (for instance on chrome).
Then "import new sitemap" -> "create sitemap", copypaste the dictionary/JSON in "sitemap JSON".
Optional: give a name to the sitemap, then "import sitemap", then click on "sitemap_the_name_you_choose" 
and finally "scrape"

In [23]:
class str2(str):
    def __repr__(self):
        # remove the outer two characters, single quotes, and replace them with double quotes.
        # Used to generate scraping url with double quotes compatible with webscraper
        return ''.join(('"', super().__repr__()[1:-1], '"'))

In [24]:
def make_scripts_company_scraping(df_dealroom_data, batch_size):
    """This function generates scraping scripts to be used on webscraper, to scrape people 
    names, titles and profile_urls for a given company. 
    X should be a dataframe with urls in a column 'linkedin_url', as per data provided by
    dearlroom. Scraping takes about 1min per company"""
    # Optional: the csv can be opened in the function, choose one of the two paths
    # depending on usage.
    # Path to open in notebooks: "df_dealroom data = pd.read_csv('../bpideep/rawdata/data.csv')""
    # Path to open in from location of this module:
    # "Path = os.path.join(os.path.dirname(__file__),'rawdata/') ""
    # "df_dealroom data = pd.read_csv(path + 'datacsv')
    company_count = df_dealroom_data.shape[0]
    batches = int(company_count/batch_size)
    urls = df_dealroom_data[['linkedin_url']]
    for i in range(0, batches+1):
        name = f"script_batch_{i}"
        batch = []
        for j in range (i*batch_size, (i+1)*batch_size):
            if j > company_count-1:
                break
            else:
                url= f'{urls.iloc[j,0]}/people'
                #str2 is used to replace single quotes by double quotes (webscraper compatibility)
                company = str2(url)
                batch.append(company)
        #the script below was written to scrape people names, titles and profile_urls for a given company, via webscraper       
        script= f'{{"_id":"scraping","startUrl":{batch},"selectors":[\
                    {{"id":"container","type":"SelectorElementScroll","parentSelectors":["_root"],"selector":"div.org-people-profile-card__profile-info","multiple":true,"delay":"1234"}},\
                        {{"id":"name","type":"SelectorText","parentSelectors":["container"],"selector":"div.org-people-profile-card__profile-title","multiple":false,"regex":"","delay":0}},\
                        {{"id":"title","type":"SelectorText","parentSelectors":["container"],"selector":"div.lt-line-clamp--multi-line","multiple":false,"regex":"","delay":0}},\
                        {{"id":"profile","type":"SelectorLink","parentSelectors":["container"],"selector":"a.link-without-visited-state","multiple":false,"delay":0}}]}}'
        script.replace("\\", "")
        # The function outputs a string for each batch as a text file
        path = '../bpideep/scraping_data/scraping_scripts/'
        with open(path + f"{name}.txt", "w") as text_file:
            text_file.write(script)
        # Another output is the printed scripts, from which you can copy/paste in webscraper.
        print(name)
        print(script)
    return None

In [25]:
make_script_company_scraping(df,10)

script_batch_0
{"_id":"scraping","startUrl":["https://www.linkedin.com/company/verkor//people", "https://www.linkedin.com/company/angell/people//people", "https://www.linkedin.com/company/carester//people", "https://www.linkedin.com/company/carbios//people", "https://www.linkedin.com/company/mastergrid//people", "https://www.linkedin.com/company/epigene-labs//people", "https://www.linkedin.com/company/spacesense-ai//people", "https://www.linkedin.com/company/kraaft-co//people", "https://www.linkedin.com/company/pasqal//people", "https://www.linkedin.com/company/gourmey//people"],"selectors":[                    {"id":"container","type":"SelectorElementScroll","parentSelectors":["_root"],"selector":"div.org-people-profile-card__profile-info","multiple":true,"delay":"1234"},                        {"id":"name","type":"SelectorText","parentSelectors":["container"],"selector":"div.org-people-profile-card__profile-title","multiple":false,"regex":"","delay":0},                        {"id":"

## Scraping companies

These scripts will be needed to use web scraper:
https://chrome.google.com/webstore/detail/web-scraper-free-web-scra/jnhgnonknehpejjnehehllkliplmbmhn

**How to use webscraper to generate csv**
+ Install the extension
+ On Chrome: click vertical three points on top right> More tools> Developper tools
+ Then on right click > Inspect (on any page) you should see "Webscraper" as one of the tools
    + It is recommended to use "Dock to bottom" configuration in "Dock side" parameter for a better view
    
    
+ Click Webscraper > **Create New Site Map > Import Site Map**
+ Paste the description from the script obtained above in "Sitemap JSON"
    + Each script starts with { and ends with }
    + The different scripts are delimited by "script_batch_0", "script_batch_1"… you can only do one at a time
+ Give the script name (e.g. script_batch_0) in "Rename Sitemap" (it will be the name of the csv file you'll obtain)
+ Click "Import sitemap"
+ Click the "Sitemap (your chosen name)" menu
+ Click "Scrape"
+ Click "Start Scraping"
    + A browser window opens and loads the pages to scrape
    + you can keep working on other things meanwhile
    + You may occasionnaly be signed out from Linked In: just sign in again and reload scraping
    + You can hit the "refresh" button on the initial page to see already scraped data
    + You know it is finished when the new browser window closes
+ Once finished, 
    + click the "Sitemap (your chosen name)" menu again then "Import data as CSV"
    + click "Download now"
    + Chose folder `bpi_deep/scraping_data/companies_people/` (create it if needed, as it is not uploaded on github)
+ Then repeat from " Create New Site Map > Import Site Map" for the next script until all scripts have been covered
    + When a script has "startUrl":[ ] and no pop up window opens, it's that all requested companies have been coeverd

The csv containing the scraped data should be included in a folder `bpi_deep/scraping_data/companies_people/`
prior to calling the function "build employee_df"

## Building the companies dataframe

In [6]:
from bpideep.process_scraped_data import build_employee_df, process_employee_data

In [7]:
df_employees= process_employee_data(build_employee_df())

In [8]:
df_employees

Unnamed: 0,employee_name,title,profile-href,linkedin_url,technical,founder,phd
0,Agnès Mathé,responsable communication,https://www.linkedin.com/in/agn%C3%A8s-math%C3...,https://www.linkedin.com/company/carbios,0,0,0
1,Loic Zangara,vice-president france & operations,https://www.linkedin.com/in/loic-zangara-b8190...,https://www.linkedin.com/company/mastergrid,0,0,0
2,Gilles Stedile,superviseur chantier,https://www.linkedin.com/in/gilles-stedile-28b...,https://www.linkedin.com/company/mastergrid,0,0,0
3,,directeur technique,,https://www.linkedin.com/company/mastergrid,1,0,0
4,Meryl Merloz,purchaser,https://www.linkedin.com/in/merylmerloz/,https://www.linkedin.com/company/mastergrid,0,0,0
...,...,...,...,...,...,...,...
213,Sébastien FAURE,expert technique,https://www.linkedin.com/in/s%C3%A9bastien-fau...,https://www.linkedin.com/company/mastergrid,1,0,0
214,Gregory Arnal,researcher in enzyme engineering,https://www.linkedin.com/in/gregory-arnal-687a...,https://www.linkedin.com/company/carbios,1,0,0
215,Kaoutar Faiz,digital marketing manager,https://www.linkedin.com/in/kaoutar-faiz-22991...,https://www.linkedin.com/company/spacesense-ai,0,0,0
216,Olivier Dufour,co-founder - stakeholder engagement,https://www.linkedin.com/in/oldufour/,https://www.linkedin.com/company/verkor,0,1,0


## Script and scraping for employees

In [9]:
from bpideep.scraping_scripting import make_script_employee_scraping

Create the folder `bpideep/scraping_data/scraping_scripts`

In [12]:
make_script_employee_scraping(df_employees, 100, founders = True)

script_batch_0
{"_id":"profiles","startUrl":["https://www.linkedin.com/in/antoine-davydoff-35a569149/", "https://www.linkedin.com/in/alain-marty-40251539/", "https://www.linkedin.com/in/christophe-mille-506729/", "https://www.linkedin.com/in/nicolasmorinforest/", "https://www.linkedin.com/in/pauline-de-breteuil/", "https://www.linkedin.com/in/sylvainpaineau/", "https://www.linkedin.com/in/dekelpersi/", "https://www.linkedin.com/in/philippechain/", "https://www.linkedin.com/in/victor-sayous-a70190106/", "https://www.linkedin.com/in/matthieu-marquenet/", "https://www.linkedin.com/in/marc-negre-9548a58b/", "https://www.linkedin.com/in/fran%C3%A7ois-dechelette-357b481a/", "https://www.linkedin.com/in/eliott-raoult/", "https://www.linkedin.com/in/akpelinordor/", "https://www.linkedin.com/in/benoit-l-89772a2/", "https://www.linkedin.com/in/barriere/", "https://www.linkedin.com/in/christophe-jurczak/", "https://www.linkedin.com/in/sami-yacoubi-05902992/", "https://www.linkedin.com/in/martin-j

[["https://www.linkedin.com/in/antoine-davydoff-35a569149/",
  "https://www.linkedin.com/in/alain-marty-40251539/",
  "https://www.linkedin.com/in/christophe-mille-506729/",
  "https://www.linkedin.com/in/nicolasmorinforest/",
  "https://www.linkedin.com/in/pauline-de-breteuil/",
  "https://www.linkedin.com/in/sylvainpaineau/",
  "https://www.linkedin.com/in/dekelpersi/",
  "https://www.linkedin.com/in/philippechain/",
  "https://www.linkedin.com/in/victor-sayous-a70190106/",
  "https://www.linkedin.com/in/matthieu-marquenet/",
  "https://www.linkedin.com/in/marc-negre-9548a58b/",
  "https://www.linkedin.com/in/fran%C3%A7ois-dechelette-357b481a/",
  "https://www.linkedin.com/in/eliott-raoult/",
  "https://www.linkedin.com/in/akpelinordor/",
  "https://www.linkedin.com/in/benoit-l-89772a2/",
  "https://www.linkedin.com/in/barriere/",
  "https://www.linkedin.com/in/christophe-jurczak/",
  "https://www.linkedin.com/in/sami-yacoubi-05902992/",
  "https://www.linkedin.com/in/martin-j-stepha

The scripts are saved in the folder `bpideep/scraping_data/scraping_scripts`, you can open them with Sublime Text or another text editor.

Using the same process as before with Webscraper, scrape Employees using the scripts generated.
Save them in a folder: `bpi_deep/scraping_data/founders_files/`


## Building the employee dataframe

In [14]:
from bpideep.process_scraped_data import open_founder_profile_files, inline_profile, build_founders_dataframe, generate_founders_features

In [15]:
# Prior to calling the function "open_founder_profile_files", the csv containing the scrapped data from founders
# should be included in a folder 'bpi_deep/scraping_data/founders_files/'
df_founders_raw = open_founder_profile_files()

ValueError: No objects to concatenate

In [42]:
df_founders_raw.head(5)

Unnamed: 0,web-scraper-order,profile-href,title,company,institution,degree,field,exp_description,type,amount,text_content
0,1608043965-295,https://www.linkedin.com/in/eliott-raoult/,Company Name\n Melting Vote,,,,,,,,
1,1608044755-441,https://www.linkedin.com/in/christophe-mille-5...,Lithium-Ion Battery - Pilot Line Manager,CEA,,,,Pilot Line implementation: Dry & Clean Room Fa...,,,
2,1608044263-347,https://www.linkedin.com/in/nicolasmorinforest/,Co-Founder & CEO,GOURMEY,,,,On a mission to bring delicious cultured meat ...,,,
3,1608044114-323,https://www.linkedin.com/in/dekelpersi/,Board Observer,Codota,,,,,,,
4,1608044706-434,https://www.linkedin.com/in/pauline-de-breteuil/,,,ESCP Europe / ESCP-EAP,ESCP EUROPE,"General management : strategy, finance, marketing",,,,


In [43]:
#The function "build_founders_dataframe" processes the raw df and returns a df with one line per founder
#The function "generate_founders_features" generates the new relevant features such as "founder_has_phd" etc..
df_founders = generate_founders_features(build_founders_dataframe(df_founders_raw))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[f'{field}_{i+1}'] = subdf.loc[i, field]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[f'{field}_{i+1}'] = subdf.loc[i, field]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf[f'{field}_{i+1}'] = subdf.loc[i, field]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

In [45]:
df_founders.reset_index(inplace = True, drop = True)

In [44]:
df_founders.head(5)

Unnamed: 0,profile-href,title,company,exp_description,title_2,company_2,exp_description_2,title_3,company_3,exp_description_3,...,type_4,amount_4,text_content_4,type_5,amount_5,text_content_5,founder_has_phd,founder_from_institute,founder_pat_pub,technical_founder
0,https://www.linkedin.com/in/eliott-raoult/,Intern in charge of the web marketing,LE COMPTOIR DES SAVONNIERS PARIS,Environment : \nRestructuration of the company...,Creation of an e-commerce website,Vamana Sunglasses,E-commerce website that was selling sunglasses...,Vice-president and treasurer,L'Instant Start-Up,,...,,,,,,,0,0,0,0
1,https://www.linkedin.com/in/christophe-mille-5...,Lithium-Ion Battery - Pilot Line Manager,CEA,Pilot Line implementation: Dry & Clean Room Fa...,Battery Engineering,Apple,,Battery Cell Development,BMW,,...,,,,,,,0,1,0,1
2,https://www.linkedin.com/in/nicolasmorinforest/,Co-Founder & CEO,GOURMEY,On a mission to bring delicious cultured meat ...,Advisor,Agriculture Cellulaire France,Cellular Agriculture France\n(Unpaid advisor),International Product Manager,"L'Oréal / Vichy, Europe’s #1 pharmacy brand",Active Cosmetics Division - Vichy Internationa...,...,,,,,,,0,0,0,0
3,https://www.linkedin.com/in/dekelpersi/,Board Observer,Codota,,"Co Founder, Managing Partner",TPY Capital,Investing now from our second $100m venture fu...,Board Member,Signals Analytics,,...,,,,,,,0,0,0,0
4,https://www.linkedin.com/in/pauline-de-breteuil/,Investor,Lifen,Smart and secure medical exchange\nhttps://www...,Investor,Epigene Labs,Genomic data-driven drug design for precision ...,Board member,Sym Optic,"Sym developed the first clip-on, interchangeab...",...,,,,,,,0,0,0,0


In [45]:
# Finally, we merge founders to the full employee DF, update the feature ("technical"), and aggregate into companies
from bpideep.process_scraped_data import companies_technical_stats_with_founders_features, update_technical
df_employees_full = update_technical(df_employees, df_founders)
df_companies_stats_with_founders_features = companies_technical_stats_with_founders_features(df_employees_full)
df_companies_stats_with_founders_features

Unnamed: 0,linkedin_url,technical,phd_found_linkedin,employee__linkedin_count,founder_from_institute,founder_has_phd,founder_pat_pub,technical_founder
0,https://www.linkedin.com/company/carbios,0.315789,0,38,0.0,1.0,0.0,1.0
1,https://www.linkedin.com/company/epigene-labs,0.25,0,12,0.0,1.0,0.0,1.0
2,https://www.linkedin.com/company/gourmey,0.526316,4,19,2.0,1.0,0.0,2.0
3,https://www.linkedin.com/company/kraaft-co,0.142857,0,14,0.0,0.0,0.0,0.0
4,https://www.linkedin.com/company/mastergrid,0.183908,0,87,0.0,0.0,0.0,0.0
5,https://www.linkedin.com/company/pasqal,0.5,2,16,0.0,2.0,2.0,2.0
6,https://www.linkedin.com/company/spacesense-ai,0.307692,1,13,0.0,0.0,0.0,0.0
7,https://www.linkedin.com/company/verkor,0.333333,0,12,1.0,0.0,1.0,1.0


In [None]:
# Last optional step: merge the DF with new company features witht the dealroom df.
from bpideep.process_scraped_data import merge_initial_companies_with_founder
df_full = pd.read_csv('../bpideep/rawdata/data2020-12-03.csv')
final = merge_initial_companies_with_founder(df_full, df_companies_stats_with_founders_features)