In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import matplotlib.pyplot as plt

# Convert the json data in string (transformed by the pd.read_csv) to dict

In [3]:
def load_json_field(bad_json):
    regex = r"\w\'\w"
    subst = ""
    bad_json = re.sub(regex, subst, bad_json)
    bad_json = bad_json.replace("d' Arrouzat", "darrouzat")
    good_json = bad_json.replace("\'", "\"").replace("None", "null").replace("True", "true").replace("False", "false")
    
    return json.loads(good_json)

In [4]:
data = pd.read_csv("../bpideep/rawdata/data2020-12-03.csv")

#these are the columns for which load_json_field is needed
data["team"] = data["team"].apply(load_json_field)
data["industries"] = data["industries"].apply(load_json_field)
data["investors"] = data["investors"].apply(load_json_field)
data["fundings"] = data["fundings"].apply(load_json_field)

# Import data + data cleaning (imputing missing values with info retrieved from Linkedin or with our own method)

## DATA CLEANING

In [5]:
def impute_missing_launch_year(data):
    """ 
    function that replaces the missing launch year value
    for 33 companies which do not have growth_stage either
    """

    names = ['Amypore',
             'Kinnov Therapeutics',
             'Lipofabrik',
             'Step pharma',
             'LiMM Therapeutics',
             'Ilek',
             'LysPackaging',
             'TexiSense',
             "Institut de Prise en Charge de l'Obésité",
             'Izi Family',
             'Arthur Dupuy',
             'Gen.Orph',
             'Uniris',
             'NANOZ',
             'Akwatyx',
             'Black-line',
             'Eyye',
             "O'Sol",
             'Treenox',
             'Aqualeg',
             'Co-assit',
             'Wind my roof',
             'APPARTOO',
             'BimBamJob',
             'Buddytherobot.com',
             'Bcm',
             'CCI Paris Ile de France',
             'Datarocks',
             'EFFICIENCIA',
             'EONEF',
             'FEALINX',
             'INERIS',
             'Marguerite',
             'TokTokDoc',
             'Novaquark',
             'Peopeo',
             'Sloclap',
             'Swift',
             'Sword',
             'XT-VISION',
             'Ryax',
             'Sylha',
             'Opta LP']

    launch_years = [2018, 2015, 2012, 2014, 2018, 2016, 2015, 2010, 2011, 2016, 2015, 
                   2012, 2017, 2012, 2015, 2017, 2016, 2016, 2018, 2011, 2016, 2018, 2015, 2015, 2014, 2014, 2013,
                   2014, 2012, 2016, 1997, 1990, 2012, 2016, 2014, 2017, 2015, 1973, 2000, 2011, 2017, 2019, 2013]

    zipbObj = zip(names, launch_years)
    launch_year_dict = dict(zipbObj)

    data["launch_year_clean"] = data['launch_year']
    
    for name, year in launch_year_dict.items() : 
        data.loc[data.name == name, "launch_year_clean"] = year
    return data

def get_growth_dict(df):
    """ 
    function that creates a dictionary 
    giving the mode of the growth_stage
    according to the launch_year of all companies in the dataset
    """
    
    table = df[["growth_stage", "launch_year_clean", "id"]].groupby(by=["launch_year_clean", "growth_stage"]).count()
    column = list(table.unstack(level = -1).columns)
    growth_dict = table.unstack(level = -1).fillna(0).apply(lambda x: column[x.argmax()][1], axis = 1).to_dict()
    
    return growth_dict
    

def fill_missing_growth(growth_dict, growth_stage, launch_year_clean):
    """ 
    function that fills all the missing growth stage
    according to the company's launch year
    based on the dictionary created thanks to the function get_growth_dict() 
    """

    if type(growth_stage) == str:
        return growth_stage    
    elif launch_year_clean in growth_dict:
        return growth_dict[launch_year_clean]
    else:
        return growth_stage
    
    
def load_json_field(bad_json):
    """ 
    function that transforms the type of the cells 
    from string to dictionary as the pd.read_csv converts
    the dictionary to string (use it only in notebooks)
    """
    
    regex = r"\w\'\w"
    subst = ""
    bad_json = re.sub(regex, subst, bad_json)
    bad_json = bad_json.replace("d' Arrouzat", "darrouzat")
    good_json = bad_json.replace("\'", "\"").replace("None", "null").replace("True", "true").replace("False", "false")
    
    return json.loads(good_json)


def get_industries(x):
    '''
    function that extracts info from 'industries' column through mapping
    data['industries_list'] = data['industries'].map(lambda x: industries(x))
    '''
    industries_list = []
    
    industries = x.apply(load_json_field)

    for u in range(len(industries)):
        if len(industries[u]) > 0:
            industries_list.append(industries[u][0]['name'])
        else:
            industries_list.append("")
    return industries_list

def get_health(x):
    '''
    function that encodes :
    - 0 if "health" is not part of the industry list created by the function "get_industries"
    - 1 if "health" is part of the industry list 
    '''
    industries = get_industries(x) # list of industries

    health_industry = []
    
    for element in industries:
        if element == 'health':
            health_industry.append(1)
        else:
            health_industry.append(0)
    return health_industry

def investors_type(x) :
    '''
    function that extracts info from 'investors' column 
    '''
    investors_list = []
    investors = x
    if investors['total'] > 0 :
        for y in range(len(investors['items'])):
                investors_list.append(investors['items'][y]['type'])
    return investors_list


def fund_investors(x):
    """
    function that encodes :
    - 1 if the selected investors are part of the list created by the function investors_type()
    - 0 if not
    
    If needed, the selected investors can be modified. You can choose among the following :
        list_investor_type = ['fund',
                             'investor',
                             'corporate',
                             'government_nonprofit',
                             'service_provider',
                             'company',
                             'crowdfunding',
                             'workspace']
    """
    
    
    for row in range(len(x)):
        if "fund" in x["investors_type"][row] or "investors" in x["investors_type"][row] :
            x["investors_type"][row] = 1

        else :
            x["investors_type"][row] = 0
    return x




## GET_CLEAN_DATA

save the data accordingly :

- folder "rawdata" : dataset extracted from Dealroom
- folder "data" : missing_employee_count and technical profile info scraped from LinkedIn, patents dataset 

In [6]:
class GetCleanData():

    """
    GetData.get_data() is a function that returns a clean dataset which is already saved in the folder 'rawdata'
    (imputing missing values thanks to manual imputing, LinkedIn, and creating new features)
    """
    
    def get_clean_data():
        
        #1. Use the function Getfulldata to get an updated dataset from Dealroom 
        #2. Save the updated dataset in the folder 'rawdata'
        #3. Load the dataset (please rename the csv file below)
        data = pd.read_csv("../bpideep/rawdata/data2020-12-03_with_corrections.csv")
        
        #4. Select only the firms labeled 'deeptech' or 'non_deeptech'
        data = data[data.deep_or_not != "almost_deeptech"]

        
        #5. Select the needed columns
        data = data[["id", "name", "target", "deep_or_not", "total_funding_source", "employees",
                     "employees_latest", "launch_year", "growth_stage", "linkedin_url", "industries", "investors"]]


        #6. Drop 2 duplicated companies Lalilo and Pixyl (cf. the explanation below)
        data.drop(data[(data.id == 1787891) | (data.id == 1893232)].index, inplace = True)
        
        #7. Harmonizing the growth stage : change the "not meaningful" growth stage status of 15789 Insoft to mature
        data.loc[data.id == 15789, "growth_stage"] = "mature"
        
        #8. Imputing the missing launch year (only 33 done manually -> to be automatized if a new source displays it)
        impute_missing_launch_year(data)
        
        #9. Imputing missing growth_stage (with the mode of the launch year)
        growth_table = get_growth_dict(data)
        data['growth_stage_imputed'] = data.apply(lambda row: fill_missing_growth(
                growth_table,
                row['growth_stage'], 
                row['launch_year_clean']), 
            axis=1
        )
        
        #10. Imputing missing employees values from LinkedIn scraping
        missing = pd.read_csv("../bpideep/data/missing_employee_count.csv")
        data["employees_clean"] = data.employees_latest
        data.loc[data.name == "CCI Paris Ile de France", "employees_clean"] = 1793

        for url in missing.linkedin_url:
            replace_value = missing[missing.linkedin_url == url]["check"].iloc[0]
            data.loc[data.linkedin_url == url, "employees_clean"] = replace_value
        
        #11. Computing the age of companies
        data["age"] = 2020 - data.launch_year_clean
        
        
        #12. Get the number of patents (from Google Patents)
        patent = pd.read_csv("../bpideep/data/patents.csv")
        data = pd.merge(data, patent, on="id", how = "left")
        
        #13. Create a new feature "investors_type"
        data["investors"] = data["investors"].apply(load_json_field)
        data["investors_type"] = pd.DataFrame(data["investors"].apply(lambda row: investors_type(row)))
        data["investors_type"] = fund_investors(data[["investors_type"]])
        
        #14. Create a new feature "health_industry"  
        data["health_industry"] = pd.DataFrame(get_health(data["industries"]))
        
        #15. Get if the compagny has a doctor or no + proportion of technical among the employees (from Linkedin)
        #the column "No_people_input" tells the model whether the info was present on LinkedIn
        doctors = pd.read_csv("../bpideep/data/extra_features.csv").drop(columns = "Unnamed: 0")[["id", "company_has_phd"]]
        doctors2 = pd.read_csv("../bpideep/data/extra_features_v2.csv").drop(columns = "Unnamed: 0")[["id",
                                                                                                      "proportion_technical", 
                                                                                                      "founder_from_institute",
                                                                                                      "founder_has_phd"]]
        
        data = data.merge(doctors, on = "id", how = "left")
        data = data.merge(doctors2, on = "id", how = "left")
        
        data["No_people_input"] = 0
        data.loc[data.proportion_technical.isna(), "No_people_input"] = 1
        
        
        data[["proportion_technical", "founder_from_institute","founder_has_phd"]] = data[[
            "proportion_technical", "founder_from_institute","founder_has_phd"]].fillna(0)
        
        return data

#         *    
#                 + 3 duplicated names but with different id : 
#                     1/ Lalilo : 926521 (http://www.lalilo.com/) vs. 1787891 (http://lalilo.fr) 
#             -> same launch date, french website no longer exists + observation  almost filled by NAN + same obs as the .com 
#             -> drop the french Lalilo (1787891)
#                     2/ Pixyl : 892048 vs 1893232 (different websites mentioned but same website page when launched) 
#             -> kept  892048 because more info + the Dealroom profile was verified by Dealroom team on Sept, 1st 2020 vs. pending verification
#                     3/ NANOZ : 1836121 vs 1660543 -> kept both as different companies but the second one is German 

In [7]:
data = GetCleanData.get_clean_data()
X = data.drop(columns = ["target", "deep_or_not"])
y = data.target

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = fund_investors(data[["investors_type"]])


# PIPELINE

##### employees imputing  : age

In [8]:
import re
import math
from sklearn.base import BaseEstimator, TransformerMixin


def average_list(range_list):
    """
    function that returns the mean of employees 
    it is used to compute the value of the dictionary with the range as a key (column "employees")
    """
    return sum(range_list)/len(range_list)

def compute_employees_mean(data):
    """
    function that creates a dictionary 
    which is used for imputing missing employees number 
    according the employees range of the company if it is mentioned
    column "employees"
    """
    
    range_list = list(data.employees.unique())
    try:
        range_list.remove("n.a.")
    except:
        pass
    keys_list = []
    means_list = []

    for i in range(len(range_list)) :
        if type(range_list[i]) == str :
            temp = re.findall(r'\d+', range_list[i]) 
            res = list(map(int, temp)) 
            mean = average_list(res)
            means_list.append(mean)
            keys_list.append(range_list[i])
        else:
            pass

    zip_iterator = zip(keys_list, means_list)
    range_dict = dict(zip_iterator)
    
    return range_dict

def replace_employees(df):
    """
    function that replaces the missing employees value
    by the mean of the range indicated in the column "employees"
    """

    dictionary = compute_employees_mean(df)
    for key, value in dictionary.items():
        df.loc[(df.employees == key) & (df.employees_latest.isna()), "employees_clean"] = value
    return df
            
            
class EmployeeImputer(BaseEstimator, TransformerMixin):
    
    """
    Customized imputer : impute the missing values in the column "employees_latest" in 3 steps 
        1. impute the missing employees number by the mean of the range if the range is indicated (column "employees")
        2. for the companies launched after 2010 
            -> impute according to the median of the companies launched after 2010
        3. for the companies launched before 2010 or which do not have a launch year
            -> impute according to the median of the dataset 
    """
    
    def fit(self, X, y=None):
        
        # create the dictionary to impute the missing employees number according to the range mean
        data = X.copy()
        data = replace_employees(data)
        
        self.yg_median = data[["employees_clean", "launch_year_clean"]].groupby(by=["launch_year_clean"]).median()
        self.years = data.launch_year_clean.unique()
        self.years = [nb for nb in self.years if nb >= 2010]

        return self
    
    def transform(self, X, y=None):
        # 1. impute the missing employees number by the mean of the range if the range is indicated (column "employees")
        X = replace_employees(X)
        
        # 2. for the companies launched after 2010 : impute according to the median of the companies launched after 2010
        for year in self.years:
            replace_value = self.yg_median.loc[(year)][0]
            boolean_condition = ((X.launch_year_clean == year) & (X.employees_clean.isna()))
            X.loc[boolean_condition, "employees_clean"] = replace_value
        
        # 3. for the companies launched before 2010 or the company that has no launch year : impute according to the median of the dataset 
        median_all_dataset = X[X.launch_year_clean.notna()][["employees_clean"]].median()
        X.loc[X.employees_clean.isna(), "employees_clean"] = replace_value

        return X

##### Patent_transformer

In [9]:
from sklearn.pipeline import make_pipeline

patent_transformer = make_pipeline(
                                SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0),
                                RobustScaler())

##### Ratio_transformer  : funding / employees ratio

In [10]:
from sklearn.preprocessing import FunctionTransformer

funding_employees_ratio_constructor = FunctionTransformer(
    lambda df: pd.DataFrame(df["total_funding_source"] / df["employees_clean"]))


In [11]:
from sklearn.pipeline import Pipeline

ratio_transformer = Pipeline([
    ("imputer1", EmployeeImputer()),
    ("ratio", funding_employees_ratio_constructor),
    ("scaler",  RobustScaler())
])

##### Growth_transformer

###### Age imputer

In [12]:
age_imputer = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='mean'))

###### Growth stage Transformer (1-4)

In [13]:
dictionary = {'mature' : 4, 'late growth' : 3,'early growth' : 2, 'seed' : 1}


growth_stage_transformer = FunctionTransformer(
    lambda df: df[['growth_stage_imputed']].applymap(lambda x: dictionary[x]))


In [14]:
from sklearn.compose import ColumnTransformer

growth_stage_age_preparator = ColumnTransformer([
    ["age_imputer", age_imputer, ["age"]],
    ["growth_stage_transformer", growth_stage_transformer, ["growth_stage_imputed"]],
])

###### Growth_stage_age_ratio

In [15]:
def compute_growth_age_ratio(arr):
    arr[arr[:,0]== 0, 0] = 1
    return np.expand_dims(arr[:,1]/arr[:,0], -1)

growth_stage_age_ratio_constructor = FunctionTransformer(compute_growth_age_ratio)


In [16]:
growth_transformer = Pipeline([
    ["growth_stage_age_preparator", growth_stage_age_preparator], 
    ["growth_stage_age_ratio_constructor", growth_stage_age_ratio_constructor],
])

##### Health industry, investors_type, company_has_phd, proportion_technical, No_people_input, founder_from_institute, founder_has_phd

In [17]:
class NoTransformer(BaseEstimator, TransformerMixin):
    """
    Customized identity transformer
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    
No_Transformer = ColumnTransformer([
    ["health_notransformer", NoTransformer(), ["health_industry"]],
    ["fund_investor_notransformer", NoTransformer(), ["investors_type"]],
    ["doctor_notransformer", NoTransformer(), ["company_has_phd"]],
    ["technical_notransformer", NoTransformer(), ["proportion_technical"]],
    ["no_people_input_notransformer", NoTransformer(), ["No_people_input"]],
    ["founder_from_institute_notransformer", NoTransformer(), ["founder_from_institute"]],
    ["founder_has_phd_notransformer", NoTransformer(), ["founder_has_phd"]],
])


##### Preprocessor

In [18]:
from sklearn import set_config; set_config(display='diagram')
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ("growth_transformer", growth_transformer, ["growth_stage_imputed", "age"]),
    ("ratio_transformer", ratio_transformer, ["employees_clean", "employees", "launch_year_clean", "employees_latest", "total_funding_source"]),
    ("patent_transformer", patent_transformer, ["nb_patents"]),
    ("identity", No_Transformer, ["health_industry", 
                                 "investors_type", 
                                 "company_has_phd", 
                                 "proportion_technical", 
                                 "No_people_input",
                                "founder_from_institute",
                                "founder_has_phd",
                                ]),
    ])

##### Pipeline

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from  sklearn.ensemble import RandomForestClassifier

pipemodel = Pipeline(steps=[
                            ('features', preprocessor),
                            ('model', RandomForestClassifier(n_jobs = -1))
                             ])
pipemodel

# RUN MODEL

In [20]:
data = GetCleanData.get_clean_data()
X = data.drop(columns = ["target", "deep_or_not"])
y = data.target

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = fund_investors(data[["investors_type"]])


In [21]:
pipemodel.fit(X, y)

In [22]:
from sklearn.model_selection import cross_validate

# Cross validate pipeline
cv_result = cross_validate(pipemodel, X, y, cv=10, scoring=["accuracy", "recall", "f1", "precision"])

In [23]:
scoring=["test_accuracy", "test_recall", "test_f1", "test_precision"]

for metric in scoring:
    print(f"{metric} : {cv_result[metric].mean() * 100 : .2f}%")

test_accuracy :  74.17%
test_recall :  71.59%
test_f1 :  72.76%
test_precision :  74.46%


## CROSS VAL

##### y_pred1

In [28]:
data = GetCleanData.get_clean_data()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = fund_investors(data[["investors_type"]])


In [29]:
data.shape

(1332, 24)

In [30]:
data.growth_stage_imputed.isnull().sum()

0

In [31]:
X_train = pd.read_csv("../bpideep/data/X_train.csv")
y_train = pd.read_csv("../bpideep/data/y_train.csv")

X_test = pd.read_csv("../bpideep/data/X_test.csv")
y_test = pd.read_csv("../bpideep/data/y_test.csv")


X_train = X_train[['id']]
X_test = X_test[["id"]]

In [32]:
print(X_train.shape, X_test.shape)

(892, 1) (440, 1)


In [33]:
train_data = X_train.merge(data, on= "id", how = "left")
X_train = train_data.drop(columns = ["target", "deep_or_not"])
y_train = train_data["target"]


test_data = X_test.merge(data, on= "id", how = "left")
X_test = test_data.drop(columns = ["target", "deep_or_not"])
y_test = test_data["target"]

In [34]:
pipemodel.fit(X_train, y_train)

In [35]:
y_pred1 = pipemodel.predict_proba(X_test)
y_pred1

array([[0.67      , 0.33      ],
       [0.09      , 0.91      ],
       [0.28      , 0.72      ],
       [0.45      , 0.55      ],
       [0.685     , 0.315     ],
       [0.66133333, 0.33866667],
       [0.64      , 0.36      ],
       [0.74583333, 0.25416667],
       [0.82      , 0.18      ],
       [0.88      , 0.12      ],
       [0.4       , 0.6       ],
       [0.09      , 0.91      ],
       [0.26      , 0.74      ],
       [0.96366667, 0.03633333],
       [0.98      , 0.02      ],
       [0.1525    , 0.8475    ],
       [0.08      , 0.92      ],
       [0.964     , 0.036     ],
       [0.43448665, 0.56551335],
       [0.22      , 0.78      ],
       [0.42      , 0.58      ],
       [0.06      , 0.94      ],
       [0.45      , 0.55      ],
       [0.86      , 0.14      ],
       [0.87      , 0.13      ],
       [0.48      , 0.52      ],
       [0.45      , 0.55      ],
       [0.85      , 0.15      ],
       [0.505     , 0.495     ],
       [0.63933333, 0.36066667],
       [0.

In [36]:
y_pred1.shape

(440, 2)

In [37]:
y_test.shape

(440,)

In [None]:
pd.DataFrame(y_pred1).to_csv("y_pred1.csv")

##### y_pred2

In [None]:
data2 = GetCleanData.get_clean_data()

In [None]:
X_train2 = pd.read_csv("../bpideep/data/X_train2.csv")
y_train2 = pd.read_csv("../bpideep/data/y_train2.csv")

X_test2 = pd.read_csv("../bpideep/data/X_test2.csv")
y_test2 = pd.read_csv("../bpideep/data/y_test2.csv")


X_train2 = X_train2[['id']]
X_test2 = X_test2[["id"]]

In [None]:
train_data2 = X_train2.merge(data2, on= "id", how = "left")
X_train2 = train_data2.drop(columns = ["target", "deep_or_not"])
y_train2 = train_data2["target"]


test_data2 = X_test2.merge(data2, on= "id", how = "left")
X_test2 = test_data2.drop(columns = ["target", "deep_or_not"])
y_test2 = test_data2["target"]

In [None]:
pipemodel.fit(X_train2, y_train2)
y_pred2 = pipemodel.predict_proba(X_test2)
y_pred2 = pd.DataFrame(y_pred2)
y_pred2.to_csv("y_pred2.csv")

In [None]:
y_pred2.shape

##### y_pred3

In [None]:
data3 = GetCleanData.get_clean_data()

In [None]:
X_train3 = pd.read_csv("../bpideep/data/X_train3.csv")
y_train3 = pd.read_csv("../bpideep/data/y_train3.csv")

X_test3 = pd.read_csv("../bpideep/data/X_test3.csv")
y_test3 = pd.read_csv("../bpideep/data/y_test3.csv")


X_train3 = X_train3[['id']]
X_test3 = X_test3[["id"]]

In [None]:
train_data3 = X_train3.merge(data3, on= "id", how = "left")
X_train3 = train_data3.drop(columns = ["target", "deep_or_not"])
y_train3 = train_data3["target"]


test_data3 = X_test3.merge(data3, on= "id", how = "left")
X_test3 = test_data3.drop(columns = ["target", "deep_or_not"])
y_test3 = test_data3["target"]

In [None]:
pipemodel.fit(X_train3, y_train3)
y_pred3 = pipemodel.predict_proba(X_test3)
y_pred3 = pd.DataFrame(y_pred3)
y_pred3.to_csv("y_pred3.csv")
y_pred3.shape

# Test import stt

In [38]:
! pip install -e ..

Obtaining file:///Users/catherinechen/code/bpifrance_deeptech_analysis




Installing collected packages: bpideep
  Attempting uninstall: bpideep
    Found existing installation: bpideep 1.0
    Uninstalling bpideep-1.0:
      Successfully uninstalled bpideep-1.0
  Running setup.py develop for bpideep
Successfully installed bpideep


In [56]:
from bpideep import GetCleanData
data = GetCleanData.get_clean_data()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = pd.DataFrame(data["investors"].apply(lambda row: investors_type(row)))


In [61]:
data.head()

Unnamed: 0,id,name,target,deep_or_not,total_funding_source,employees,employees_latest,launch_year,growth_stage,linkedin_url,...,employees_clean,age,nb_patents,investors_type,health_industry,company_has_phd,proportion_technical,founder_from_institute,founder_has_phd,No_people_input
0,1742681,Healthcardionexion.com,1.0,deeptech,0,2-10,8.0,2015.0,seed,https://www.linkedin.com/company/--health,...,8.0,5.0,,0,1,0,0.0,0.0,0.0,1
1,1743314,4P Pharma,1.0,deeptech,0,11-50,12.0,2014.0,early growth,https://www.linkedin.com/company/4p-pharma,...,12.0,6.0,7.0,0,1,1,0.7,0.0,1.0,0
2,1598607,AFYREN,1.0,deeptech,21000000,11-50,31.0,2012.0,early growth,https://www.linkedin.com/company/9419299,...,31.0,8.0,84.0,1,0,1,0.3,1.0,2.0,0
3,894817,Abbelight,1.0,deeptech,3250000,11-50,20.0,2016.0,early growth,https://www.linkedin.com/company/abbelight,...,20.0,4.0,,1,1,1,0.764706,2.0,2.0,0
4,1683057,AblaCare,1.0,deeptech,10000000,2-10,4.0,2017.0,seed,https://www.linkedin.com/company/ablacare,...,4.0,3.0,,1,1,0,0.0,0.0,0.0,0


In [67]:
X = data.drop(columns = ["target", "deep_or_not"])

In [70]:
from bpideep.EmployeeImputer import EmployeeImputer

e = EmployeeImputer()

In [74]:
e.fit_transform(X)

(1332, 22)


Unnamed: 0,id,name,total_funding_source,employees,employees_latest,launch_year,growth_stage,linkedin_url,industries,investors,...,employees_clean,age,nb_patents,investors_type,health_industry,company_has_phd,proportion_technical,founder_from_institute,founder_has_phd,No_people_input
0,1742681,Healthcardionexion.com,0,2-10,8.0,2015.0,seed,https://www.linkedin.com/company/--health,"[{'id': 1254, 'name': 'health'}]","{'items': [], 'total': 0}",...,8.0,5.0,,0,1,0,0.000000,0.0,0.0,1
1,1743314,4P Pharma,0,11-50,12.0,2014.0,early growth,https://www.linkedin.com/company/4p-pharma,"[{'id': 1254, 'name': 'health'}, {'id': 1264, ...","{'items': [], 'total': 0}",...,12.0,6.0,7.0,0,1,1,0.700000,0.0,1.0,0
2,1598607,AFYREN,21000000,11-50,31.0,2012.0,early growth,https://www.linkedin.com/company/9419299,"[{'id': 100023, 'name': 'energy'}]","{'items': [{'id': 17581, 'name': 'Sofinnova Pa...",...,31.0,8.0,84.0,1,0,1,0.300000,1.0,2.0,0
3,894817,Abbelight,3250000,11-50,20.0,2016.0,early growth,https://www.linkedin.com/company/abbelight,"[{'id': 1254, 'name': 'health'}]","{'items': [{'id': 885471, 'name': 'Agoranov', ...",...,20.0,4.0,,1,1,1,0.764706,2.0,2.0,0
4,1683057,AblaCare,10000000,2-10,4.0,2017.0,seed,https://www.linkedin.com/company/ablacare,"[{'id': 1254, 'name': 'health'}]","{'items': [{'id': 1237998, 'name': 'Sofinnova ...",...,4.0,3.0,,1,1,0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1327,1448492,Zaack,1000000,2-10,10.0,2017.0,seed,https://www.linkedin.com/company/zaack,[],"{'items': [{'id': 1448490, 'name': 'IGIENAIR',...",...,10.0,3.0,,0,0,0,0.200000,0.0,0.0,0
1328,869196,Zelup,1000000,2-10,4.0,2012.0,seed,https://www.linkedin.com/company/zelup-sas/,"[{'id': 100023, 'name': 'energy'}]","{'items': [], 'total': 0}",...,4.0,8.0,5.0,0,0,0,0.000000,0.0,0.0,0
1329,1528826,Zenpark,16400000,51-200,62.0,2011.0,late growth,https://www.linkedin.com/company/zenpark,"[{'id': 100111, 'name': 'transportation'}]","{'items': [{'id': 20628, 'name': 'Demeter Part...",...,62.0,9.0,3.0,1,0,1,0.176471,0.0,1.0,0
1330,1483225,ZestMeUp,800000,11-50,36.0,2015.0,early growth,https://www.linkedin.com/company/zestmeup,[],"{'items': [], 'total': 0}",...,36.0,5.0,,0,0,1,0.096774,0.0,0.0,0


In [82]:
from bpideep.New_Trainer import NewTrainer

X = data.drop(columns = ["target", "deep_or_not"])
y = data["target"]

t = NewTrainer(X, y)
t.train()

(1332, 5)


In [85]:
t.pipeline.predict(X)

array([1., 1., 1., ..., 0., 0., 1.])