In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import matplotlib.pyplot as plt

# Convert the json data in string (transformed by the pd.read_csv) to string 

In [2]:
def load_json_field(bad_json):
    regex = r"\w\'\w"
    subst = ""
    bad_json = re.sub(regex, subst, bad_json)
    bad_json = bad_json.replace("d' Arrouzat", "darrouzat")
    good_json = bad_json.replace("\'", "\"").replace("None", "null").replace("True", "true").replace("False", "false")
    
    return json.loads(good_json)

In [3]:
data = pd.read_csv("../bpideep/rawdata/data2020-12-03.csv")
data["team"] = data["team"].apply(load_json_field)
data["industries"] = data["industries"].apply(load_json_field)
data["investors"] = data["investors"].apply(load_json_field)
data["fundings"] = data["fundings"].apply(load_json_field)

# Import data + data cleaning thanks to info retrieved from Linkedin

## DATA CLEANING

In [14]:
def impute_missing_launch_year(data):

    names = ['Amypore',
             'Kinnov Therapeutics',
             'Lipofabrik',
             'Step pharma',
             'LiMM Therapeutics',
             'Ilek',
             'LysPackaging',
             'TexiSense',
             "Institut de Prise en Charge de l'Obésité",
             'Izi Family',
             'Arthur Dupuy',
             'Gen.Orph',
             'Uniris',
             'NANOZ',
             'Akwatyx',
             'Black-line',
             'Eyye',
             "O'Sol",
             'Treenox',
             'Aqualeg',
             'Co-assit',
             'Wind my roof',
             'APPARTOO',
             'BimBamJob',
             'Buddytherobot.com',
             'Bcm',
             'CCI Paris Ile de France',
             'Datarocks',
             'EFFICIENCIA',
             'EONEF',
             'FEALINX',
             'INERIS',
             'Marguerite',
             'TokTokDoc',
             'Novaquark',
             'Peopeo',
             'Sloclap',
             'Swift',
             'Sword',
             'XT-VISION',
             'Ryax',
             'Sylha',
             'Opta LP']

    # inserting missing launch year and month for the 33 companies with fundraising
    launch_years = [2018, 2015, 2012, 2014, 2018, 2016, 2015, 2010, 2011, 2016, 2015, 
                   2012, 2017, 2012, 2015, 2017, 2016, 2016, 2018, 2011, 2016, 2018, 2015, 2015, 2014, 2014, 2013,
                   2014, 2012, 2016, 1997, 1990, 2012, 2016, 2014, 2017, 2015, 1973, 2000, 2011, 2017, 2019, 2013]

    zipbObj = zip(names, launch_years)
    launch_year_dict = dict(zipbObj)

    data["launch_year_clean"] = data['launch_year']
    
    for name, year in launch_year_dict.items() : 
        data.loc[data.name == name, "launch_year_clean"] = year
    return data

def get_growth_dict(df):
    table = df[["growth_stage", "launch_year_clean", "id"]].groupby(by=["launch_year_clean", "growth_stage"]).count()
    column = list(table.unstack(level = -1).columns)
    growth_dict = table.unstack(level = -1).fillna(0).apply(lambda x: column[x.argmax()][1], axis = 1).to_dict()
    
    return growth_dict
    

def fill_missing_growth(growth_dict, growth_stage, launch_year_clean):
   
    if type(growth_stage) == str:
        return growth_stage    
    elif launch_year_clean in growth_dict:
        return growth_dict[launch_year_clean]
    else:
        return growth_stage
    
    
def load_json_field(bad_json):
    regex = r"\w\'\w"
    subst = ""
    bad_json = re.sub(regex, subst, bad_json)
    bad_json = bad_json.replace("d' Arrouzat", "darrouzat")
    good_json = bad_json.replace("\'", "\"").replace("None", "null").replace("True", "true").replace("False", "false")
    
    return json.loads(good_json)


def get_industries(x):
    '''
    function that extracts info from 'industries' column through mapping
    data['industries_list'] = data['industries'].map(lambda x: industries(x))
    '''
    industries_list = []
    
    industries = x.apply(load_json_field)

    for u in range(len(industries)):
        if len(industries[u]) > 0:
            industries_list.append(industries[u][0]['name'])
        else:
            industries_list.append("")
    return industries_list

def get_health(x):
    
    industries = get_industries(x) # list of industries

    health_industry = []
    
    for element in industries:
        if element == 'health':
            health_industry.append(1)
        else:
            health_industry.append(0)
    return health_industry

def investors_type(x) :
    investors_list = []
    investors = x
    if investors['total'] > 0 :
        for y in range(len(investors['items'])):
                investors_list.append(investors['items'][y]['type'])
    return investors_list

def fund_investors(x):

    for row in range(len(x)):
        if "fund" in x["investors_type"][row] or "investors" in x["investors_type"][row] :
            x["investors_type"][row] = 1

        else :
            x["investors_type"][row] = 0
    return x


list_investor_type = ['fund',
 'investor',
 'corporate',
 'government_nonprofit',
 'service_provider',
 'company',
 'crowdfunding',
 'workspace']

In [6]:
doctors = pd.read_csv("../bpideep/data/extra_features.csv").drop(columns = "Unnamed: 0")
doctors.head()

Unnamed: 0,id,name,company_has_phd,proportion_technical,founder_from_institute,founder_has_phd,founder_pat_pub,No_people_input
0,1742681,Healthcardionexion.com,0,,,,,1
1,1743314,4P Pharma,1,0.7,0.0,1.0,0.0,0
2,1598607,AFYREN,1,0.3,0.0,2.0,1.0,0
3,894817,Abbelight,1,0.764706,1.0,2.0,1.0,0
4,1683057,AblaCare,0,0.0,0.0,0.0,0.0,0


In [7]:
doctors[["company_has_phd"]].value_counts()

company_has_phd
0                  1080
1                   433
dtype: int64

In [8]:
tmp = pd.read_csv("../bpideep/rawdata/data2020-12-03.csv")

In [10]:
tmp_doctors = tmp.merge(doctors, left_on= "id", right_on= "id")

## TEST cleaning

In [None]:

tmp = pd.read_csv("../bpideep/rawdata/data2020-12-03.csv")
tmp.drop(tmp[(tmp.id == 1787891) | (tmp.id == 1893232)].index, inplace = True)

tmp[tmp.growth_stage.isna()].shape


In [None]:
impute_missing_launch_year(tmp)

In [None]:
tmp['growth_stage_imputed'] = tmp.apply(lambda row: fill_missing_growth(dico,
                row['growth_stage'], 
                row['launch_year_clean']), 
            axis=1
        )
        


tmp[(tmp.growth_stage_imputed.isna())].to_csv("missing_launchyear_growthstage.csv")

In [None]:
missing_list = list(tmp[(tmp.growth_stage_imputed.isna())]["name"])

In [None]:
missing_list

In [None]:
missing_years = [2012, 2017, 2012, 2015, 2017, 2016, 2016, 2018, 2011, 2016, 2018, 2015, 2015, 2014, 2014, 2013]

In [None]:
tmp[(tmp.growth_stage_imputed.isna())]

In [None]:
tmp['growth_stage_imputed']

## GET_DATA

In [20]:
class GetData():

    def get_data():
        data = pd.read_csv("../bpideep/rawdata/data2020-12-03.csv")

        
        #selection of need columns
        data = data[["id", "name", "target", "deep_or_not", "total_funding_source", "employees",
                     "employees_latest", "launch_year", "growth_stage", "linkedin_url", "industries", "investors"]]

#         data["industries"] = data["industries"].apply(load_json_field)
        data["investors"] = data["investors"].apply(load_json_field)
        data["investors_type"] = pd.DataFrame(data["investors"].apply(lambda row: investors_type(row)))
        data["investors_type"] = fund_investors(data[["investors_type"]])
        
        #Health industry detection 
        data["health_industry"] = pd.DataFrame(get_health(data["industries"]))
    
        #imputing the missing launch year (only 33 done manually -> to be automatized if a new source displays it)
        #LAUNCH_YEAR_CLEAN
        impute_missing_launch_year(data)
        
        
        
        # drop 2 duplicated companies Lalilo and Pixyl*
        data.drop(data[(data.id == 1787891) | (data.id == 1893232)].index, inplace = True)
        
        # change the "not meaningful" growth stage status of 15789 Insoft to mature
        data.loc[data.id == 15789, "growth_stage"] = "mature"
        
        
        #imputing missing growth_stage (mode of the launch year)
        #GROWTH_STAGE_IMPUTED
        growth_table = get_growth_dict(data)
        
#         import ipdb ; ipdb.set_trace()
        data['growth_stage_imputed'] = data.apply(lambda row: fill_missing_growth(
                growth_table,
                row['growth_stage'], 
                row['launch_year_clean']), 
            axis=1
        )
        
        
        
        #imputing missing employees values from LinkedIn scraping
        #EMPLOYEES_CLEAN
        missing = pd.read_csv("../bpideep/data/missing_employee_count.csv")
        data["employees_clean"] = data.employees_latest
        data.loc[data.name == "CCI Paris Ile de France", "employees_clean"] = 1793

        for url in missing.linkedin_url:
            replace_value = missing[missing.linkedin_url == url]["check"].iloc[0]
            data.loc[data.linkedin_url == url, "employees_clean"] = replace_value
        
        #computing the age of companies
        data["age"] = 2020 - data.launch_year_clean
        
        
        # get the number of patents
        patent = pd.read_csv("../bpideep/data/patents.csv")
        data = pd.merge(data, patent, on= ["id", "id"], how = "left")
        
        
        #get if the compagny has a doctor or no (from Linkedin)
        doctors = pd.read_csv("../bpideep/data/extra_features.csv").drop(columns = "Unnamed: 0")[["id", "company_has_phd"]]
        data = data.merge(doctors, left_on= "id", right_on= "id")
        
        return data

#         *    
#                 + 3 duplicated names but with different id : 
#                     1/ Lalilo : 926521 (http://www.lalilo.com/) vs. 1787891 (http://lalilo.fr) 
#             -> same launch date, french website no longer exists + observation  almost filled by NAN + same obs as the .com 
#             -> drop the french Lalilo (1787891)
#                     2/ Pixyl : 892048 vs 1893232 (different websites mentioned but same website page when launched) 
#             -> kept  892048 because more info + the Dealroom profile was verified by Dealroom team on Sept, 1st 2020 vs. pending verification
#                     3/ NANOZ : 1836121 vs 1660543 -> kept both as different companies but the second one is German 

In [21]:
data = GetData.get_data()
X = data.drop(columns = ["target", "deep_or_not"])
y = data.target
y[y ==0.5] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = fund_investors(data[["investors_type"]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y ==0.5] = 0


In [22]:
X.head()

Unnamed: 0,id,name,total_funding_source,employees,employees_latest,launch_year,growth_stage,linkedin_url,industries,investors,investors_type,health_industry,launch_year_clean,growth_stage_imputed,employees_clean,age,nb_patents,company_has_phd
0,1742681,Healthcardionexion.com,0,2-10,8.0,2015.0,seed,https://www.linkedin.com/company/--health,"[{'id': 1254, 'name': 'health'}]","{'items': [], 'total': 0}",0,1,2015.0,seed,8.0,5.0,,0
1,1743314,4P Pharma,0,11-50,12.0,2014.0,early growth,https://www.linkedin.com/company/4p-pharma,"[{'id': 1254, 'name': 'health'}, {'id': 1264, ...","{'items': [], 'total': 0}",0,1,2014.0,early growth,12.0,6.0,7.0,1
2,1598607,AFYREN,21000000,11-50,31.0,2012.0,early growth,https://www.linkedin.com/company/9419299,"[{'id': 100023, 'name': 'energy'}]","{'items': [{'id': 17581, 'name': 'Sofinnova Pa...",1,0,2012.0,early growth,31.0,8.0,84.0,1
3,894817,Abbelight,3250000,11-50,20.0,2016.0,early growth,https://www.linkedin.com/company/abbelight,"[{'id': 1254, 'name': 'health'}]","{'items': [{'id': 885471, 'name': 'Agoranov', ...",1,1,2016.0,early growth,20.0,4.0,,1
4,1683057,AblaCare,10000000,2-10,4.0,2017.0,seed,https://www.linkedin.com/company/ablacare,"[{'id': 1254, 'name': 'health'}]","{'items': [{'id': 1237998, 'name': 'Sofinnova ...",1,1,2017.0,seed,4.0,3.0,,0


## TEST

In [None]:
data = GetData.get_data()

In [None]:
data.shape

In [None]:
data.columns

In [None]:
data[(data.employees.notna()) & (data.employees_clean.isna())].shape

présence de math.nan ?

In [None]:
data[(data.employees.isna()) & (data.employees_clean.isna())]

In [None]:
res = replace_employees(data)

In [None]:
res[res.employees_imputed.isna()][["name", "employees_latest", "employees", "employees_clean", "employees_imputed", "launch_year_clean"]].groupby(by = "launch_year_clean").count()

In [None]:
res[(res.employees_imputed.isna()) & (res.launch_year_clean < 2010)][["name", "employees_latest", "employees", "employees_clean", "employees_imputed", "launch_year_clean"]].groupby(by = "launch_year_clean").count()

In [None]:
yg_median = res[["employees_imputed", "launch_year_clean"]].groupby(by=["launch_year_clean"]).median()
years = data.launch_year_clean.unique()
years = [nb for nb in years if nb >= 2010]

print(yg_median)


for year in years:
    replace_value = yg_median.loc[(year)][0]
    boolean_condition = ((res.launch_year_clean == year) & (res.employees_imputed.isna()))
    res.loc[boolean_condition, "employees_imputed"] = replace_value


In [None]:
res[res.employees_imputed.isna()][["name", "employees_latest", "employees", "employees_clean", "employees_imputed", "launch_year_clean"]].groupby(by = "launch_year_clean").count()

In [None]:
#median_bef_2010 
res[res.launch_year_clean < 2010][["employees_imputed"]].median()

In [None]:
res[res.launch_year_clean.notna()][["employees_imputed"]].median()

# PIPELINE

##### employees imputing  : age

In [25]:
import re
import math
from sklearn.base import BaseEstimator, TransformerMixin


def average_list(range_list):
    return sum(range_list)/len(range_list)

def compute_employees_mean(data):
    range_list = list(data.employees.unique())
    try:
        range_list.remove("n.a.")
    except:
        pass
    keys_list = []
    means_list = []

    for i in range(len(range_list)) :
        if type(range_list[i]) == str :
            temp = re.findall(r'\d+', range_list[i]) 
            res = list(map(int, temp)) 
            mean = average_list(res)
            means_list.append(mean)
            keys_list.append(range_list[i])
        else:
            pass

    zip_iterator = zip(keys_list, means_list)
    range_dict = dict(zip_iterator)
    
    return range_dict

def replace_employees(df):

    dictionary = compute_employees_mean(df)
    for key, value in dictionary.items():
        df.loc[(df.employees == key) & (df.employees_latest.isna()), "employees_clean"] = value
    return df
            
            
class EmployeeImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        
        # impute according to the range mean
        data = X.copy()
        data = replace_employees(data)

        
        
        self.yg_median = data[["employees_clean", "launch_year_clean"]].groupby(by=["launch_year_clean"]).median()
        self.years = data.launch_year_clean.unique()
        self.years = [nb for nb in self.years if nb >= 2010]

        return self
    
    def transform(self, X, y=None):
        X = replace_employees(X)
        
        # impute according to the median of the companies launched after 2010
        for year in self.years:
            replace_value = self.yg_median.loc[(year)][0]
            boolean_condition = ((X.launch_year_clean == year) & (X.employees_clean.isna()))
            X.loc[boolean_condition, "employees_clean"] = replace_value
        
        
        median_all_dataset = X[X.launch_year_clean.notna()][["employees_clean"]].median()
        X.loc[X.employees_clean.isna(), "employees_clean"] = replace_value

        return X
    
    



##### Patent_transformer

In [28]:
from sklearn.pipeline import make_pipeline

patent_transformer = make_pipeline(
                                SimpleImputer(missing_values=np.nan, strategy='constant', fill_value = 0),
                                RobustScaler())
    
### ROBUSTSCALER or log(1+x) ??

##### Ratio_transformer  : funding / employees ratio

In [29]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

funding_employees_ratio_constructor = FunctionTransformer(
    lambda df: pd.DataFrame(df["total_funding_source"] / df["employees_clean"]))


In [30]:
from sklearn.pipeline import Pipeline

ratio_transformer = Pipeline([
    ("imputer1", EmployeeImputer()),
    ("ratio", funding_employees_ratio_constructor),
    ("scaler",  RobustScaler())
])

##### Growth_transformer

###### Age imputer

In [31]:
age_imputer = make_pipeline(SimpleImputer(missing_values=np.nan, strategy='mean'))

###### Growth stage Transformer (1-4)

In [32]:
dictionary = {'mature' : 4, 'late growth' : 3,'early growth' : 2, 'seed' : 1}


growth_stage_transformer = FunctionTransformer(
    lambda df: df[['growth_stage_imputed']].applymap(lambda x: dictionary[x]))


In [33]:
from sklearn.compose import ColumnTransformer

growth_stage_age_preparator = ColumnTransformer([
    ["age_imputer", age_imputer, ["age"]],
    ["growth_stage_transformer", growth_stage_transformer, ["growth_stage_imputed"]],
])

###### Growth_stage_age_ratio

In [34]:
def compute_growth_age_ratio(arr):
#     import ipdb ; ipdb.set_trace()
    arr[arr[:,0]== 0, 0] = 1
    return np.expand_dims(arr[:,1]/arr[:,0], -1)

growth_stage_age_ratio_constructor = FunctionTransformer(compute_growth_age_ratio)


In [35]:
growth_transformer = Pipeline([
    ["growth_stage_age_preparator", growth_stage_age_preparator], 
    ["growth_stage_age_ratio_constructor", growth_stage_age_ratio_constructor],
])

##### Health industry 

In [36]:
class NoTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

NoTransformer = ColumnTransformer([
    ["health_notransformer", NoTransformer(), ["health_industry"]],
    ["fund_investor_notransformer", NoTransformer(), ["investors_type"]],
    ["doctor_notransformer", NoTransformer(), ["company_has_phd"]]
])

##### Fund in investors type

In [37]:

# or "crowdfunding" in X[row] or "corporate" in X[row] or 'government_nonprofit' in X[row]

# def fund_in_investor(arr):
# #     import ipdb ; ipdb.set_trace()
#     arr["fund" in arr[:,0], 0] = 1
#     arr["investors" in arr[:,0], 0] = 1
#     arr[arr[:,0] != 1, 0] = 0
#     return arr



# def fund_in_investor(arr):
#     arr["fund" in arr[:,0], 0] = 1
#     return arr


# NoTransformer2 = ColumnTransformer([
#     ["fund_investor_transformer"], NoTransformer(), ["investors_type"]]
# ])

# fund_in_type_constructor = FunctionTransformer(
#     lambda df: df[["investors_type"]].apply(fund_in_investor)
#     )

##### company_has_phd

NoTransformer (cf. above)

old["investors_type"] = pd.DataFrame(old["investors"].apply(lambda row: investors_type(row)))

# fund_in_type_constructor.fit_transform(old[["investors_type"]]).value_counts()


#output : 
#1    971
#0    543
fund_in_type_constructor.fit_transform(old["investors_type"])

##### Preprocessor

In [39]:
from sklearn import set_config; set_config(display='diagram')
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ("growth_transformer", growth_transformer, ["growth_stage_imputed", "age"]),
    ("ratio_transformer", ratio_transformer, ["employees_clean", "employees", "launch_year_clean", "employees_latest", "total_funding_source"]),
    ("patent_transformer", patent_transformer, ["nb_patents"]),
    ("identity", NoTransformer, ["health_industry", "investors_type", "company_has_phd"]),
    ])

##### Pipeline

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipemodel = Pipeline(steps=[
                            ('features', preprocessor),
                            ('model', LogisticRegression(C = 1.52, penalty = 'l1', solver= 'liblinear'))
                             ])
pipemodel

##### RUN MODEL

In [41]:
data = GetData.get_data()
X = data.drop(columns = ["target", "deep_or_not"])
y = data.target
y[y ==0.5] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["investors_type"] = fund_investors(data[["investors_type"]])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y ==0.5] = 0


In [42]:
pipemodel.fit(X, y)

In [43]:
from sklearn.model_selection import cross_validate

# Cross validate pipeline
cv_result = cross_validate(pipemodel, X, y, cv=10, scoring=["accuracy", "recall", "f1", "precision"])

In [44]:
scoring=["test_accuracy", "test_recall", "test_f1", "test_precision"]

for metric in scoring:
    print(f"{metric} : {cv_result[metric].mean() * 100 : .2f}%")

test_accuracy :  71.08%
test_recall :  56.42%
test_f1 :  62.08%
test_precision :  70.53%


1st run with 3 features only : test_accuracy :  65.12%
test_recall :  32.10%
test_f1 :  43.24%
test_precision :  69.23%


2nd run with "health_industry" : test_accuracy :  69.82%
test_recall :  51.40%
test_f1 :  58.98%
test_precision :  69.88%

3rd run with "fund/investors" : test_accuracy :  69.95%
test_recall :  50.17%
test_f1 :  58.51%
test_precision :  71.17%


4th run with "company_has_phd": test_accuracy :  71.08%
test_recall :  56.42%
test_f1 :  62.08%
test_precision :  70.53%

In [45]:
pd.set_option('display.max_rows', 500)
features = ["growth_stage_age_ratio", "funding_employees_ratio", "nb_patents", "health_industry", "investors_type", "company_has_phd"]

In [46]:
y_pred = pipemodel.predict(X)
xp = preprocessor.fit_transform(X)
y_pred = pd.DataFrame(y_pred, columns = ["y_pred"])
x_prepro = pd.DataFrame(xp, columns = features)

In [47]:
outputs = data[["id", "name", "deep_or_not", "target"]].join(x_prepro).join(y_pred)

In [48]:
FN = outputs[(outputs.deep_or_not == "deeptech") & (outputs.y_pred == 0)]
FP = outputs[(outputs.deep_or_not == "non_deeptech") | (outputs.deep_or_not == "almost_deeptech") & (outputs.y_pred == 1)]


In [None]:
import seaborn as sns
sns.pairplot(FN[features])

In [None]:
sns.pairplot(FP[features])

In [None]:
from pandasgui import show
show(outputs)