# Startup-Acquisition-Status-Prediction with pipeline

### Data preprocessing

In [2]:
#import necessary libraries  
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
warnings.filterwarnings('ignore')

In [4]:
# Load the raw companies table.
# A raw string keeps every backslash literal: the original mixed escaped "\\" separators
# with a bare "\c", which is an invalid (deprecated) escape sequence. The path value is unchanged.
company = pd.read_csv(r"D:\technocalobs\2nd project\Data_companies\companies.csv")

In [5]:
company.shape

(196553, 44)

In [6]:
# to display all columns
pd.set_option('display.max_columns', None)

In [7]:
# Drop irrelevant and redundant columns in a single call.
columns_to_drop = [
    'region', 'city', 'state_code',
    'id', 'Unnamed: 0.1', 'entity_type', 'entity_id', 'parent_id',
    'created_by', 'created_at', 'updated_at',
    'domain', 'homepage_url', 'twitter_username', 'logo_url', 'logo_width',
    'logo_height', 'short_description', 'description', 'overview', 'tag_list',
    'name', 'normalized_name', 'permalink', 'invested_companies',
]
company.drop(columns=columns_to_drop, inplace=True)

In [8]:
# check for duplicate values
company.duplicated().any()

True

In [9]:
# check number of duplicate values
company.duplicated().sum()

87089

In [8]:
# Let's delete all the duplicate values
company.drop_duplicates(inplace=True)

In [10]:
# check if any left
company.duplicated().any()

True

In [11]:
# These columns were observed to be more than 96% null, so imputing them would not
# make sense; drop them instead.
sparse_columns = ['first_investment_at', 'last_investment_at', 'investment_rounds', 'ROI']
company.drop(columns=sparse_columns, inplace=True)

In [12]:
# Let's check the number of missing values in each column
company.isna().sum()

category_code          73367
status                     0
founded_at            105326
closed_at             193933
country_code          108563
first_funding_at      165046
last_funding_at       165046
funding_rounds        164846
funding_total_usd     168679
first_milestone_at    104854
last_milestone_at     104854
milestones            104854
relationships          66886
lat                   112701
lng                   112701
dtype: int64

In [13]:
company.shape

(196553, 15)

In [14]:
#Delete instances with missing values for 'status', 'country_code', 'category_code' and 'founded_at'.
company.dropna(subset=['status', 'country_code', 'category_code','founded_at'],inplace=True)

In [15]:
company.shape

(64099, 15)

#### Handling outliers by IQR method

In [21]:
# Interquartile range (IQR = Q3 - Q1) of funding_total_usd
funding_total = company['funding_total_usd']
Q1_FTU = funding_total.quantile(0.25)
Q3_FTU = funding_total.quantile(0.75)
IQR_FTU = Q3_FTU - Q1_FTU
print(f"IQR of funding_total_usd is: {IQR_FTU}\n")

# Interquartile range of funding_rounds
funding_round_counts = company['funding_rounds']
Q1_FR = funding_round_counts.quantile(0.25)
Q3_FR = funding_round_counts.quantile(0.75)
IQR_FR = Q3_FR - Q1_FR
print(f"IQR of funding_rounds is: {IQR_FR}")

IQR of funding_total_usd is: 11489705.0

IQR of funding_rounds is: 1.0


In [22]:
# Outlier fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as an outlier.

# For funding_total_usd
lower_limit_FTU = Q1_FTU - 1.5*IQR_FTU
upper_limit_FTU = Q3_FTU + 1.5*IQR_FTU
print("funding_total_usd")
print(f"lower limit is: {lower_limit_FTU}")
print(f"upper limit is: {upper_limit_FTU}\n")


# For funding_rounds
lower_limit_FR = Q1_FR - 1.5*IQR_FR
upper_limit_FR = Q3_FR + 1.5*IQR_FR
# Label fixed: these limits belong to funding_rounds, not funding_total_usd.
print("funding_rounds")
print(f"lower limit is: {lower_limit_FR}")
print(f"upper limit is: {upper_limit_FR}")

funding_total_usd
lower limit is: -16724262.5
upper limit is: 29234557.5

funding_total_usd
lower limit is: -0.5
upper limit is: 3.5


In [24]:
# Rows whose funding_total_usd falls outside the IQR fences
ftu_below = company.funding_total_usd < lower_limit_FTU
ftu_above = company.funding_total_usd > upper_limit_FTU
funding_total_usd_outliers = company[ftu_below | ftu_above]
print(funding_total_usd_outliers.shape)

# Rows whose funding_rounds falls outside the IQR fences
fr_below = company.funding_rounds < lower_limit_FR
fr_above = company.funding_rounds > upper_limit_FR
funding_rounds_outliers = company[fr_below | fr_above]
print(funding_rounds_outliers.shape)

(0, 15)
(0, 15)


In [25]:
# Remove the funding_total_usd outliers first ...
ftu_outlier_mask = (
    (company.funding_total_usd < lower_limit_FTU)
    | (company.funding_total_usd > upper_limit_FTU)
)
company.drop(company[ftu_outlier_mask].index, inplace=True)

# ... then the funding_rounds outliers; this mask is built on the already-reduced frame.
fr_outlier_mask = (
    (company.funding_rounds < lower_limit_FR)
    | (company.funding_rounds > upper_limit_FR)
)
company.drop(company[fr_outlier_mask].index, inplace=True)
company.shape

(60501, 15)

In [26]:
# Reduce every date column to its calendar year (dates are parsed as YYYY-MM-DD).
date_columns = [
    'founded_at',
    'closed_at',
    'first_funding_at',
    'last_funding_at',
    'first_milestone_at',
    'last_milestone_at',
]
for date_col in date_columns:
    parsed_dates = pd.to_datetime(company[date_col], format='%Y-%m-%d')
    company[date_col] = parsed_dates.dt.year

#### Generalize the categorical data, i.e. category_code and country_code

In [27]:
# Category encoding: keep the 15 most frequent category_code values and
# replace every less frequent one with 'other'.
category_counts = company['category_code'].value_counts()
category_others_index = category_counts.iloc[15:].index
company['category_code'] = company['category_code'].replace(
    to_replace=category_others_index,
    value='other',
)

In [28]:
# Let's check that no more than 15 distinct categories remain
print('Unique no of category_code : ',company['category_code'].nunique())
company.category_code.value_counts()

Unique no of category_code :  15


category_code
other               12910
software            11010
web                  7870
ecommerce            4897
mobile               3715
advertising          3597
consulting           2818
games_video          2807
enterprise           2474
biotech              2008
public_relations     1565
hardware             1438
network_hosting      1257
education            1156
search                979
Name: count, dtype: int64

In [29]:
# Keep the 9 most frequent country codes as they are and replace every other
# code with 'other' (10 distinct values in total).
country_counts = company['country_code'].value_counts()
country_code_others_index = country_counts.iloc[9:].index
company['country_code'] = company['country_code'].replace(
    to_replace=country_code_others_index,
    value='other',
)

In [30]:
# Let's check that no more than 10 distinct country codes remain
print('Unique no of country_code : ',company['country_code'].nunique())
company.country_code.value_counts()

Unique no of country_code :  10


country_code
USA      34122
other    10363
GBR       5063
IND       3038
CAN       2632
DEU       1369
FRA       1233
AUS       1033
ESP        882
ISR        766
Name: count, dtype: int64

### 2. Create new variables
    a. Create new feature isClosed from closed_at and status.
    b. Create new feature 'active_days'

#### 2.a. Create new feature isClosed from closed_at and status.
     - If the value in status is 'operating' or 'ipo', let's put 0.
     - Whereas if the value is 'acquired' or 'closed', let's put 1.

In [31]:
def isClosed(row):
    """Return the binary target for a single company record.

    Reads ``row['status']``: 'operating' and 'ipo' map to 0,
    every other value maps to 1.
    """
    still_running = row['status'] in ('operating', 'ipo')
    return 0 if still_running else 1
# Vectorized equivalent of applying isClosed to every row:
# 0 where status is 'operating' or 'ipo', 1 for anything else.
# A row-wise DataFrame.apply (wrapped in a redundant lambda) runs one Python call per
# record; isin computes the same int64 column in a single pass.
company['isClosed'] = (~company['status'].isin(['operating', 'ipo'])).astype('int64')
company

Unnamed: 0,category_code,status,founded_at,closed_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed
5,advertising,operating,2007,,other,,,,,,,,2.0,30.427755,-9.598107,0
6,other,operating,2008,,IND,,,,,,,,,22.307159,73.181219,0
12,advertising,operating,2008,,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0
13,web,acquired,2007,,USA,2008.0,2008.0,1.0,5000000.0,2008.0,2012.0,3.0,14.0,37.386052,-122.083851,1
15,games_video,operating,2008,,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,2007,,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0
196549,public_relations,operating,2007,,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0
196550,consulting,operating,1959,,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0
196551,search,operating,2008,,USA,,,,,,,,1.0,34.052234,-118.243685,0


In [32]:
# Companies that are still running ('operating' or 'ipo') get 2021 as their closing
# year; every other status keeps its recorded closed_at value.
# NOTE(review): 2021 is a hard-coded reference year — consider a named constant.
still_active = company['status'].isin(['operating', 'ipo'])
company['closed_at'] = np.where(still_active, 2021, company['closed_at'])

In [33]:
company.dropna(axis=0, subset=['closed_at'], inplace=True)

In [34]:
closed_at=company['closed_at']
founded_at=company['founded_at']

In [35]:
# Lifetime in days, approximated as 365 * (closing year - founding year).
# NOTE(review): closed_at was set to 2021 for every operating/ipo company, so this
# feature is probably close to a proxy for the isClosed target (likely target
# leakage) — confirm before trusting the model scores.
year_span = closed_at.astype('float64') - founded_at.astype('float64')
active_days = 365 * year_span
company['Active_Days'] = active_days

# Discard the 68 rows with the smallest Active_Days.
# NOTE(review): 68 is a hard-coded count — presumably the rows with a negative
# lifetime; verify, or filter on the value itself instead.
index_name = company['Active_Days'].sort_values().head(68).index
company.drop(index_name, inplace=True)
company['Active_Days']

5          5110.0
6          4745.0
12         4745.0
15         4745.0
20         6570.0
           ...   
196548     5110.0
196549     5110.0
196550    22630.0
196551     4745.0
196552     5110.0
Name: Active_Days, Length: 56968, dtype: float64

In [36]:
company.drop(['closed_at'], axis=1,inplace=True)

In [37]:
company

Unnamed: 0,category_code,status,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed,Active_Days
5,advertising,operating,2007,other,,,,,,,,2.0,30.427755,-9.598107,0,5110.0
6,other,operating,2008,IND,,,,,,,,,22.307159,73.181219,0,4745.0
12,advertising,operating,2008,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0,4745.0
15,games_video,operating,2008,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0,4745.0
20,other,operating,2003,USA,2011.0,2012.0,3.0,10125293.0,2010.0,2010.0,1.0,6.0,30.267153,-97.743061,0,6570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,2007,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0,5110.0
196549,public_relations,operating,2007,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0,5110.0
196550,consulting,operating,1959,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0,22630.0
196551,search,operating,2008,USA,,,,,,,,1.0,34.052234,-118.243685,0,4745.0


In [38]:
company.isna().sum()

category_code             0
status                    0
founded_at                0
country_code              0
first_funding_at      39348
last_funding_at       39348
funding_rounds        39214
funding_total_usd     41444
first_milestone_at    26711
last_milestone_at     26711
milestones            26711
relationships         14818
lat                    2276
lng                    2276
isClosed                  0
Active_Days               0
dtype: int64

In [39]:
company.drop(['status'], axis=1,inplace=True)

In [40]:
company.dtypes

category_code          object
founded_at              int32
country_code           object
first_funding_at      float64
last_funding_at       float64
funding_rounds        float64
funding_total_usd     float64
first_milestone_at    float64
last_milestone_at     float64
milestones            float64
relationships         float64
lat                   float64
lng                   float64
isClosed                int64
Active_Days           float64
dtype: object

In [41]:
X = company.drop("isClosed", axis=1)
y = company["isClosed"]

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Using cached xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Using cached xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3


### Creating pipeline

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
import xgboost as xgb

In [46]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [47]:
company

Unnamed: 0,category_code,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed,Active_Days
5,advertising,2007,other,,,,,,,,2.0,30.427755,-9.598107,0,5110.0
6,other,2008,IND,,,,,,,,,22.307159,73.181219,0,4745.0
12,advertising,2008,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0,4745.0
15,games_video,2008,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0,4745.0
20,other,2003,USA,2011.0,2012.0,3.0,10125293.0,2010.0,2010.0,1.0,6.0,30.267153,-97.743061,0,6570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,2007,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0,5110.0
196549,public_relations,2007,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0,5110.0
196550,consulting,1959,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0,22630.0
196551,search,2008,USA,,,,,,,,1.0,34.052234,-118.243685,0,4745.0


In [48]:
company.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56968 entries, 5 to 196552
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   category_code       56968 non-null  object 
 1   founded_at          56968 non-null  int32  
 2   country_code        56968 non-null  object 
 3   first_funding_at    17620 non-null  float64
 4   last_funding_at     17620 non-null  float64
 5   funding_rounds      17754 non-null  float64
 6   funding_total_usd   15524 non-null  float64
 7   first_milestone_at  30257 non-null  float64
 8   last_milestone_at   30257 non-null  float64
 9   milestones          30257 non-null  float64
 10  relationships       42150 non-null  float64
 11  lat                 54692 non-null  float64
 12  lng                 54692 non-null  float64
 13  isClosed            56968 non-null  int64  
 14  Active_Days         56968 non-null  float64
dtypes: float64(11), int32(1), int64(1), object(2)
memory usag

In [49]:
# Step 1: mean-impute the numeric columns at positions 3-12 of X
# (first_funding_at ... lng). The remaining columns pass through unchanged and are
# appended after the imputed ones.
numeric_positions = slice(3, 13)
trf1 = ColumnTransformer(
    transformers=[
        ('impute_num', SimpleImputer(strategy='mean'), numeric_positions),
    ],
    remainder='passthrough',
)

In [50]:
first_step = trf1.fit_transform(X_train)
first_step

array([[2010.5018481660506, 2011.0453511515495, 1.4159198362853715, ...,
        2007, 'FRA', 5110.0],
       [2010.5018481660506, 2011.0453511515495, 1.4159198362853715, ...,
        2007, 'USA', 5110.0],
       [2010.5018481660506, 2011.0453511515495, 1.4159198362853715, ...,
        2011, 'USA', 3650.0],
       ...,
       [2010.5018481660506, 2011.0453511515495, 1.4159198362853715, ...,
        2007, 'USA', 5110.0],
       [2010.5018481660506, 2011.0453511515495, 1.4159198362853715, ...,
        2010, 'USA', 4015.0],
       [2010.0, 2010.0, 1.0, ..., 2006, 'USA', 2190.0]], dtype=object)

In [51]:
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2010.501848,2011.045351,1.41592,4252913.935554,2009.616698,2010.118266,1.321096,1.0,46.323716,-0.464777,web,2007,FRA,5110.0
1,2010.501848,2011.045351,1.41592,4252913.935554,2008.0,2008.0,1.0,3.702353,32.715738,-117.161084,mobile,2007,USA,5110.0
2,2010.501848,2011.045351,1.41592,4252913.935554,2011.0,2011.0,1.0,4.0,40.712775,-74.005973,web,2011,USA,3650.0
3,2012.0,2012.0,1.0,1000000.0,2012.0,2013.0,2.0,18.0,26.215907,-98.325293,other,2011,USA,3650.0
4,2010.501848,2011.045351,1.41592,4252913.935554,2009.616698,2010.118266,1.321096,3.702353,41.424473,-91.043205,other,1945,USA,27740.0


In [52]:
# checking for missing values
pd.DataFrame(first_step).isna().sum().any()

False

In [53]:
pd.DataFrame(first_step).shape

(45574, 14)

In [55]:
# Step 2: one-hot encode the two categorical columns of trf1's output
# (position 10 holds category_code, position 12 holds country_code).
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_encoders = [
    ('ohe_category', OneHotEncoder(handle_unknown='ignore'), [10]),
    ('ohe_country', OneHotEncoder(handle_unknown='ignore'), [12]),
]
trf2 = ColumnTransformer(transformers=categorical_encoders, remainder='passthrough')



In [56]:
second_step = trf2.fit_transform(first_step)
pd.DataFrame(second_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2010.501848,2011.045351,1.41592,4252913.935554,2009.616698,2010.118266,1.321096,1.0,46.323716,-0.464777,2007,5110.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.501848,2011.045351,1.41592,4252913.935554,2008.0,2008.0,1.0,3.702353,32.715738,-117.161084,2007,5110.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.501848,2011.045351,1.41592,4252913.935554,2011.0,2011.0,1.0,4.0,40.712775,-74.005973,2011,3650.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2012.0,2012.0,1.0,1000000.0,2012.0,2013.0,2.0,18.0,26.215907,-98.325293,2011,3650.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.501848,2011.045351,1.41592,4252913.935554,2009.616698,2010.118266,1.321096,3.702353,41.424473,-91.043205,1945,27740.0


In [57]:
pd.DataFrame(second_step).shape

(45574, 37)

In [58]:
# imbalanced dataset
# trf3 = ColumnTransformer([
#     ('oversampling',RandomOverSampler(sampling_strategy = 'minority'),slice(0,37))
# ],remainder='passthrough')


In [59]:
# third_step = trf3.fit_transform(second_step)
# pd.DataFrame(third_step).head()

In [60]:
# pd.DataFrame(third_step).shape

In [61]:
# Scaling: standardize the 37 columns produced by trf2 (positions 0-36).
# NOTE(review): the width 37 is hard-coded and depends on how many one-hot columns
# trf2 emits; with no remainder given, any column beyond position 36 would presumably
# be dropped — confirm if the category set ever changes.
all_feature_positions = slice(0, 37)
trf4 = ColumnTransformer(
    transformers=[('scale', StandardScaler(), all_feature_positions)],
)

In [63]:
fourth_step = trf4.fit_transform(second_step)
pd.DataFrame(fourth_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,-0.252083,-0.182181,-0.226342,-0.299174,-0.142979,-0.205058,-0.219179,-0.156314,-0.256095,-0.144569,-0.524993,-0.162679,-0.129965,-0.464361,2.594915,-0.133499,-0.212667,-0.152817,-0.123256,6.857345,-0.305356,-0.235046,-0.113438,-1.113135,-0.463557,2.379133e-10,-1.053729e-10,1.544941e-12,1.011694e-12,-2.416288e-10,-2.566324e-10,-1.131645e-12,-0.247817,0.577405,0.670427,0.117708,-0.090082
1,-0.252083,-0.182181,-0.226342,-0.299174,-0.142979,-0.205058,-0.219179,-0.156314,3.904794,-0.144569,-0.524993,-0.162679,-0.129965,-0.464361,-0.385369,-0.133499,-0.212667,-0.152817,-0.123256,-0.145829,-0.305356,-0.235046,-0.113438,0.898364,-0.463557,2.379133e-10,-1.053729e-10,1.544941e-12,1.011694e-12,-0.5402695,-0.7309224,-0.7087295,2.581954e-14,-0.275311,-0.995445,0.117708,-0.090082
2,-0.252083,-0.182181,-0.226342,-0.299174,-0.142979,-0.205058,-0.219179,-0.156314,-0.256095,-0.144569,-0.524993,-0.162679,-0.129965,-0.464361,2.594915,-0.133499,-0.212667,-0.152817,-0.123256,-0.145829,-0.305356,-0.235046,-0.113438,0.898364,-0.463557,2.379133e-10,-1.053729e-10,1.544941e-12,1.011694e-12,0.4622728,0.3042485,-0.7087295,0.0272955,0.225807,-0.379394,0.536166,-0.502169
3,-0.252083,-0.182181,-0.226342,-0.299174,-0.142979,-0.205058,-0.219179,-0.156314,-0.256095,-0.144569,1.904787,-0.162679,-0.129965,-0.464361,-0.385369,-0.133499,-0.212667,-0.152817,-0.123256,-0.145829,-0.305356,-0.235046,-0.113438,0.898364,-0.463557,1.085593,0.7349132,-1.143376,-1.026026,0.7964536,0.9943625,1.498493,1.311154,-0.68261,-0.726559,0.536166,-0.502169
4,-0.252083,-0.182181,-0.226342,-0.299174,-0.142979,-0.205058,-0.219179,-0.156314,-0.256095,-0.144569,1.904787,-0.162679,-0.129965,-0.464361,-0.385369,-0.133499,-0.212667,-0.152817,-0.123256,-0.145829,-0.305356,-0.235046,-0.113438,0.898364,-0.463557,2.379133e-10,-1.053729e-10,1.544941e-12,1.011694e-12,-2.416288e-10,-2.566324e-10,-1.131645e-12,2.581954e-14,0.270404,-0.622605,-6.368389,6.297266


In [64]:
trf5 = xgb.XGBClassifier()

In [65]:
# Chain the preprocessing steps and the classifier so they are fitted and applied together:
# imputation -> one-hot encoding -> scaling -> XGBoost classifier.
pipeline_steps = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf4', trf4),
    ('trf5', trf5),
]
pipe = Pipeline(steps=pipeline_steps)

In [66]:
pipe.fit(X_train, y_train)

In [67]:
# Predict
y_pred = pipe.predict(X_test)

In [68]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9997367035281727

In [69]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[11109     0]
 [    3   282]]
0.9997367035281727
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11109
           1       1.00      0.99      0.99       285

    accuracy                           1.00     11394
   macro avg       1.00      0.99      1.00     11394
weighted avg       1.00      1.00      1.00     11394



In [70]:
# Saving model to disk
import pickle

# The context manager closes the file even if pickling fails; the original passed an
# anonymous open() handle to pickle.dump and never closed it.
with open('model_pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [71]:
# Show the first 50 training rows (restricted to the first 145455 rows by position).
X_train.iloc[:145455].head(50)

Unnamed: 0,category_code,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,Active_Days
132104,web,2007,FRA,,,,,,,,1.0,46.323716,-0.464777,5110.0
156718,mobile,2007,USA,,,,,2008.0,2008.0,1.0,,32.715738,-117.161084,5110.0
7208,web,2011,USA,,,,,2011.0,2011.0,1.0,4.0,40.712775,-74.005973,3650.0
191184,other,2011,USA,2012.0,2012.0,1.0,1000000.0,2012.0,2013.0,2.0,18.0,26.215907,-98.325293,3650.0
125596,other,1945,USA,,,,,,,,,41.424473,-91.043205,27740.0
32621,web,2011,USA,,,,,2012.0,2012.0,1.0,5.0,33.684567,-117.826505,3650.0
82263,biotech,1998,USA,2013.0,2013.0,1.0,3570000.0,,,,9.0,39.680037,-105.897947,8395.0
173254,ecommerce,2010,USA,,,,,2010.0,2010.0,1.0,2.0,43.16103,-77.610922,4015.0
125073,other,2013,USA,,,,,2012.0,2012.0,1.0,4.0,33.49417,-111.926052,2920.0
117424,other,2011,USA,,,,,2011.0,2011.0,1.0,1.0,40.718155,-73.354287,3650.0


In [72]:
# Target value of the training row whose index label is 145455 (label-based lookup).
y_train.loc[145455]

1