In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE

import numpy as np
import pandas as pd
import csv
from scipy.stats import uniform

In [2]:
# Importing and exploring data
# Every field is read as a string; type conversion happens in later cells.
# newline='' hands newline handling to the csv module (as its documentation
# recommends), so quoted fields containing line breaks are parsed as one value.
with open('big_startup_secsees_dataset.csv', 'r', newline='') as file:
    startup_data = list(csv.DictReader(file))

# permalink and homepage_url are dropped before any further processing.
startup_df = pd.DataFrame(startup_data).drop(columns=['permalink', 'homepage_url'])

In [3]:
# Preview the first five rows of the raw frame.
startup_df.head(n=5)

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,#fame,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,:Qounter,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,"(THE) ONE of THEM,Inc.",Apps|Games|Mobile,3406878,operating,,,,,1,,2014-01-30,2014-01-30
3,0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19
4,004 Technologies,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24


In [4]:
# Target: the raw status string of each startup.
y = startup_df['status']

# Features: everything except the target, the company name (not treated as a
# factor in startup success) and the region / country / state location fields.
X = startup_df.drop(
    columns=[
        'status',
        'name',
        'region',
        'country_code',
        'state_code',
    ]
)

In [5]:
# Cleaning data
# Encode the outcome as a numeric label:
#    1 -> 'acquired' or 'ipo'
#    0 -> 'closed'
#   -1 -> 'operating'
#  NaN -> any other status value
# The labels are derived from the raw `status` column rather than from `y`
# itself, so re-running this cell yields the same array instead of comparing
# already-encoded numbers against strings and turning every label into NaN.
raw_status = startup_df['status']
y = np.select(
    [
        raw_status.isin(['acquired', 'ipo']),
        raw_status == 'closed',
        raw_status == 'operating',
    ],
    [1, 0, -1],
    default=np.nan,
)

In [6]:
# Normalise missing-value markers, then convert numeric-looking columns.
for col in list(X.columns):
    # '-' and the empty string are treated as missing values.
    X[col] = X[col].mask(X[col].isin(['-', '']), np.nan)

    # Columns holding anything non-numeric are deliberately left as text.
    # Only the conversion failure is caught, so unrelated problems
    # (e.g. an interrupt) are no longer silently swallowed.
    try:
        X[col] = pd.to_numeric(X[col], errors='raise')
    except (ValueError, TypeError):
        pass

X.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category_list      63220 non-null  object 
 1   funding_total_usd  53583 non-null  float64
 2   city               58340 non-null  object 
 3   funding_rounds     66368 non-null  int64  
 4   founded_at         51147 non-null  object 
 5   first_funding_at   66344 non-null  object 
 6   last_funding_at    66368 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 3.5+ MB


In [7]:
def _leading_year(value):
    """Return the text before the first '-' of a string; pass other values through."""
    if not isinstance(value, str):
        return value
    return value.split('-')[0]


# Reduce each date column to its year component (still stored as text).
for col in ('founded_at', 'first_funding_at', 'last_funding_at'):
    years = X[col].apply(_leading_year)

    # The year '2105' is rewritten to '2015'.
    X[col] = np.where(years == '2105', '2015', years)

X.head()

Unnamed: 0,category_list,funding_total_usd,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,Media,10000000.0,Mumbai,1,,2015,2015
1,Application Platforms|Real Time|Social Network...,700000.0,Delaware City,2,2014.0,2014,2014
2,Apps|Games|Mobile,3406878.0,,1,,2014,2014
3,Curated Web,2000000.0,Beijing,1,2007.0,2008,2008
4,Software,,Champaign,1,2010.0,2014,2014


In [8]:
# Keep only the first entry of each '|'-separated category list; values that
# are not strings are passed through unchanged.
X['category_list'] = X['category_list'].apply(
    lambda cats: cats.split('|')[0] if isinstance(cats, str) else cats
)

X.head()

Unnamed: 0,category_list,funding_total_usd,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,Media,10000000.0,Mumbai,1,,2015,2015
1,Application Platforms,700000.0,Delaware City,2,2014.0,2014,2014
2,Apps,3406878.0,,1,,2014,2014
3,Curated Web,2000000.0,Beijing,1,2007.0,2008,2008
4,Software,,Champaign,1,2010.0,2014,2014


In [9]:
# Split the feature names by dtype so each group can be preprocessed separately.
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = list(X.select_dtypes(include='object').columns)

In [10]:
# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): y as encoded above still contains the -1 ('operating') label,
# so it reaches the training set as its own class — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Numeric features: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [12]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, numerical_cols),
    ('cat_pipe', cat_pipe, categorical_cols),
])

In [13]:
# Full model: preprocessing followed by a liblinear logistic regression.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(solver='liblinear', max_iter=1000)),
])

In [14]:
# Fit the whole pipeline on the training split; the fitted pipeline is the cell output.
pipe.fit(X=X_train, y=y_train)

In [64]:
# Hyper-parameter candidates for the pipeline's 'clf' step
# (currently a single combination: L1 penalty with C=10).
param_distributions = dict(
    clf__penalty=['l1'],
    clf__C=[10],
)

In [65]:
# Grid search over the pipeline using the candidates defined above.
gs = GridSearchCV(
    pipe,
    param_distributions,
)