In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix

import numpy as np
import pandas as pd
import csv

In [None]:
# Importing and exploring data
# Read every CSV row as a dict of strings (DictReader does no type inference),
# so all columns start out as text and are cleaned explicitly in later cells.
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open('big_startup_secsees_dataset.csv', 'r', newline='') as file:
    startup_data = list(csv.DictReader(file))

# Drop the two URL-like identifier columns; they are not used anywhere below.
startup_df = pd.DataFrame(startup_data).drop(columns=['permalink', 'homepage_url'])

In [None]:
# Quick look at the first five rows of the loaded frame.
startup_df.head(n=5)

In [None]:
# Target: the raw status label of every startup.
y = startup_df['status']

# Features: everything except the target, the company name, and the location
# columns treated as redundant here (e.g. 'city' is technically the same as 'region').
redundant_cols = ['status', 'name', 'region', 'country_code', 'state_code']
X = startup_df.drop(columns=redundant_cols)

In [None]:
# Cleaning data
# Encode the outcome as a float array:
#   1  -> 'acquired' or 'ipo'
#   0  -> 'closed'
#   -1 -> 'operating'
#   NaN for any other status value.
# The encoding is derived from startup_df['status'] rather than from `y` itself,
# so re-running this cell gives the same result instead of turning every
# already-encoded label into NaN.
status = startup_df['status']
y = np.select(
    [
        status.isin(['acquired', 'ipo']).to_numpy(),
        (status == 'closed').to_numpy(),
        (status == 'operating').to_numpy(),
    ],
    [1, 0, -1],
    default=np.nan,
)


In [None]:
# Normalise placeholders and convert columns to numbers where possible.
for col in X.columns.tolist():
    # '-' and the empty string are treated as missing values.
    X[col] = np.where((X[col] == '-') | (X[col] == ''), np.nan, X[col])

    try:
        X[col] = pd.to_numeric(X[col], errors='raise')
    except (ValueError, TypeError):
        # The column holds non-numeric text; leave it as-is on purpose.
        # Only conversion failures are ignored, so unrelated errors
        # (and KeyboardInterrupt) are no longer swallowed.
        pass

X.info(verbose=True)


In [None]:
def _leading_year(value):
    """Return the text before the first '-' for strings; pass any other value through unchanged."""
    return value.split('-')[0] if isinstance(value, str) else value


for col in ['founded_at', 'first_funding_at', 'last_funding_at']:
    years = X[col].apply(_leading_year)

    # One of the values in the three cols is a typo of 2105 instead of 2015
    X[col] = np.where(years == '2105', '2015', years)

X.head()

In [None]:
def _first_category(value):
    """Return the part before the first '|' for strings; pass any other value through unchanged."""
    if isinstance(value, str):
        return value.split('|')[0]
    return value


# Keep only the first entry of each pipe-separated category list.
X['category_list'] = X['category_list'].apply(_first_category)

X.head()

In [None]:
# Frequency tables for three of the feature columns, separated by a blank line.
for position, counted_col in enumerate(['category_list', 'city', 'funding_rounds']):
    if position:
        print()
    print(X[counted_col].value_counts())

In [None]:
# Split the feature columns by dtype: numeric columns vs. object (text) columns.
numerical_cols = list(X.select_dtypes(include=np.number).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [None]:
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Text columns: fill missing values with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
# Route each group of columns through its matching sub-pipeline.
column_transformers = [
    ('num_pipe', num_pipe, numerical_cols),
    ('cat_pipe', cat_pipe, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Two candidate models behind the same preprocessing step.
# Note: both pipelines hold a reference to the same `preprocessor` object.
lr_model = LogisticRegression(max_iter=1000, solver='liblinear')
dt_model = DecisionTreeClassifier(max_depth=7, random_state=42)

pipe_lr = Pipeline(steps=[('preprocessor', preprocessor), ('clf', lr_model)])
pipe_dt = Pipeline(steps=[('preprocessor', preprocessor), ('clf', dt_model)])

In [None]:
# Two choices:
    # (1) Fit the original pipelines directly (training on the full dataset takes about 1-5 seconds)
    # (2) Use GridSearchCV() to search for better parameters, but on a smaller subset (searching on the full dataset takes about 2 hours)

In [None]:
# Choice 1
# Fit both pipelines (preprocessing + classifier) on the training split.
pipe_lr.fit(X_train, y=y_train)
pipe_dt.fit(X_train, y=y_train)

In [None]:
# Predictions of both fitted pipelines on the held-out test split.
y_pred_lr, y_pred_dt = (model.predict(X_test) for model in (pipe_lr, pipe_dt))

In [None]:
# Test-set score of each pipeline: logistic regression first, then decision tree.
for fitted_model in (pipe_lr, pipe_dt):
    print(fitted_model.score(X_test, y_test))

In [None]:
# Confusion matrix for the logistic regression predictions.
# Rows are actual classes, columns are predicted classes, both ordered as
# 1 (success), 0 (unsuccess), -1 (operating).
outcome_names = ('success', 'unsuccess', 'operating')
index = [f'actual_{outcome}' for outcome in outcome_names]
columns = [f'predicted_{outcome}' for outcome in outcome_names]

confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr, labels=[1, 0, -1])

test_confusion_matrix = pd.DataFrame(confusion_matrix_lr, index=index, columns=columns)
test_confusion_matrix

In [None]:
# Confusion matrix for the decision tree predictions.
# Rows are actual classes, columns are predicted classes, both ordered as
# 1 (success), 0 (unsuccess), -1 (operating).
outcome_names = ('success', 'unsuccess', 'operating')
index = [f'actual_{outcome}' for outcome in outcome_names]
columns = [f'predicted_{outcome}' for outcome in outcome_names]

confusion_matrix_dt = confusion_matrix(y_test, y_pred_dt, labels=[1, 0, -1])

test_confusion_matrix = pd.DataFrame(confusion_matrix_dt, index=index, columns=columns)
test_confusion_matrix

In [None]:
# Testing the model
# Five hand-written startups with the same feature columns as the training data.
startup_info = pd.DataFrame({
    'category_list': ['Apps', 'Curated Web', 'Application Platforms', 'Cloud Computing', 'Software'],
    'funding_total_usd': [90000000, 100000, 5000000, 12000000, 70000000],
    'city': ['San Francisco', 'San Francisco', 'Austin', 'Palo Alto', 'Palo Alto'],
    'funding_rounds': [2, 1, 1, 2, 3],
    'founded_at': [2020, 2015, 2019, 2024, 2015],
    'first_funding_at': [2021, 2018, 2020, 2024, 2021],
    'last_funding_at': [2023, 2022, 2024, 2024, 2024]
})

# The columns in `categorical_cols` were one-hot encoded from strings during
# training (the year columns hold text such as '2015'). An integer 2015 never
# equals the category '2015', and with handle_unknown='ignore' the encoder would
# silently drop it, so cast those columns to str before predicting.
startup_info[categorical_cols] = startup_info[categorical_cols].astype(str)

y_pred_lr_result = pipe_lr.predict(startup_info)
y_pred_dt_result = pipe_dt.predict(startup_info)

print(y_pred_lr_result.tolist())
print(y_pred_dt_result.tolist())

In [None]:
# Rank the logistic regression inputs by absolute coefficient size.
# coef_[0] is the first row of the coefficient matrix.
# NOTE(review): with more than two classes that row should belong to
# clf.classes_[0] -- confirm this is the class you want to explain.
coefficients = pipe_lr.named_steps['clf'].coef_[0]

# The classifier sees the preprocessed matrix (scaled numeric columns plus one
# column per one-hot category), not the raw columns of X_train. The names must
# therefore come from the fitted preprocessor; zipping with X_train.columns
# would pair the wrong names and silently truncate the list.
feature_names = pipe_lr.named_steps['preprocessor'].get_feature_names_out().tolist()
assert len(feature_names) == len(coefficients), 'feature names and coefficients are misaligned'

coef_features = list(zip(feature_names, coefficients))

coef_features.sort(key=lambda x: np.abs(x[-1]), reverse=True)

most_important_factors = coef_features[:3]
most_important_factors

In [None]:
# Rank the decision tree inputs by feature importance.
importances = pipe_dt.named_steps['clf'].feature_importances_

# The tree is trained on the preprocessed matrix (scaled numeric columns plus
# one column per one-hot category), so the names must come from the fitted
# preprocessor; zipping with X_train.columns would pair the wrong names and
# silently truncate the list.
feature_names = pipe_dt.named_steps['preprocessor'].get_feature_names_out().tolist()
assert len(feature_names) == len(importances), 'feature names and importances are misaligned'

importances_features = list(zip(feature_names, importances))

importances_features.sort(key=lambda x: np.abs(x[-1]), reverse=True)

most_important_factors = importances_features[:3]
most_important_factors

In [None]:
# Choice 2
# Hyper-parameter grid; the 'clf__' prefix addresses the pipeline step named 'clf'.
param_grid = dict(
    clf__penalty=['l1', 'l2'],
    clf__C=[1, 5, 10, 50],
)

In [None]:
# Logistic regression is tuned over the penalty / C grid in `param_grid`.
gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=param_grid, n_jobs=5, cv=3)

# DecisionTreeClassifier has no `penalty` or `C` parameter, so reusing
# `param_grid` for it makes the search fail. Give the tree its own grid over
# parameters it actually accepts (the values include the original max_depth=7).
param_grid_dt = {
    'clf__max_depth': [3, 5, 7, 10],
    'clf__min_samples_leaf': [1, 5, 10],
}
gs_dt = GridSearchCV(estimator=pipe_dt, param_grid=param_grid_dt, n_jobs=5, cv=3)

In [None]:
# If the entire 40,000+ rows are used, training would take 2+ hrs,
# so the grid searches only see the first `subset_size` training rows.
subset_size = 3000
X_train_subset = X_train.iloc[:subset_size]
y_train_subset = y_train[:subset_size]

gs_lr.fit(X_train_subset, y=y_train_subset)
gs_dt.fit(X_train_subset, y=y_train_subset)

In [None]:
# Predictions of both grid searches on the held-out test split.
y_pred_lr, y_pred_dt = (search.predict(X_test) for search in (gs_lr, gs_dt))

In [None]:
# Test-set accuracy of the grid-searched models: logistic regression, then decision tree.
score_lr, score_dt = (
    accuracy_score(y_test, predicted) for predicted in (y_pred_lr, y_pred_dt)
)

for accuracy in (score_lr, score_dt):
    print(accuracy)

In [None]:
# Testing the model
# Five hand-written startups with the same feature columns as the training data.
startup_info = pd.DataFrame({
    'category_list': ['Apps', 'Curated Web', 'Application Platforms', 'Cloud Computing', 'Software'],
    'funding_total_usd': [90000000, 100000, 5000000, 12000000, 70000000],
    'city': ['San Francisco', 'San Francisco', 'Austin', 'Palo Alto', 'Palo Alto'],
    'funding_rounds': [2, 1, 1, 2, 3],
    'founded_at': [2020, 2015, 2019, 2024, 2015],
    'first_funding_at': [2021, 2018, 2020, 2024, 2021],
    'last_funding_at': [2023, 2022, 2024, 2024, 2024]
})

# The columns in `categorical_cols` were one-hot encoded from strings during
# training (the year columns hold text such as '2015'). An integer 2015 never
# equals the category '2015', and with handle_unknown='ignore' the encoder would
# silently drop it, so cast those columns to str before predicting.
startup_info[categorical_cols] = startup_info[categorical_cols].astype(str)

y_pred_lr_result = gs_lr.predict(startup_info)
y_pred_dt_result = gs_dt.predict(startup_info)

print(y_pred_lr_result.tolist())
print(y_pred_dt_result.tolist())