In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

from sklearn.cluster import KMeans

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import xgboost as xgb
import ipywidgets as widgets
from IPython.display import display, HTML


In [2]:
# Load the three CSV files; each file holds the startups with one outcome.
active_startups = pd.read_csv('active-20mm-startups-hult-project.csv')
closed_startups = pd.read_csv('closed-startups-hult-project.csv')
exited_startups = pd.read_csv('exited-20mm-sartups-hult-project.csv')

# Tag every row with the outcome implied by the file it came from.
active_startups['Status'] = 'Active'
closed_startups['Status'] = 'Closed'
exited_startups['Status'] = 'Exited'

# Stack the three frames into one.
# ignore_index=True gives merged_df a fresh 0..n-1 index. Without it each file
# keeps its own row numbers, so the same label appears up to three times and
# any later label-based alignment or .loc lookup becomes ambiguous.
merged_df = pd.concat(
    [active_startups, closed_startups, exited_startups],
    ignore_index=True,
)

# Binary target: 1 for 'Active' and 'Exited', 0 for 'Closed'.
# Vectorised isin() replaces the row-by-row lambda and yields the same 0/1 ints.
merged_df['Success_or_Unsuccess'] = (
    merged_df['Status'].isin(['Active', 'Exited']).astype(int)
)

# Parse 'CB Rank (Company)' exactly once.
# While the column is still text (e.g. "12,345") the thousands separators are
# stripped before casting. If the column is already numeric (this cell was run
# before), the string step is skipped because .str only works on text columns.
cb_rank_numeric = merged_df['CB Rank (Company)']
if not pd.api.types.is_numeric_dtype(cb_rank_numeric):
    cb_rank_numeric = cb_rank_numeric.str.replace(',', '', regex=False)
cb_rank_numeric = cb_rank_numeric.astype(float)

# Placeholder for missing ranks: one above the largest observed rank.
# A value greater than the maximum cannot already be present in the column,
# so no search over candidate values is needed.
fill_value = int(cb_rank_numeric.max()) + 1

# Fill NaN values with the placeholder and store the column as integers.
merged_df['CB Rank (Company)'] = cb_rank_numeric.fillna(fill_value).astype(int)


# Extract the calendar year from the two date columns.
# errors='coerce' turns unparseable or missing dates into NaT, hence NaN years.
merged_df['Founded Year'] = pd.to_datetime(merged_df['Founded Date'], errors='coerce').dt.year
merged_df['Last Funding Year'] = pd.to_datetime(merged_df['Last Funding Date'], errors='coerce').dt.year

# Split 'Headquarters Location' on ', ' into positional parts (one column per part).
location_split = merged_df['Headquarters Location'].str.split(', ', expand=True)
n_location_parts = location_split.shape[1]

# Map the positional parts to City / State / Country.
# The choice is made once for the whole frame, based on how many parts the
# longest location has. The final `else` covers a split with a single column,
# where indexing column 1 would raise KeyError.
# NOTE(review): rows with fewer parts than the longest row are not re-aligned
# (a two-part "City, Country" row keeps its second part in 'State' when other
# rows have three parts) - confirm the data never mixes formats.
merged_df['City'] = location_split[0]
merged_df['State'] = location_split[1] if n_location_parts > 1 else None
if n_location_parts > 2:
    merged_df['Country'] = location_split[2]
elif n_location_parts > 1:
    merged_df['Country'] = location_split[1]
else:
    merged_df['Country'] = None

# Fill missing founder / employee information with neutral placeholders.
merged_df['Number of Founders'] = merged_df['Number of Founders'].fillna(0)
merged_df['Founders'] = merged_df['Founders'].fillna('Unknown')
merged_df['Number of Employees'] = merged_df['Number of Employees'].fillna('Unknown')

# Missing founding years take the median year, then the column becomes int32.
founded_year_median = merged_df['Founded Year'].median()
merged_df['Founded Year'] = merged_df['Founded Year'].fillna(founded_year_median).astype('int32')



# Fill missing values in 'Funding Status' with the corresponding value from
# 'Last Funding Type'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on merged_df['Funding Status']. That chained
# in-place form operates on an intermediate Series, raises a FutureWarning in
# recent pandas, and leaves merged_df unchanged under Copy-on-Write.
merged_df['Funding Status'] = merged_df['Funding Status'].fillna(merged_df['Last Funding Type'])

# Fill missing values in the USD amount columns with 0.
funding_amount_columns = [
    'Last Equity Funding Amount Currency (in USD)',
    'Total Equity Funding Amount Currency (in USD)',
    'Total Funding Amount Currency (in USD)',
    'Last Funding Amount Currency (in USD)',
]
for amount_column in funding_amount_columns:
    merged_df[amount_column] = merged_df[amount_column].fillna(0)


# Columns that feed the tree-based model; the last entry is the target.
selected_columns_tree = [
    'CB Rank (Company)', 'Founded Year', 'Number of Employees', 'State',
    'Number of Founders', 'Number of Funding Rounds', 'Last Funding Year',
    'Last Funding Type', 'Success_or_Unsuccess',
]
model_tree_df = merged_df.loc[:, selected_columns_tree]

# Categorical (object) columns to be expanded into indicator columns.
object_cols = ['Number of Employees', 'State', 'Last Funding Type']

# One-hot encode the categorical columns. get_dummies keeps the untouched
# columns first and appends the indicator columns after them.
model_tree_df_encoded = pd.get_dummies(model_tree_df, columns=object_cols)

# Separate the target from the feature matrix.
y = model_tree_df_encoded['Success_or_Unsuccess']
X = model_tree_df_encoded.drop(columns=['Success_or_Unsuccess'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Search space: 7 hyper-parameters with 3 values each (3**7 = 2187 candidates).
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Exhaustive 3-fold cross-validated search scored on accuracy, using all cores.
base_xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
grid_search = GridSearchCV(
    base_xgb_clf,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best parameters:", best_params)
print("Best cross-validation score:", best_score)

# Fit a fresh classifier on the full training split with the winning parameters.
xgb_clf_best = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    **best_params,
)
xgb_clf_best.fit(X_train, y_train)

# Compare accuracy on the training rows and on the held-out rows.
y_train_pred_best = xgb_clf_best.predict(X_train)
accuracy_train_best = accuracy_score(y_train, y_train_pred_best)

y_test_pred_best = xgb_clf_best.predict(X_test)
accuracy_test_best = accuracy_score(y_test, y_test_pred_best)

print(f"Improved XGB Accuracy Train: {accuracy_train_best:.5f}")
print(f"Improved XGB Accuracy Test: {accuracy_test_best:.5f}")



Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.6}
Best cross-validation score: 0.9626047711154094
Improved XGB Accuracy Train: 0.98130
Improved XGB Accuracy Test: 0.95361


In [12]:
# Inject a CSS rule that widens the label (description) part of every widget.
label_width_css = """
<style>
    .widget-label { width: 25ex !important; }
</style>
"""
display(HTML(label_width_css))

# Integer inputs: one IntText per numeric model feature, created in this order.
(
    CB_Rank_widget,
    Founded_Year_widget,
    Number_of_Founders_widget,
    Number_of_Funding_Rounds_widget,
    Last_Funding_Year_widget,
) = (
    widgets.IntText(description=label, style={'description_width': 'initial'})
    for label in (
        'CB Rank (Company):',
        'Founded Year:',
        'Number of Founders:',
        'Number of Funding Rounds:',
        'Last Funding Year:',
    )
)

# Choices offered for the company-size dropdown.
employee_size_options = [
    '1-10', '11-50', '51-100', '101-250', '251-500', '501-1000',
    '1001-5000', '5001-10000', '10001+',
    'Unknown',
]
Number_of_Employees_widget = widgets.Dropdown(
    options=employee_size_options,
    description='Number of Employees:',
    style={'description_width': 'initial'},
)

# Choices offered for the state dropdown.
state_options = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina',
    'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
    'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'Wisconsin', 'Wyoming',
]
State_widget = widgets.Dropdown(
    options=state_options,
    description='State:',
    style={'description_width': 'initial'},
)

# Choices offered for the last-funding-type dropdown.
funding_type_options = [
    'Angel', 'Convertible Note', 'Corporate Round', 'Debt Financing',
    'Equity Crowdfunding', 'Grant', 'Non-equity Assistance', 'Post-IPO Debt',
    'Post-IPO Equity', 'Post-IPO Secondary', 'Pre-Seed', 'Private Equity',
    'Product Crowdfunding', 'Secondary Market', 'Seed', 'Series A', 'Series B',
    'Series C', 'Series D', 'Series E', 'Series F', 'Undisclosed',
    'Venture - Series Unknown',
]
Last_Funding_Type_widget = widgets.Dropdown(
    options=funding_type_options,
    description='Last Funding Type:',
    style={'description_width': 'initial'},
)

# Button that triggers the prediction.
predict_button = widgets.Button(description="Predict")

# Rendering order of the form (the widgets are displayed further down).
widgets_to_display = [
    CB_Rank_widget,
    Founded_Year_widget,
    Number_of_Founders_widget,
    Number_of_Funding_Rounds_widget,
    Last_Funding_Year_widget,
    Number_of_Employees_widget,
    State_widget,
    Last_Funding_Type_widget,
    predict_button,
]

# Function to handle the click event of the predict button
def on_predict_button_clicked(b):
    """Predict success for the values currently entered in the form.

    The widget values are turned into a single-row DataFrame whose columns are
    the feature names stored in ``xgb_clf_best`` itself, in the model's own
    order. Aligning by name removes the need for separately maintained
    category lists and guarantees the row cannot drift out of step with the
    columns the model was trained on.

    Parameters
    ----------
    b : object
        The clicked button, passed by ``Button.on_click``; not used.

    Side effects
    ------------
    Prints ``Prediction Result: Success`` or ``Prediction Result: Not Success``.
    """
    # Feature names recorded by the booster when it was fitted on a DataFrame.
    feature_names = xgb_clf_best.get_booster().feature_names
    if feature_names is None:
        # Model was fitted without column names; fall back to the training frame.
        feature_names = list(X_train.columns)

    # Numeric features are passed through unchanged.
    numeric_inputs = {
        'CB Rank (Company)': CB_Rank_widget.value,
        'Founded Year': Founded_Year_widget.value,
        'Number of Founders': Number_of_Founders_widget.value,
        'Number of Funding Rounds': Number_of_Funding_Rounds_widget.value,
        'Last Funding Year': Last_Funding_Year_widget.value,
    }
    # Categorical features are one-hot encoded below.
    categorical_inputs = {
        'Number of Employees': Number_of_Employees_widget.value,
        'State': State_widget.value,
        'Last Funding Type': Last_Funding_Type_widget.value,
    }

    # Start from an all-zero row, then switch on what the user supplied.
    row = dict.fromkeys(feature_names, 0)
    row.update(numeric_inputs)
    for column, choice in categorical_inputs.items():
        # pd.get_dummies names indicator columns "<column>_<category>".
        indicator = f"{column}_{choice}"
        # A category the model never saw has no indicator column; all of that
        # feature's indicators then stay 0.
        if indicator in row:
            row[indicator] = 1

    # One sample, with columns in exactly the order the model expects.
    model_input = pd.DataFrame([row], columns=feature_names)

    # Make the prediction
    prediction = xgb_clf_best.predict(model_input)

    # Interpret and print the prediction result
    prediction_result = "Success" if prediction[0] == 1 else "Not Success"
    print(f"Prediction Result: {prediction_result}")

# Run the prediction callback whenever the button is clicked.
predict_button.on_click(on_predict_button_clicked)

# Render the form: display() shows each positional argument in turn.
display(*widgets_to_display)

IntText(value=0, description='CB Rank (Company):', style=DescriptionStyle(description_width='initial'))

IntText(value=0, description='Founded Year:', style=DescriptionStyle(description_width='initial'))

IntText(value=0, description='Number of Founders:', style=DescriptionStyle(description_width='initial'))

IntText(value=0, description='Number of Funding Rounds:', style=DescriptionStyle(description_width='initial'))

IntText(value=0, description='Last Funding Year:', style=DescriptionStyle(description_width='initial'))

Dropdown(description='Number of Employees:', options=('1-10', '11-50', '51-100', '101-250', '251-500', '501-10…

Dropdown(description='State:', options=('Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', …

Dropdown(description='Last Funding Type:', options=('Angel', 'Convertible Note', 'Corporate Round', 'Debt Fina…

Button(description='Predict', style=ButtonStyle())

Prediction Result: Success
