In [12]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

# Load the dataset
data = pd.read_csv("investments_VC.csv", encoding='latin1')

# Initial class distribution
print("Initial class distribution in status:")
print(data["status"].value_counts())

# Impute missing values instead of dropping rows
# Fill missing numeric values with the median
data[" funding_total_usd "] = pd.to_numeric(data[" funding_total_usd "], errors="coerce")
data[" funding_total_usd "] = data[" funding_total_usd "].fillna(data[" funding_total_usd "].median())
data["funding_rounds"] = data["funding_rounds"].fillna(data["funding_rounds"].median())
data["founded_year"] = data["founded_year"].fillna(data["founded_year"].median())

# Fill missing status with the most frequent value
data["status"] = data["status"].fillna(data["status"].mode()[0])

# Select columns
data = data[[" funding_total_usd ", "funding_rounds", "founded_year", "status"]]

# Check dataset size and class distribution after imputation
print(f"Dataset size after imputation: {len(data)} rows")
print("Class distribution after imputation:")
print(data["status"].value_counts())

# Prepare features and target
X = data[[" funding_total_usd ", "funding_rounds", "founded_year"]].values
y = pd.factorize(data["status"])[0]  # 0-3: acquired, operating, closed, ipo

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert to PyTorch tensors
X = torch.from_numpy(X).type(torch.float)
y = torch.from_numpy(y).type(torch.LongTensor)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check train and test set sizes
print(f"Training set size: {len(X_train)} rows")
print(f"Test set size: {len(X_test)} rows")

# Define model
device = "cuda" if torch.cuda.is_available() else "cpu"
class StartupPredictor(nn.Module):
    def __init__(self, input_features=3, output_features=4, hidden_units=16):
        super().__init__()
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(input_features, hidden_units), nn.ReLU(),
            nn.Linear(hidden_units, hidden_units), nn.ReLU(),
            nn.Linear(hidden_units, output_features),
        )
    def forward(self, x):
        return self.linear_layer_stack(x)

model = StartupPredictor().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Training loop
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / len(y_true)) * 100

epochs = 100
for epoch in range(epochs):
    model.train()
    y_logits = model(X_train)
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1)
    loss = loss_fn(y_logits, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        model.eval()
        with torch.inference_mode():
            test_logits = model(X_test)
            test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)
            test_acc = accuracy_fn(y_test, test_pred)
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Test Acc: {test_acc:.2f}%")

# Check test set predictions
model.eval()
with torch.inference_mode():
    test_logits = model(X_test)
    test_pred = torch.softmax(test_logits, dim=1).argmax(dim=1)
print("Test set predictions:", test_pred.cpu().numpy())
print("Test set true labels:", y_test.cpu().numpy())

# Save model and scaler
torch.save(model.state_dict(), "startup_predictor.pth")
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Download files
from google.colab import files
files.download("startup_predictor.pth")
files.download("scaler.pkl")

Initial class distribution in status:
status
operating    41829
acquired      3692
closed        2603
Name: count, dtype: int64
Dataset size after imputation: 54294 rows
Class distribution after imputation:
status
operating    47999
acquired      3692
closed        2603
Name: count, dtype: int64
Training set size: 43435 rows
Test set size: 10859 rows
Epoch: 0 | Loss: 1.47399 | Test Acc: 0.07%
Epoch: 10 | Loss: 1.38817 | Test Acc: 0.10%
Epoch: 20 | Loss: 1.31113 | Test Acc: 82.96%
Epoch: 30 | Loss: 1.24093 | Test Acc: 88.02%
Epoch: 40 | Loss: 1.17658 | Test Acc: 88.31%
Epoch: 50 | Loss: 1.11901 | Test Acc: 88.40%
Epoch: 60 | Loss: 1.06648 | Test Acc: 88.41%
Epoch: 70 | Loss: 1.01774 | Test Acc: 88.42%
Epoch: 80 | Loss: 0.97240 | Test Acc: 88.47%
Epoch: 90 | Loss: 0.92949 | Test Acc: 88.50%
Test set predictions: [1 1 1 ... 1 1 1]
Test set true labels: [1 1 1 ... 1 1 1]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>