In [1]:
# =========================================
# Startup Risk Analyzer
# 06_startup_features.ipynb
# Cell 1: Load Cleaned Data
# =========================================

import pandas as pd

# Read the cleaned startup funding table from the working directory.
startup_df = pd.read_csv("startup_risk_cleaned.csv")

# Structural summary: (rows, columns) first, then the column names on their own line.
frame_shape = startup_df.shape
column_names = list(startup_df.columns)
print("Dataset Shape:", frame_shape)
print("\nColumns:", column_names, sep="\n")

# The bare trailing expression is what the notebook renders as the preview table.
print("\nSample Data:")
startup_df.head()


Dataset Shape: (2065, 9)

Columns:
['date', 'startup', 'industry', 'sub_industry', 'city', 'investment_type', 'amount_usd', 'amount_usd_clean', 'investment_type_grouped']

Sample Data:


Unnamed: 0,date,startup,industry,sub_industry,city,investment_type,amount_usd,amount_usd_clean,investment_type_grouped
0,09/01/2020,BYJU’S,e-tech,E-learning,bangalore,private equity round,200000000,200000000.0,private_equity
1,13/01/2020,Shuttl,transportation,App based shuttle service,gurgaon,series c,8048394,8048394.0,series_c
2,09/01/2020,Mamaearth,e-commerce,Retailer of baby and toddler products,bangalore,series b,18358860,18358860.0,series_b
3,02/01/2020,https://www.wealthbucket.in/,fintech,Online Investment,new delhi,pre-series a,3000000,3000000.0,series_a
4,02/01/2020,Fashor,fashion and apparel,Embroiled Clothes For Women,mumbai,seed round,1800000,1800000.0,seed


In [2]:
# =========================================
# Cell 2: Create Startup Risk Labels
# =========================================

# Cut points for the funding-based risk labels: the 33rd and 66th percentiles
# of the cleaned funding amount, kept as a plain NumPy array (lower, upper).
quantile_levels = [0.33, 0.66]
funding_amounts = startup_df["amount_usd_clean"]
funding_quantiles = funding_amounts.quantile(quantile_levels).to_numpy()

def assign_risk(amount, quantiles=None):
    """Map a funding amount (USD) to a funding-based risk label.

    Amounts at or below the lower threshold are "High Risk", amounts above it
    but at or below the upper threshold are "Medium Risk", and anything larger
    is "Low Risk". Both threshold comparisons are inclusive.

    Parameters
    ----------
    amount : float
        Funding amount to classify.
    quantiles : sequence of two numbers, optional
        ``(lower, upper)`` thresholds. When omitted, the notebook-level
        ``funding_quantiles`` array is used.

    Returns
    -------
    str
        One of "High Risk", "Medium Risk" or "Low Risk".

    Raises
    ------
    ValueError
        If ``amount`` is missing. Every comparison against NaN is False, so a
        missing amount would otherwise fall through to "Low Risk" and quietly
        corrupt the target labels.
    """
    if pd.isna(amount):
        raise ValueError(
            f"Cannot assign a risk level: funding amount is missing ({amount!r})"
        )

    # Resolve thresholds at call time so the notebook-level values are honoured.
    lower, upper = funding_quantiles if quantiles is None else quantiles

    if amount <= lower:
        return "High Risk"
    if amount <= upper:
        return "Medium Risk"
    return "Low Risk"

# Label every startup by passing its cleaned funding amount through assign_risk.
risk_labels = startup_df["amount_usd_clean"].map(assign_risk)
startup_df["risk_level"] = risk_labels

# Report the thresholds in use, then how many startups landed in each class.
print("Funding Quantiles (USD):", funding_quantiles)

risk_counts = startup_df["risk_level"].value_counts()
print("\nRisk Level Distribution:", risk_counts, sep="\n")


Funding Quantiles (USD): [ 650000. 5000000.]

Risk Level Distribution:
risk_level
Medium Risk    737
High Risk      684
Low Risk       644
Name: count, dtype: int64


In [3]:
# =========================================
# Cell 3: Feature Engineering
# =========================================

import numpy as np

# Reduce the skew of the funding amounts by storing log(1 + amount) as a new column.
funding_usd = startup_df["amount_usd_clean"]
startup_df["log_funding"] = np.log1p(funding_usd)

# Reduce category cardinality
def reduce_categories(series, top_n=10, other_label="other"):
    """Keep the ``top_n`` most frequent values of ``series`` and bucket the rest.

    Parameters
    ----------
    series : pandas.Series
        Categorical values to reduce. The input is not modified.
    top_n : int, default 10
        Number of most frequent distinct values to keep unchanged.
    other_label : str, default "other"
        Replacement for every value outside the ``top_n`` most frequent ones.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``series``. Missing values are not
        counted by ``value_counts`` and therefore also become ``other_label``.
    """
    top_categories = series.value_counts().nlargest(top_n).index
    # Vectorised membership test instead of a per-element Python callback.
    keep_mask = series.isin(top_categories)
    return series.where(keep_mask, other_label)

# Collapse the long tail of industries and cities: keep the 10 most frequent
# values of each column and let reduce_categories bucket everything else.
for raw_col in ("industry", "city"):
    startup_df[f"{raw_col}_reduced"] = reduce_categories(startup_df[raw_col], top_n=10)

# Show the resulting category frequencies (at most 12 rows per column).
# NOTE(review): the recorded output lists "ecommerce" and "e-commerce" as
# separate industries -- confirm whether the spellings should be merged upstream.
industry_counts = startup_df["industry_reduced"].value_counts()
print("Top industries after reduction:", industry_counts.head(12), sep="\n")

city_counts = startup_df["city_reduced"].value_counts()
print("\nTop cities after reduction:", city_counts.head(12), sep="\n")

# Final feature set
# NOTE(review): risk_level (the target below) is obtained by thresholding
# amount_usd_clean, and log_funding is a monotonic transform of that same
# column, so this one feature determines the label exactly. Confirm that
# keeping it among the model inputs is intended.
feature_cols = ["log_funding", "industry_reduced", "city_reduced", "investment_type_grouped"]

# X holds the model inputs, y the string risk labels.
X = startup_df[feature_cols]
y = startup_df["risk_level"]

print("\nFeature Matrix Shape:", X.shape)
print("Target Shape:", y.shape)


Top industries after reduction:
industry_reduced
other                666
consumer internet    590
technology           310
ecommerce            170
unknown              131
finance               57
healthcare            45
e-commerce            34
logistics             23
food and beverage     20
education             19
Name: count, dtype: int64

Top cities after reduction:
city_reduced
bangalore    582
mumbai       401
gurgaon      241
new delhi    241
other        165
unknown      135
chennai       75
hyderabad     72
pune          71
noida         55
ahmedabad     27
Name: count, dtype: int64

Feature Matrix Shape: (2065, 4)
Target Shape: (2065,)


In [4]:
# =========================================
# Cell 4: Encoding + Train-Test Split
# =========================================

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Columns grouped by treatment: the three categoricals are one-hot encoded,
# the single numeric column is forwarded unchanged.
categorical_cols = ["industry_reduced", "city_reduced", "investment_type_grouped"]
numeric_cols = ["log_funding"]

# Column transformer
# handle_unknown="ignore" (see scikit-learn docs) keeps transform() from
# failing on categories that were not present when the encoder was fitted.
one_hot = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", one_hot, categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

# Encode target: string risk labels -> integer class codes.
risk_encoder = LabelEncoder()
y_enc = risk_encoder.fit_transform(y)

# 80/20 train-test split with a fixed seed, stratified on the encoded target.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_enc}
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)

# Fit the preprocessor on the training rows only, then reuse it for the test rows.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

print("Encoded X_train shape:", X_train_enc.shape)
print("Encoded X_test shape:", X_test_enc.shape)

# Show which integer code each risk label was assigned.
print("\nTarget Encoding:")
class_names = risk_encoder.classes_
class_codes = risk_encoder.transform(class_names)
print({label: code for label, code in zip(class_names, class_codes)})


Encoded X_train shape: (1652, 30)
Encoded X_test shape: (413, 30)

Target Encoding:
{'High Risk': np.int64(0), 'Low Risk': np.int64(1), 'Medium Risk': np.int64(2)}


In [6]:
# -----------------------------
# Save artifacts (NO MODEL)
# -----------------------------
import joblib
import os

# Make sure both output directories exist before anything is written.
for out_dir in ("/content/data/processed", "/content/models"):
    os.makedirs(out_dir, exist_ok=True)

# Destination path -> object to persist. Encoded matrices and targets go under
# data/processed; the fitted preprocessor and label encoder go under models.
# Insertion order is the order in which the files are written.
artifacts = {
    "/content/data/processed/startup_X_train_enc.pkl": X_train_enc,
    "/content/data/processed/startup_X_test_enc.pkl": X_test_enc,
    "/content/data/processed/startup_y_train.pkl": y_train,
    "/content/data/processed/startup_y_test.pkl": y_test,
    "/content/models/startup_preprocessor.pkl": preprocessor,
    "/content/models/startup_risk_encoder.pkl": risk_encoder,
}
for target_path, artifact in artifacts.items():
    joblib.dump(artifact, target_path)

print("\n✅ Startup features & encoders saved successfully!")


✅ Startup features & encoders saved successfully!
