In [None]:
import kagglehub
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [None]:
# Download latest version
path = kagglehub.dataset_download("adilshamim8/startup-growth-and-investment-data")
print("Path to dataset files:", path)

# List files in the downloaded dataset folder
files = os.listdir(path)
print("Downloaded files:", files)

# os.listdir() returns entries in arbitrary order, so sort the candidates to
# make the choice of CSV reproducible, and fail with a clear message when the
# download contains no CSV at all (otherwise `csv_file` would be undefined
# and read_csv would die with a confusing NameError).
csv_candidates = sorted(name for name in files if name.endswith(".csv"))
if not csv_candidates:
    raise FileNotFoundError(f"No .csv file found in {path!r}; directory contains: {files}")
csv_file = os.path.join(path, csv_candidates[0])

# Load the dataset using pandas and drop the "Startup Name" column
# (presumably a free-text identifier rather than a model feature).
df = pd.read_csv(csv_file)
df = df.drop(columns=["Startup Name"])
df

In [None]:
# Encoding helper: label encoding by default, one-hot when one_hot=True
def encode_categorical(df, cols, one_hot=False):
    """Return a copy of ``df`` with the columns in ``cols`` encoded as numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified in place.
    cols : list of str
        Names of the categorical columns to encode.
    one_hot : bool, default False
        If True, each column in ``cols`` is replaced by 0/1 indicator columns
        produced by ``pd.get_dummies``, and any boolean column already present
        in the frame is converted to 0/1 as well. If False, each column's
        values are replaced by the integer codes of a fresh ``LabelEncoder``.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.
    """
    df = df.copy()
    if one_hot:
        # dtype=int makes get_dummies emit 0/1 integers directly, so no
        # element-wise pass over the frame is needed (DataFrame.applymap is
        # deprecated since pandas 2.1).
        df = pd.get_dummies(df, columns=cols, dtype=int)
        # Pre-existing boolean columns are mapped to 0/1 too, so the whole
        # frame ends up free of True/False values.
        bool_cols = df.select_dtypes(include="bool").columns
        if len(bool_cols):
            df = df.astype({col: int for col in bool_cols})
    else:
        for col in cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
    return df

# Log transform
def apply_log(df):
    """Return a copy of ``df`` with log1p applied to the two monetary columns.

    "Valuation (USD)" and "Investment Amount (USD)" are each replaced by
    ``np.log1p`` of their values; every other column is left as is and the
    input frame is not modified.
    """
    logged = df.copy()
    for column in ("Valuation (USD)", "Investment Amount (USD)"):
        logged[column] = np.log1p(logged[column])
    return logged

# Outlier removal (drops the top 1% by default)
def remove_outliers(df, column="Valuation (USD)", quantile=0.99):
    """Return the rows of ``df`` whose ``column`` value is below a quantile cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str, default "Valuation (USD)"
        Column whose distribution defines the cut-off.
    quantile : float, default 0.99
        Quantile of ``column`` used as the threshold. Rows are kept only when
        their value is strictly less than the threshold, so rows equal to it
        (and rows where the comparison is False, e.g. NaN) are dropped.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the original index.
    """
    threshold = df[column].quantile(quantile)
    # Filter first, then copy: only the surviving rows are duplicated, and the
    # caller gets a frame that is safe to assign into.
    return df[df[column] < threshold].copy()

# Feature engineering (run this before one-hot encoding)
def add_advanced_features(df, reference_year=2025):
    """Return a copy of ``df`` with five derived ratio features appended.

    The function groups by the raw "Industry" and "Country" columns, so it
    must be called while those columns still exist (i.e. before they are
    one-hot encoded).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Investment Amount (USD)", "Funding Rounds",
        "Number of Investors", "Year Founded", "Growth Rate (%)",
        "Industry" and "Country". It is not modified.
    reference_year : int, default 2025
        Year against which "Year Founded" is compared when computing
        "Funding Rounds Per Year".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns: "Investment Per Round",
        "Funding Efficiency Ratio", "Funding Rounds Per Year",
        "Growth Rate Relative to Industry", "Investment Relative to Country".
    """
    df = df.copy()
    investment = df["Investment Amount (USD)"]

    # The 1e-5 added to each denominator keeps the division finite when the
    # count is zero.
    df["Investment Per Round"] = investment / (df["Funding Rounds"] + 1e-5)
    df["Funding Efficiency Ratio"] = investment / (df["Number of Investors"] + 1e-5)

    # +1 so a startup founded in the reference year divides by 1, not 0.
    years_active = (reference_year - df["Year Founded"]) + 1
    df["Funding Rounds Per Year"] = df["Funding Rounds"] / years_active

    # Each row is expressed relative to the median of its own group.
    industry_median_growth = df.groupby("Industry")["Growth Rate (%)"].transform("median")
    df["Growth Rate Relative to Industry"] = df["Growth Rate (%)"] / industry_median_growth

    country_median_investment = df.groupby("Country")["Investment Amount (USD)"].transform("median")
    df["Investment Relative to Country"] = investment / country_median_investment
    return df

In [None]:
# Preview of the preprocessed frame: log transform -> label encoding ->
# outlier filter, each step working on the output of the previous one.
categorical_cols = ['Industry', 'Country']
df_proc = (
    df.pipe(apply_log)
      .pipe(encode_categorical, categorical_cols)
      .pipe(remove_outliers)
)
df_proc

In [None]:
# Hyperparameter Grids
# Nine regularisation strengths, log-spaced from 1e-4 to 1e4.
alphas = np.logspace(start=-4, stop=4, num=9)

# Estimators under comparison, keyed by the label used in the results.
models = dict(
    Linear=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(max_iter=20_000),
    Elastic=ElasticNet(max_iter=20_000),
)

# Search space per estimator. The "est__" prefix routes each parameter to the
# pipeline step named "est".
param_grids = dict(
    Linear=dict(
        est__fit_intercept=[True, False],
        est__positive=[False, True],
    ),
    Ridge=dict(
        est__alpha=alphas,
        est__fit_intercept=[True, False],
        est__solver=["auto"],
    ),
    Lasso=dict(
        est__alpha=alphas,
        est__fit_intercept=[True, False],
        est__selection=["cyclic", "random"],
    ),
    Elastic=dict(
        est__alpha=alphas,
        est__l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
        est__fit_intercept=[True, False],
    ),
)

In [None]:
# Model Training
# Each variant toggles the log transform ("log") and the top-1% outlier
# filter ("out"); in the names, O = applied and X = skipped.
variants = [
    {"name": "log O | out O", "log": True,  "out": True},
    {"name": "log O | out X", "log": True,  "out": False},
    {"name": "log X | out O", "log": False, "out": True},
    {"name": "log X | out X", "log": False, "out": False},
]

results = []

for v in variants:
    d = df.copy()
    if v["log"]:
        d = apply_log(d)
    d = encode_categorical(d, categorical_cols, one_hot=True)

    # Split BEFORE filtering outliers. Filtering the full frame first removes
    # the highest-valuation rows from the hold-out set as well (a filter based
    # on the target leaks into evaluation) and gives the "out O" variants an
    # easier test set than the "out X" ones. With a fixed random_state and an
    # unchanged row count, every variant is now scored on the same rows.
    d_tr, d_te = train_test_split(d, test_size=0.2, random_state=42)
    if v["out"]:
        d_tr = remove_outliers(d_tr)

    X_tr = d_tr.drop(columns=["Valuation (USD)"]).values
    y_tr = d_tr["Valuation (USD)"].values
    X_te = d_te.drop(columns=["Valuation (USD)"]).values
    y_te = d_te["Valuation (USD)"].values

    # In the log variants the model predicts log1p(valuation). Map targets and
    # predictions back to USD before scoring so that MAE / MSE / R2 are in the
    # same units for all four variants and can be compared side by side.
    to_usd = np.expm1 if v["log"] else np.asarray
    y_tr_usd = to_usd(y_tr)
    y_te_usd = to_usd(y_te)

    for m_name, model in models.items():
        pipe = Pipeline([("sc", StandardScaler()), ("est", model)])
        grid = GridSearchCV(
            pipe,
            param_grids[m_name],
            cv=KFold(n_splits=5, shuffle=True, random_state=42),
            scoring="neg_mean_absolute_error",
            n_jobs=-1,
        )
        grid.fit(X_tr, y_tr)

        # predictions (converted to USD where the model was fitted in log space)
        best_pipe = grid.best_estimator_
        y_pred_tr = to_usd(best_pipe.predict(X_tr))
        y_pred_te = to_usd(best_pipe.predict(X_te))

        # metrics
        results.append(
            {
                "Variant": v["name"],
                "Model": m_name,
                "MAE_train": mean_absolute_error(y_tr_usd, y_pred_tr),
                "MSE_train": mean_squared_error(y_tr_usd, y_pred_tr),
                "R2_train":  r2_score(y_tr_usd, y_pred_tr),
                "MAE_test":  mean_absolute_error(y_te_usd, y_pred_te),
                "MSE_test":  mean_squared_error(y_te_usd, y_pred_te),
                "R2_test":   r2_score(y_te_usd, y_pred_te),
                "Best Params": grid.best_params_,
            }
        )

In [None]:
# Results
# Show full "Best Params" dicts instead of truncating them.
pd.set_option("display.max_colwidth", None)
results_df = (
    pd.DataFrame(results)
    .sort_values(["Variant", "MAE_test", "Model"])
    .reset_index(drop=True)
)
print(results_df.to_string(index=False))


sns.set_style("whitegrid")
metrics = ["MAE_test", "MSE_test", "R2_test"]

# One heatmap per metric: models as rows, preprocessing variants as columns.
for metric in metrics:
    pivot = results_df.pivot(index="Model", columns="Variant", values=metric)
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.heatmap(pivot, annot=True, fmt=".3f", cmap="Blues", ax=ax)
    ax.set_title(f"Test {metric}")
    ax.set_ylabel("")
    plt.show()

# One grouped bar chart per metric: variants on the x axis, one bar per model.
for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.barplot(data=results_df, x="Variant", y=metric, hue="Model", ax=ax)
    ax.set_title(f"{metric} by Variant & Model")
    ax.legend(bbox_to_anchor=(1, 1))
    plt.show()

In [None]:
# Random forest: one grid search per preprocessing condition
# (log transform on/off x top-1% outlier filter on/off).
conditions = {
    "A": {"log": True, "outlier": True},
    "B": {"log": False, "outlier": True},
    "C": {"log": True, "outlier": False},
    "D": {"log": False, "outlier": False},
}

categorical_cols = ['Industry', 'Country']

# Grid search settings
param_grid_rf = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    # 1.0 means "consider all features", which is what "auto" meant for
    # regressors. "auto" itself was deprecated in scikit-learn 1.1 and removed
    # in 1.3, where every candidate using it fails to fit.
    "max_features": [1.0, "sqrt", "log2"],
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)
final_results_rf = []

for label, opts in conditions.items():
    df_exp = df.copy()
    if opts["log"]:
        df_exp = apply_log(df_exp)
    df_exp = encode_categorical(df_exp, categorical_cols, one_hot=False)

    # Split BEFORE filtering outliers so the hold-out rows are never selected
    # by their target value and are the same rows in every condition; the
    # filter is applied to the training portion only.
    df_train, df_test = train_test_split(df_exp, test_size=0.2, random_state=42)
    if opts["outlier"]:
        df_train = remove_outliers(df_train)

    X_train = df_train.drop(columns=["Valuation (USD)"])
    y_train = df_train["Valuation (USD)"]
    X_test = df_test.drop(columns=["Valuation (USD)"])
    y_test = df_test["Valuation (USD)"]

    model = RandomForestRegressor(random_state=42)
    # 432 candidates x 5 folds per condition: use every core.
    grid = GridSearchCV(model, param_grid_rf, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)

    print(f"Experiment {label}:")
    print(f"  Log: {opts['log']}, Outlier: {opts['outlier']}")
    grid.fit(X_train, y_train)

    # With the log transform the forest predicts log1p(valuation). Convert
    # targets and predictions back to USD so the metrics of all four
    # conditions share one scale and can be compared in the final table.
    to_usd = np.expm1 if opts["log"] else np.asarray
    y_train_usd = to_usd(y_train)
    y_test_usd = to_usd(y_test)
    y_pred_train = to_usd(grid.predict(X_train))
    y_pred_test = to_usd(grid.predict(X_test))

    final_results_rf.append({
        "log": opts["log"],
        "outlier": opts["outlier"],
        "train mae": mean_absolute_error(y_train_usd, y_pred_train),
        "train mse": mean_squared_error(y_train_usd, y_pred_train),
        "train r^2": r2_score(y_train_usd, y_pred_train),
        "test mae": mean_absolute_error(y_test_usd, y_pred_test),
        "test mse": mean_squared_error(y_test_usd, y_pred_test),
        "test r^2": r2_score(y_test_usd, y_pred_test),
        "best parameter": grid.best_params_
    })

results_df_RF = pd.DataFrame(final_results_rf)
results_df_RF