In [53]:
# Colab-only helper: opens a browser file picker and uploads the chosen file(s)
# into the Colab runtime.
from google.colab import files
# NOTE(review): the recorded output of this cell shows the upload being saved as
# "Startup_Scoring_Dataset (2).csv" (presumably because a file with the original
# name already existed), while the loading code reads
# 'Startup_Scoring_Dataset.csv' — confirm the older copy is the intended input.
uploader = files.upload()

Saving Startup_Scoring_Dataset.csv to Startup_Scoring_Dataset (2).csv


In [54]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Load the startup dataset. The path is relative, so the CSV must sit in the
# current working directory under exactly this name.
df = pd.read_csv('Startup_Scoring_Dataset.csv')

def create_success_score(row, rng=None, noise_std=5):
    """Build a synthetic 0-100 "success score" for a single startup.

    The score is the sum of four capped components plus Gaussian noise,
    clipped to the range [0, 100]:

    * up to 40 points from ``monthly_active_users`` (cap reached at 100,000);
    * up to 30 points from ``valuation_inr`` divided by twelve months of
      ``monthly_burn_rate_inr`` (plus 1), multiplied by 5;
    * up to 15 points from ``team_experience`` multiplied by 1.5;
    * up to 15 points from ``market_size_million_usd`` (cap reached at 1,000).

    A component whose value is missing (NaN) contributes 0 points.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the five column names above, e.g. a DataFrame
        row as passed by ``df.apply(..., axis=1)``.
    rng : object with a ``normal(loc, scale)`` method, optional
        Source of the noise term, e.g. ``np.random.default_rng(seed)``.
        Defaults to NumPy's global random state, so ``np.random.seed`` still
        controls the result when no generator is passed.
    noise_std : float, default 5
        Standard deviation of the Gaussian noise; ``0`` disables the noise.

    Returns
    -------
    numpy floating scalar
        The noisy score, clipped to [0, 100].
    """
    def capped(points, cap):
        # Builtin ``min(cap, nan)`` returns ``cap`` (every comparison with NaN
        # is False), which would award full marks for a missing metric.
        # A missing metric earns no points instead.
        if pd.isna(points):
            return 0.0
        return min(cap, points)

    score = 0
    score += capped((row['monthly_active_users'] / 100000) * 40, 40)
    # The "+ 1" keeps the denominator non-zero when the burn rate is 0.
    score += capped((row['valuation_inr'] / (row['monthly_burn_rate_inr'] * 12 + 1)) * 5, 30)
    score += capped(row['team_experience'] * 1.5, 15)
    score += capped((row['market_size_million_usd'] / 1000) * 15, 15)

    # Fall back to the global NumPy random state to stay backward-compatible.
    noise_source = np.random if rng is None else rng
    return np.clip(score + noise_source.normal(0, noise_std), 0, 100)

# The target contains random Gaussian noise. Seed NumPy's global random state
# first so the target, and with it every metric and ranking derived from it,
# is identical on every run (42 matches the random_state used for the split).
np.random.seed(42)
df['success_score'] = df.apply(create_success_score, axis=1)

# Model inputs: the numeric columns used as predictors.
features = ['team_experience', 'market_size_million_usd', 'monthly_active_users',
            'monthly_burn_rate_inr', 'funds_raised_inr', 'valuation_inr']
# Missing predictor values are replaced by the column median, computed over
# all rows (i.e. before the train/test split).
X = df[features].fillna(df[features].median())
y = df['success_score']

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize using statistics learned from the training rows only; the test
# rows are transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors keyed by the name used in the report. The two tree
# ensembles are given a fixed random_state; the others use library defaults.
models = dict([
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ('GradientBoosting', GradientBoostingRegressor(random_state=42)),
    ('SVR', SVR()),
])

# Models that are trained on the standardized feature matrix; every other
# model is trained on the raw feature values.
scaled_model_names = {'SVR', 'Ridge', 'Lasso'}

# Fit every candidate, score it on the held-out test rows, and record its mean
# 5-fold cross-validated R² on the training rows.
results = {}
for name, model in models.items():
    needs_scaling = name in scaled_model_names
    if needs_scaling:
        fit_X, eval_X = X_train_scaled, X_test_scaled
    else:
        fit_X, eval_X = X_train, X_test

    model.fit(fit_X, y_train)
    preds = model.predict(eval_X)

    # Cross-validate on the *unscaled* training rows. For the scaled models the
    # scaler is wrapped in a pipeline so it is re-fit inside each fold;
    # cross-validating on X_train_scaled would let each validation fold
    # influence the scaling statistics. cross_val_score clones the estimator,
    # so the fitted `model` stored below is left untouched.
    cv_estimator = make_pipeline(StandardScaler(), model) if needs_scaling else model
    cv = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='r2')

    results[name] = {
        'model': model,          # fitted on the full training split
        'predictions': preds,    # predictions for the test split
        'r2': r2_score(y_test, preds),
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'cv_score': np.mean(cv)
    }

# The winner is the candidate with the highest mean cross-validated R².
cv_by_model = {candidate: entry['cv_score'] for candidate, entry in results.items()}
best_model_name = max(cv_by_model, key=cv_by_model.get)
best_model = results[best_model_name]['model']

# Score every startup in the dataset with the winning model, imputing missing
# predictors with the column medians first.
all_features = df[features]
X_all = all_features.fillna(all_features.median())
if best_model_name in ['SVR', 'Ridge', 'Lasso']:
    # These models were trained on standardized inputs.
    X_all_scaled = scaler.transform(X_all)
else:
    # Despite its name, this holds the raw features for the unscaled models.
    X_all_scaled = X_all
df['ml_predicted_score'] = best_model.predict(X_all_scaled)
# Rank 1 is the highest predicted score.
df['ml_rank'] = df['ml_predicted_score'].rank(ascending=False)

# Columns shown in the ranking tables: id, model prediction, synthetic target.
score_columns = ['startup_id', 'ml_predicted_score', 'success_score']

# Ten highest predicted scores.
top_ten = df[score_columns].sort_values('ml_predicted_score', ascending=False).head(10)
print(" TOP 10 STARTUPS:")
print(top_ten)
print()

# Ten lowest predicted scores (ascending sort, so the lowest comes first).
bottom_ten = df[score_columns].sort_values('ml_predicted_score').head(10)
print("🥉 BOTTOM 10 STARTUPS:")
print(bottom_ten)
print()

# One line per candidate: test-split R² and RMSE, then mean cross-validated R².
print(" MODEL COMPARISON:")
for name in results:
    info = results[name]
    print(f"{name:20} | R²: {info['r2']:.3f} | RMSE: {info['rmse']:.2f} | CV R²: {info['cv_score']:.3f}")

# The R² reported here is the winner's test-split R².
print(f"\n Best model: {best_model_name} with R² = {results[best_model_name]['r2']:.3f}")


 TOP 10 STARTUPS:
   startup_id  ml_predicted_score  success_score
5        S006           91.365045      93.410446
76       S077           88.822131      82.528475
96       S097           86.858843      86.894324
81       S082           84.954877      85.625395
32       S033           81.776109      78.563536
42       S043           81.387239      92.843781
28       S029           80.372360      79.594859
44       S045           80.346380      74.615032
12       S013           80.063706      89.780701
47       S048           77.620726      73.455253

🥉 BOTTOM 10 STARTUPS:
   startup_id  ml_predicted_score  success_score
54       S055           19.101507      16.033079
22       S023           23.119585      16.961221
87       S088           26.134480      27.090569
92       S093           27.587439      25.802280
83       S084           29.139640      18.816595
73       S074           29.854033      22.718964
51       S052           31.914569      28.906077
97       S098           32.1