# 🚀 InnovateSmart - Startup Success Predictor

## AI-Powered Investment Intelligence Platform

**Author**: Diprazz  
**Date**: 2025  

This notebook implements a machine learning system to predict startup success probability using various business and operational metrics.

## 1. Data Loading & Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import joblib
import warnings
# Install a global 'ignore' filter so no warnings are printed in cell output.
# NOTE(review): this also hides deprecation notices from the plotting and
# modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): presumably ordered this way so the style sheet does not
# override the "husl" colour cycle — confirm.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner marking the start of the notebook's printed output.
print("🚀 InnovateSmart - Startup Success Prediction Analysis")

In [None]:
# Read the startup records from the CSV file shipped alongside the notebook.
dataset_path = 'data/innovatesmart_startups.csv'
df = pd.read_csv(dataset_path)

# Quick orientation: frame dimensions, column names, and the share of rows
# whose 'Success_Label' equals 'Success'.
print("📊 Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nSuccess Rate: {(df['Success_Label'] == 'Success').mean():.1%}")

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# Compute both summaries up front, then print them under their headings.
summary_stats = df.describe()          # descriptive statistics from pandas
missing_counts = df.isnull().sum()     # number of null entries per column

print("📈 Basic Statistics:")
print(summary_stats)

print("\n🔍 Missing Values:")
print(missing_counts)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Average success probability by industry
# Mean of the 'Success_Probability' column per 'Industry', highest first, so
# the bars are drawn in descending order.
plt.figure(figsize=(12, 6))
industry_success = (
    df.groupby('Industry')['Success_Probability']
    .mean()
    .sort_values(ascending=False)
)

# seaborn >= 0.13 deprecates passing `palette` without `hue` (the warning is
# hidden by the global 'ignore' filter). Mapping hue to the same categorical
# variable keeps one colour per bar; dodge=False keeps the bars full-width and
# centred on seaborn versions that would otherwise offset hue levels.
industry_ax = sns.barplot(
    x=industry_success.values,
    y=industry_success.index,
    hue=industry_success.index,
    palette='viridis',
    dodge=False,
)
# The hue legend only repeats the y-axis labels; drop it if seaborn drew one.
industry_legend = industry_ax.get_legend()
if industry_legend is not None:
    industry_legend.remove()

plt.title('Average Success Probability by Industry', fontsize=14, fontweight='bold')
plt.xlabel('Success Probability')
plt.tight_layout()
plt.show()

In [None]:
# Funding vs Success
# Scatter of funding against success probability; point colour follows the
# 'Success_Label' column and point size follows 'Team_Size'.
funding_fig, funding_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Funding_Amount',
    y='Success_Probability',
    hue='Success_Label',
    size='Team_Size',
    sizes=(50, 200),
    ax=funding_ax,
)
funding_ax.set_title('Funding Amount vs Success Probability', fontsize=14, fontweight='bold')
funding_ax.set_xlabel('Funding Amount ($)')
funding_ax.set_ylabel('Success Probability')
# Anchor the legend outside the axes, to the right of the plot area.
funding_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
funding_fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# Pairwise correlations between the numeric columns of `df` only.
numeric_frame = df.select_dtypes(include=[np.number])
pairwise_corr = numeric_frame.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    pairwise_corr,
    annot=True,        # write each coefficient inside its cell
    cmap='coolwarm',
    center=0,          # centre the diverging colour map on zero
    square=True,
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
heatmap_fig.tight_layout()
plt.show()

## 3. Data Preprocessing & Feature Engineering

In [None]:
# Build the modelling frame on a copy so the loaded `df` is left untouched.
df_processed = df.copy()

# Integer-encode each categorical column into '<column>_encoded', keeping the
# fitted encoder per column so the same mapping can be reapplied later.
categorical_columns = ['Industry', 'Market_Size', 'Competition_Level']
label_encoders = {column_name: LabelEncoder() for column_name in categorical_columns}
for column_name, encoder in label_encoders.items():
    df_processed[column_name + '_encoded'] = encoder.fit_transform(df_processed[column_name])

# Binary target: 1 where 'Success_Label' is 'Success', otherwise 0.
df_processed['Target'] = df_processed['Success_Label'].eq('Success').astype(int)

# Columns fed to the model, in the order the model will expect them.
features = [
    'Funding_Amount',
    'Team_Size',
    'Founder_Experience',
    'Market_Size_encoded',
    'Competition_Level_encoded',
    'Customer_Acquisition_Cost',
    'Monthly_Growth_Rate',
    'Monthly_Burn_Rate',
    'Product_Readiness',
]

y = df_processed['Target']
X = df_processed[features]

print("✅ Features for modeling:")
print(features)
print(f"\n📊 Feature matrix shape: {X.shape}")
print(f"🎯 Target distribution: {y.value_counts().to_dict()}")

## 4. Machine Learning Model Training

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on `y` so both partitions keep the same class balance.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes and the positive-class share in each partition.
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Success rate in training: {y_train.mean():.1%}")
print(f"Success rate in test: {y_test.mean():.1%}")

In [None]:
# Random forest hyperparameters, collected in one place for easy tuning.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

# Fit the classifier on the training partition.
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hard class predictions plus the probability of class 1 (column index 1 of
# predict_proba) for the held-out rows.
test_probabilities = rf_model.predict_proba(X_test)
y_pred = rf_model.predict(X_test)
y_pred_proba = test_probabilities[:, 1]

print("✅ Random Forest Model Trained Successfully!")
print(f"Training Accuracy: {rf_model.score(X_train, y_train):.1%}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.1%}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.1%}")

## 5. Model Evaluation

In [None]:
# Classification report
# `labels=[0, 1]` pins the report rows to the two encoded target classes so
# they always line up with `target_names` (0 -> 'Failure', 1 -> 'Success').
# Without it the classes are inferred from y_test/y_pred, and the call raises
# if only one class happens to be present.
print("📊 Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['Failure', 'Success'],
))

In [None]:
# Confusion matrix
# Rows are the actual classes and columns the predicted classes. Passing
# `labels=[0, 1]` fixes the matrix to 2x2 in 0/1 order so it always matches
# the hard-coded tick labels, even if a class is absent from the test
# predictions.
class_names = ['Failure', 'Success']  # display names for target values 0 and 1

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

In [None]:
# Feature importance
# Pair each model input with the forest's importance score and sort so the
# most influential feature comes first.
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
# seaborn >= 0.13 deprecates passing `palette` without `hue` (the warning is
# hidden by the global 'ignore' filter). Mapping hue to the same categorical
# column keeps one colour per bar; dodge=False keeps the bars full-width and
# centred on seaborn versions that would otherwise offset hue levels.
importance_ax = sns.barplot(
    data=feature_importance,
    x='importance',
    y='feature',
    hue='feature',
    palette='rocket',
    dodge=False,
)
# The hue legend only repeats the y-axis labels; drop it if seaborn drew one.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()

plt.title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("🎯 Top 5 Most Important Features:")
print(feature_importance.head())

## 6. Model Saving & Deployment Preparation

In [None]:
# Persist everything needed to score new startups outside this notebook.
import os

# Make sure the output folder exists; a pre-existing folder is not an error.
os.makedirs('models', exist_ok=True)

# Destination path -> object to serialise. Written in this order: the trained
# model, the fitted categorical encoders, then the ordered feature list.
artifacts_to_save = {
    'models/innovatesmart_model.pkl': rf_model,
    'models/label_encoders.pkl': label_encoders,
    'models/feature_names.pkl': features,
}
for artifact_path, artifact in artifacts_to_save.items():
    joblib.dump(artifact, artifact_path)

print("💾 Model and artifacts saved successfully!")
print("📁 Files saved in 'models/' directory:")
print("   - innovatesmart_model.pkl (trained model)")
print("   - label_encoders.pkl (categorical encoders)")
print("   - feature_names.pkl (feature list)")

## 7. Prediction Demo

In [None]:
# Score one hand-written startup profile end to end with the trained model.
sample_data = {
    'Funding_Amount': 1200000,
    'Team_Size': 10,
    'Founder_Experience': 8,
    'Market_Size': 'Large',
    'Competition_Level': 'Medium',
    'Customer_Acquisition_Cost': 45,
    'Monthly_Growth_Rate': 12.5,
    'Monthly_Burn_Rate': 95000,
    'Product_Readiness': 80
}

# One-row frame so the profile goes through the same column-based steps as
# the training data.
sample_df = pd.DataFrame([sample_data])

# Reuse the encoders fitted during preprocessing so categories map to the
# same integer codes the model was trained on.
for categorical_col in ('Market_Size', 'Competition_Level'):
    fitted_encoder = label_encoders[categorical_col]
    sample_df[categorical_col + '_encoded'] = fitted_encoder.transform(sample_df[categorical_col])

# Select the model inputs in training order.
sample_features = sample_df[features]

# Probability of class 1 for the single row, and the hard class prediction.
sample_probabilities = rf_model.predict_proba(sample_features)
success_prob = sample_probabilities[0, 1]
prediction = rf_model.predict(sample_features)[0]

print("🎯 Demo Prediction:")
print(f"Startup Features: {sample_data}")
print(f"\n📊 Success Probability: {success_prob:.1%}")
print(f"🎯 Prediction: {'SUCCESS 🎉' if prediction == 1 else 'FAILURE ⚠️'}")

# Map the probability to a recommendation: above 0.7, above 0.4, or neither.
if success_prob > 0.7:
    recommendation = "💡 Recommendation: Strong investment candidate!"
elif success_prob > 0.4:
    recommendation = "💡 Recommendation: Moderate potential - monitor closely"
else:
    recommendation = "💡 Recommendation: High risk - requires improvements"
print(recommendation)

## 8. Conclusion & Next Steps

✅ **Model trained successfully with good performance**  
✅ **Feature importance analyzed**  
✅ **Model saved for deployment**  
✅ **Ready for Streamlit dashboard integration**  

**Next:** Run the Streamlit app with: `streamlit run app.py`