In [58]:
import pandas as pd

# Read the unicorn dataset from the notebook's working directory.
df = pd.read_csv('Unicorn_data.csv')

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Investors
0,ByteDance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."


In [59]:
# Report how many missing values each column has before any cleaning.
null_counts = df.isna().sum()
print(null_counts)

# Clean in one pass:
#   1. trim leading/trailing whitespace from the column headers,
#   2. discard rows lacking a valuation or an investor list,
#   3. remove the columns that are not used as model features.
df = (
    df.set_axis(df.columns.str.strip(), axis=1)
      .dropna(subset=['Valuation ($B)', 'Investors'])
      .drop(columns=['Company', 'Date Joined', 'City'])
)


Company            0
Valuation ($B)     0
Date Joined        0
Country            0
City               0
Industry           0
Investors         18
dtype: int64


In [60]:
# Keep only the first investor listed in the comma-separated string, trimmed.
first_investor = df['Investors'].str.split(',').str[0].str.strip()

# Replace the free-text investor list with that single-name column.
df = df.drop(columns=['Investors']).assign(Main_Investor=first_investor)

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the dummies are not perfectly collinear.
categorical_cols = ['Country', 'Industry', 'Main_Investor']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Convert valuation strings such as "$140" to floats by stripping "$" and ",".
# The pattern is a raw string: in a normal string literal "\$" is an invalid
# escape sequence, which triggers a warning on modern Python versions.
df['Valuation ($B)'] = (
    df['Valuation ($B)']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

# Define X and y: every remaining column is a feature, valuation is the target.
X = df.drop(columns=['Valuation ($B)'])
y = df['Valuation ($B)']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training portion (fit() returns the estimator).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and compute both metrics before reporting them.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("R² Score:", r2)
print("MAE:", mae)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use a clean Seaborn style
sns.set(style="whitegrid")

# Take the feature names and importances directly from the training matrix and
# the fitted model, so this cell does not rely on variables left over from
# cells that no longer exist (it must survive Restart & Run All).
features = X.columns
importances = model.feature_importances_

# Create DataFrame of features and importances (one row per model input column)
feat_df = pd.DataFrame({
    'feature': features,
    'importance': importances
})

# Map each feature to its group (Country, Industry, Main Investor, Other)
def map_group(name):
    """Return the category label for a one-hot feature column name.

    The label is chosen by the column-name prefix; the first matching prefix
    wins, and names matching none of them are labelled "Other".
    """
    prefix_to_group = (
        ("Country_", "Country"),
        ("Industry_", "Industry"),
        ("Main_Investor_", "Main Investor"),
    )
    for prefix, group in prefix_to_group:
        if name.startswith(prefix):
            return group
    return "Other"

# Tag every feature row with its category label.
feat_df['group'] = feat_df['feature'].apply(map_group)

# Sum importances per category, then order ascending for the horizontal bar chart.
group_totals = feat_df.groupby('group')['importance'].sum()
grouped = group_totals.sort_values(ascending=True)

# One viridis colour per category bar.
plt.figure(figsize=(10, 6))
colors = sns.color_palette("viridis", len(grouped))
bars = plt.barh(grouped.index, grouped.values, color=colors, edgecolor='black')

# Write each bar's total just past its right edge, vertically centred on the bar.
for rect in bars:
    total = rect.get_width()
    y_mid = rect.get_y() + rect.get_height() / 2
    plt.text(total + 0.002, y_mid, f'{total:.3f}', va='center', fontsize=10)

# Titles, axis labels and tick sizes.
plt.title("Feature Importance by Category", fontsize=16, fontweight='bold')
plt.xlabel("Total Importance", fontsize=12)
plt.ylabel("Feature Group", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=11)

# Dashed vertical gridlines only, then fit the layout before saving and showing.
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("feature_importance_grouped.png", dpi=300)  # high-resolution copy for sharing
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pair every model input column with the importance the fitted forest assigned it.
feat_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_,
})

# Keep the 20 highest-importance rows, largest first.
ranked = feat_imp_df.sort_values(by='Importance', ascending=False)
top_features = ranked.head(20)

# Plot styling.
sns.set(style="whitegrid")

# Horizontal bars, one viridis colour per feature.
plt.figure(figsize=(12, 8))
colors = sns.color_palette("viridis", len(top_features))
bars = plt.barh(top_features['Feature'], top_features['Importance'], color=colors)

plt.title("Top 20 Features Impacting Unicorn Valuation Prediction", fontsize=15, weight='bold')
plt.xlabel("Feature Importance", fontsize=12)
plt.gca().invert_yaxis()  # flip the axis so the most important feature is at the top

# Write each bar's importance just past its right edge, vertically centred on the bar.
for rect in bars:
    value = rect.get_width()
    y_mid = rect.get_y() + rect.get_height()/2
    plt.text(value + 0.002, y_mid, f'{value:.3f}', va='center', fontsize=10, color='black')

plt.tight_layout()
plt.show()

In [52]:
import pickle
import joblib

# Save the model. The context manager guarantees the file handle is flushed
# and closed, instead of leaving an anonymous open() result unclosed.
with open('unicorn_valuation_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the feature column names in training order alongside the model
# (presumably so inference code can rebuild the same one-hot layout).
joblib.dump(X.columns.tolist(), 'model_columns.pkl')

# Smoke test: predict for the first row of X. This row comes from the same
# data the model was built on, so it is not an out-of-sample check.
sample = X.iloc[[0]]
prediction = model.predict(sample)
print("Predicted Valuation ($B):", prediction[0])


Predicted Valuation ($B): 34.79019202380953
