Data Preprocessing:

In [12]:
import pandas as pd
import numpy as np
from datetime import datetime

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
df = pd.read_csv('/content/drive/My Drive/Statistical_Data/startup_failures.csv')

In [26]:
df.head(2)

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14


In [29]:
# Clean funding data
df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'].replace('-', '0'))



In [30]:
for col in date_columns:
    print(f"\nSample dates from {col}:")
    print(df[col].head())


Sample dates from founded_at:
0           NaN
1    2014-09-04
2           NaN
3    2007-01-01
4    2010-01-01
Name: founded_at, dtype: object

Sample dates from first_funding_at:
0    2015-01-05
1    2014-03-01
2    2014-01-30
3    2008-03-19
4    2014-07-24
Name: first_funding_at, dtype: object

Sample dates from last_funding_at:
0    2015-01-05
1    2014-10-14
2    2014-01-30
3    2008-03-19
4    2014-07-24
Name: last_funding_at, dtype: object


In [31]:
for col in date_columns:
    try:

      # Method 1: Try with coerce option to set invalid dates to NaT
        df[col] = pd.to_datetime(df[col], errors='coerce')

             # Print number of NaT values
        nat_count = df[col].isna().sum()
        print(f"Column {col}: {nat_count} invalid dates converted to NaT")

    except Exception as e:
        print(f"Error in column {col}: {str(e)}")


Column founded_at: 15225 invalid dates converted to NaT
Column first_funding_at: 27 invalid dates converted to NaT
Column last_funding_at: 0 invalid dates converted to NaT


In [32]:
# Clean up NaT values if needed
df = df.dropna(subset=date_columns)  # Remove rows with invalid dates

In [33]:
# Convert dates
date_columns = ['founded_at', 'first_funding_at', 'last_funding_at']
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

In [35]:
# Calculate company age
def calculate_age(founded_date):
    if pd.isna(founded_date):
        return None
    today = datetime.now()
    return (today - founded_date).days / 365.25

df['company_age'] = df['founded_at'].apply(calculate_age)

# Round to 2 decimal places if needed
df['company_age'] = df['company_age'].round(2)


Risk Scoring System:

In [36]:
def calculate_risk_score(row):
    score = 0

    # Funding rounds score
    score += min(row['funding_rounds'] * 10, 30)

    # Funding amount score
    if row['funding_total_usd'] > 10000000:
        score += 30
    elif row['funding_total_usd'] > 2000000:
        score += 20
    else:
        score += 10

    # Company age score
    if row['company_age'] > 3:
        score += 20
    elif row['company_age'] > 1:
        score += 10

    # Operating status score
    if row['status'] == 'operating':
        score += 20

    return score

df['risk_score'] = df.apply(calculate_risk_score, axis=1)


Industry Risk Analysis:

In [37]:
def analyze_industry_risk():
    # Split categories and create industry risk profile
    categories = df['category_list'].str.split('|', expand=True)
    categories_flat = categories.values.ravel()
    categories_flat = [x for x in categories_flat if str(x) != 'nan']

    industry_risk = pd.DataFrame({
        'category': pd.Series(categories_flat).value_counts().index,
        'count': pd.Series(categories_flat).value_counts().values
    })

    # Calculate average funding success by industry
    industry_risk['avg_funding'] = industry_risk['category'].apply(
        lambda x: df[df['category_list'].str.contains(x, na=False)]['funding_total_usd'].mean()
    )

    return industry_risk


Risk Classification:

In [38]:
def classify_risk(score):
    if score >= 70:
        return 'Low Risk'
    elif score >= 40:
        return 'Medium Risk'
    else:
        return 'High Risk'

df['risk_category'] = df['risk_score'].apply(classify_risk)


Credit Terms Assignment:

In [39]:
def assign_credit_terms(risk_category):
    terms = {
        'Low Risk': {
            'credit_limit': '500000',
            'interest_rate': '5%',
            'term_months': 36
        },
        'Medium Risk': {
            'credit_limit': '200000',
            'interest_rate': '8%',
            'term_months': 24
        },
        'High Risk': {
            'credit_limit': '50000',
            'interest_rate': '12%',
            'term_months': 12
        }
    }
    return terms.get(risk_category)

df['credit_terms'] = df['risk_category'].apply(assign_credit_terms)


Policy Validation:

In [40]:
def validate_policy():
    # Calculate risk distribution
    risk_distribution = df['risk_category'].value_counts()

    # Calculate average funding by risk category
    avg_funding_by_risk = df.groupby('risk_category')['funding_total_usd'].mean()

    # Calculate success rate by risk category
    success_rate = df.groupby('risk_category')['status'].apply(
        lambda x: (x == 'operating').mean()
    )

    return {
        'risk_distribution': risk_distribution,
        'avg_funding_by_risk': avg_funding_by_risk,
        'success_rate': success_rate
    }


Generate Reports:

In [41]:
def generate_risk_report():
    validation_results = validate_policy()
    industry_risks = analyze_industry_risk()

    report = {
        'risk_distribution': validation_results['risk_distribution'],
        'industry_risk_profile': industry_risks,
        'success_rates': validation_results['success_rate'],
        'high_risk_industries': industry_risks.nlargest(5, 'count'),
        'low_risk_industries': industry_risks.nsmallest(5, 'count')
    }

    return report


In [42]:
# Display basic information about the dataset
print("\nDataset Overview:")
print("-----------------")
print(f"Total number of companies: {len(df)}")
print("\nSample of the data:")
print(df.head())



Dataset Overview:
-----------------
Total number of companies: 51120

Sample of the data:
                             permalink                    name  \
1               /organization/-qounter                :Qounter   
3                /organization/0-6-com                 0-6.com   
4       /organization/004-technologies        004 Technologies   
6  /organization/0ndine-biomedical-inc  Ondine Biomedical Inc.   
7                 /organization/0xdata                  H2O.ai   

                        homepage_url  \
1             http://www.qounter.com   
3                 http://www.0-6.com   
4  http://004gmbh.de/en/004-interact   
6               http://ondinebio.com   
7                     http://h2o.ai/   

                                       category_list  funding_total_usd  \
1  Application Platforms|Real Time|Social Network...           700000.0   
3                                        Curated Web          2000000.0   
4                                           So

In [43]:

# Display risk distribution
print("\nRisk Category Distribution:")
print("-------------------------")
print(df['risk_category'].value_counts())


Risk Category Distribution:
-------------------------
risk_category
Medium Risk    26885
Low Risk       24234
High Risk          1
Name: count, dtype: int64


In [44]:
# Display average metrics by risk category
print("\nAverage Metrics by Risk Category:")
print("-------------------------------")
risk_metrics = df.groupby('risk_category').agg({
    'funding_total_usd': 'mean',
    'funding_rounds': 'mean',
    'company_age': 'mean'
}).round(2)
print(risk_metrics)


Average Metrics by Risk Category:
-------------------------------
               funding_total_usd  funding_rounds  company_age
risk_category                                                
High Risk              300000.00            1.00       -16.42
Low Risk             29617680.39            2.71        18.28
Medium Risk           3593803.98            1.06        16.46


In [45]:
# Display industry distribution
print("\nTop 10 Industries:")
print("----------------")
industry_counts = df['category_list'].value_counts().head(10)
print(industry_counts)



Top 10 Industries:
----------------
category_list
Software               3136
Biotechnology          2430
E-Commerce              999
Mobile                  883
Curated Web             794
Clean Technology        731
Hardware + Software     699
Enterprise Software     663
Health Care             634
Games                   625
Name: count, dtype: int64


In [46]:
# Save results to CSV (optional)
df.to_csv('credit_risk_analysis_results.csv', index=False)
print("\nResults have been saved to 'credit_risk_analysis_results.csv'")


Results have been saved to 'credit_risk_analysis_results.csv'


In [47]:
# Display sample credit terms
print("\nSample Credit Terms by Risk Category:")
print("----------------------------------")
for risk_level in ['Low Risk', 'Medium Risk', 'High Risk']:
    print(f"\n{risk_level}:")
    print(df[df['risk_category'] == risk_level]['credit_terms'].iloc[0])


Sample Credit Terms by Risk Category:
----------------------------------

Low Risk:
{'credit_limit': '500000', 'interest_rate': '5%', 'term_months': 36}

Medium Risk:
{'credit_limit': '200000', 'interest_rate': '8%', 'term_months': 24}

High Risk:
{'credit_limit': '50000', 'interest_rate': '12%', 'term_months': 12}


This implementation allows you to:

Score companies based on multiple risk factors
Classify them into risk categories
Assign appropriate credit terms
Validate the policy effectiveness
Generate comprehensive risk reports