In [6]:
import pandas as pd
import numpy as np
import re
import ast
from collections import Counter
import string

In [7]:
df = pd.read_csv('companies_2024.csv')


In [8]:
def analyze_business_model(df):
    """Extract business model indicators from description"""
    business_model_patterns = {
        'platform': r'\bplatform\b',
        'saas': r'\bsaas\b|\bsoftware as a service\b',
        'marketplace': r'\bmarketplace\b',
        'api': r'\bapi\b|\bapplication programming interface\b',
        'protocol': r'\bprotocol\b',
        'b2b': r'\bb2b\b|\bbusiness to business\b|\benterprise\b',
        'b2c': r'\bb2c\b|\bbusiness to consumer\b|\bconsumer\b',
        'marketplace': r'\bmarketplace\b',
        'subscription': r'\bsubscription\b',
        'transaction': r'\btransaction\b|\bpayment\b',
    }

    results = {}
    for model, pattern in business_model_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[model] = {
            'count': mask.sum(),
            'percentage': (mask.sum() / len(df)) * 100,
            'companies': df[mask]['name'].tolist()[:10]
        }
    return pd.DataFrame(results).T
business_models = analyze_business_model(df)
print('=' * 60)
print('BUSINESS MODEL ANALYSIS')
print('=' * 60)
print(business_models[['count', 'percentage']].sort_values('count', ascending=False))

BUSINESS MODEL ANALYSIS
             count percentage
platform      3912  15.039213
saas          1288   4.951561
b2b            487   1.872213
marketplace    403   1.549285
transaction    196   0.753498
b2c            147   0.565124
api            102   0.392127
protocol        69   0.265262
subscription    58   0.222974


In [9]:
def analyze_ai_terminology(df):
    """Deep dive into AI-specific terminology"""
    ai_terms = {
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b|\bcv\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'ai_agent': r'\bai agent\b|\bagent\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'machine_learning': r'\bmachine learning\b',
        'automation': r'\bautomation\b|\bautomate\b',
        'prediction': r'\bpredict\w*\b',
    }

    results = {}
    for term, pattern in ai_terms.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[term] = mask.sum()
    return pd.Series(results).sort_values(ascending=False)

ai_terms = analyze_ai_terminology(df)
print('=' * 60)
print('AI Technology Analysis')
print('=' * 60)
for term, count in ai_terms.items():
    print(f"{term:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")

AI Technology Analysis
ai_powered          : 1,699 companies (6.5%)
automation          :   749 companies (2.9%)
ai_agent            :   202 companies (0.8%)
generative_ai       :   201 companies (0.8%)
prediction          :   134 companies (0.5%)
ai_native           :    81 companies (0.3%)
machine_learning    :    74 companies (0.3%)
llm                 :    68 companies (0.3%)
computer_vision     :    40 companies (0.2%)
nlp                 :    12 companies (0.0%)
deep_learning       :     8 companies (0.0%)
neural_network      :     1 companies (0.0%)


In [10]:
# ============================================================================
# 2. VALUE PROPOSITION ANALYSIS
# ============================================================================

def analyze_value_propositions(df):
    """Extract value proposition keywords"""
    value_prop_patterns = {
        'automation': r'\bautomate\w*\b|\bautomation\b',
        'efficiency': r'\befficien\w*\b',
        'cost_reduction': r'\bcost\b|\breduce cost\b|\bsave money\b',
        'speed': r'\bfast\b|\bquick\b|\bspeed\b|\breal.time\b',
        'security': r'\bsecure\b|\bsecurity\b|\bencrypt\w*\b',
        'scalability': r'\bscalable\b|\bscale\b',
        'innovation': r'\binnovative\b|\binnovation\b',
        'simplify': r'\bsimplif\w*\b|\beasy\b',
        'optimize': r'\boptimiz\w*\b',
    }
    
    results = {}
    for prop, pattern in value_prop_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[prop] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

value_props = analyze_value_propositions(df)
print("=" * 60)
print("VALUE PROPOSITION KEYWORDS")
print("=" * 60)
for prop, count in value_props.items():
    print(f"{prop:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


VALUE PROPOSITION KEYWORDS
automation          : 1,103 companies (4.2%)
speed               :   629 companies (2.4%)
innovation          :   627 companies (2.4%)
security            :   617 companies (2.4%)
simplify            :   443 companies (1.7%)
optimize            :   384 companies (1.5%)
efficiency          :   347 companies (1.3%)
scalability         :   325 companies (1.2%)
cost_reduction      :   128 companies (0.5%)


In [11]:
# ============================================================================
# 3. TECHNOLOGY STACK MENTIONS
# ============================================================================

def analyze_technology_mentions(df):
    """Extract specific technology mentions"""
    tech_patterns = {
        'ai': r'\bai\b|\bartificial intelligence\b',
        'blockchain': r'\bblockchain\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'cloud': r'\bcloud\b',
        'api': r'\bapi\b',
        'cryptocurrency': r'\bcryptocurrency\b|\bcrypto\b',
        'iot': r'\biot\b|\binternet of things\b',
        'robotics': r'\brobot\w*\b',
        'ar_vr': r'\baugmented reality\b|\bvirtual reality\b|\bar\b|\bvr\b',
    }
    
    results = {}
    for tech, pattern in tech_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[tech] = {
            'count': mask.sum(),
            'percentage': (mask.sum() / len(df)) * 100
        }
    
    return pd.DataFrame(results).T.sort_values('count', ascending=False)

tech_mentions = analyze_technology_mentions(df)
print("=" * 60)
print("TECHNOLOGY MENTIONS")
print("=" * 60)
print(tech_mentions)


TECHNOLOGY MENTIONS
                   count  percentage
ai                6585.0   25.315239
cloud              349.0    1.341688
blockchain         259.0    0.995694
cryptocurrency     235.0    0.903429
robotics           222.0    0.853452
generative_ai      201.0    0.772720
machine_learning   126.0    0.484392
api                102.0    0.392127
ar_vr               76.0    0.292173
iot                 69.0    0.265262
llm                 68.0    0.261418


In [12]:
# ============================================================================
# 4. TARGET MARKET INDICATORS
# ============================================================================

def analyze_target_markets(df):
    """Extract target market indicators"""
    market_patterns = {
        'enterprise': r'\benterprise\b|\blarge business\b',
        'smb': r'\bsmall business\b|\bsmb\b|\bmid.market\b',
        'consumer': r'\bconsumer\b|\bend user\b|\bhousehold\b',
        'healthcare': r'\bhealthcare\b|\bhealth care\b|\bmedical\b',
        'finance': r'\bfinance\b|\bfinancial\b|\bfintech\b',
        'retail': r'\bretail\b|\be.commerce\b',
        'education': r'\beducation\b|\bedtech\b',
        'government': r'\bgovernment\b|\bpublic sector\b',
    }
    
    results = {}
    for market, pattern in market_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[market] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

target_markets = analyze_target_markets(df)
print("=" * 60)
print("TARGET MARKET INDICATORS")
print("=" * 60)
for market, count in target_markets.items():
    print(f"{market:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


TARGET MARKET INDICATORS
finance             :   931 companies (3.6%)
healthcare          :   679 companies (2.6%)
retail              :   379 companies (1.5%)
education           :   311 companies (1.2%)
enterprise          :   214 companies (0.8%)
consumer            :   123 companies (0.5%)
government          :    62 companies (0.2%)
smb                 :    40 companies (0.2%)


In [13]:
# ============================================================================
# 5. COMPETITIVE POSITIONING LANGUAGE
# ============================================================================

def analyze_positioning(df):
    """Extract competitive positioning language"""
    positioning_patterns = {
        'leading': r'\bleading\b|\b#1\b|\btop\b',
        'first': r'\bfirst\b|\bpioneer\w*\b',
        'only': r'\bonly\b|\bunique\b',
        'revolutionary': r'\brevolutionary\b|\brevolutioniz\w*\b',
        'cutting_edge': r'\bcutting.edge\b|\badvanced\b',
        'next_generation': r'\bnext.generation\b|\bnext gen\b',
        'breakthrough': r'\bbreakthrough\b',
        'innovative': r'\binnovative\b|\binnovation\b',
    }
    
    results = {}
    for pos, pattern in positioning_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[pos] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

positioning = analyze_positioning(df)
print("=" * 60)
print("COMPETITIVE POSITIONING LANGUAGE")
print("=" * 60)
for pos, count in positioning.items():
    print(f"{pos:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


COMPETITIVE POSITIONING LANGUAGE
innovative          :   627 companies (2.4%)
cutting_edge        :   487 companies (1.9%)
first               :   458 companies (1.8%)
leading             :   448 companies (1.7%)
revolutionary       :   259 companies (1.0%)
only                :   166 companies (0.6%)
next_generation     :   132 companies (0.5%)
breakthrough        :    16 companies (0.1%)


In [14]:
# ============================================================================
# 6. ACTION VERB PATTERNS
# ============================================================================

def analyze_action_verbs(df):
    """Extract action verbs from descriptions"""
    action_verbs = {
        'develops': r'\bdevelop\w*\b',
        'builds': r'\bbuild\w*\b',
        'creates': r'\bcreat\w*\b',
        'offers': r'\boffer\w*\b',
        'provides': r'\bprovid\w*\b',
        'delivers': r'\bdeliver\w*\b',
        'enables': r'\benable\w*\b',
        'empowers': r'\bempower\w*\b',
        'facilitates': r'\bfacilitat\w*\b',
        'transforms': r'\btransform\w*\b',
        'revolutionizes': r'\brevolutioniz\w*\b',
        'connects': r'\bconnect\w*\b',
        'helps': r'\bhelp\w*\b',
        'supports': r'\bsupport\w*\b',
    }
    
    results = {}
    for verb, pattern in action_verbs.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[verb] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

action_verbs = analyze_action_verbs(df)
print("=" * 60)
print("ACTION VERB PATTERNS")
print("=" * 60)
for verb, count in action_verbs.items():
    print(f"{verb:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


ACTION VERB PATTERNS
offers              : 3,178 companies (12.2%)
provides            : 3,050 companies (11.7%)
develops            : 2,267 companies (8.7%)
helps               : 1,471 companies (5.7%)
creates             : 1,448 companies (5.6%)
builds              :   912 companies (3.5%)
supports            :   779 companies (3.0%)
connects            :   742 companies (2.9%)
delivers            :   590 companies (2.3%)
transforms          :   537 companies (2.1%)
empowers            :   472 companies (1.8%)
enables             :   327 companies (1.3%)
revolutionizes      :   226 companies (0.9%)
facilitates         :    52 companies (0.2%)


In [15]:
# ============================================================================
# 7. REGULATORY AND COMPLIANCE SIGNALS
# ============================================================================

def analyze_compliance(df):
    """Extract regulatory and compliance mentions"""
    compliance_patterns = {
        'secure': r'\bsecure\b|\bsecurity\b',
        'encrypted': r'\bencrypt\w*\b',
        'compliant': r'\bcompliant\b|\bcompliance\b',
        'gdpr': r'\bgdpr\b',
        'hipaa': r'\bhipaa\b',
        'soc2': r'\bsoc.2\b|\bsoc2\b',
        'privacy': r'\bprivacy\b',
        'certified': r'\bcertified\b',
        'fda': r'\bfda\b',
        'regulatory': r'\bregulatory\b',
    }
    
    results = {}
    for comp, pattern in compliance_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[comp] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

compliance = analyze_compliance(df)
print("=" * 60)
print("REGULATORY AND COMPLIANCE SIGNALS")
print("=" * 60)
for comp, count in compliance.items():
    print(f"{comp:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


REGULATORY AND COMPLIANCE SIGNALS
secure              :   607 companies (2.3%)
compliant           :   329 companies (1.3%)
privacy             :    69 companies (0.3%)
regulatory          :    55 companies (0.2%)
certified           :    41 companies (0.2%)
encrypted           :    18 companies (0.1%)
gdpr                :     6 companies (0.0%)
hipaa               :     5 companies (0.0%)
fda                 :     4 companies (0.0%)
soc2                :     2 companies (0.0%)


In [16]:
# ============================================================================
# 8. TRENDING CONCEPTS ANALYSIS
# ============================================================================

def analyze_trending_concepts(df):
    """Identify trending concepts and buzzwords"""
    trending_patterns = {
        'agentic': r'\bagentic\b',
        'autonomous': r'\bautonomous\b',
        'decentralized': r'\bdecentralized\b|\bdefi\b',
        'sustainable': r'\bsustainable\b|\bsustainability\b',
        'green': r'\bgreen\b',
        'carbon': r'\bcarbon\b',
        'renewable': r'\brenewable\b',
        'web3': r'\bweb3\b|\bweb.3\b',
        'metaverse': r'\bmetaverse\b',
        'quantum': r'\bquantum\b',
    }
    
    results = {}
    for trend, pattern in trending_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        results[trend] = mask.sum()
    
    return pd.Series(results).sort_values(ascending=False)

trending = analyze_trending_concepts(df)
print("=" * 60)
print("TRENDING CONCEPTS")
print("=" * 60)
for trend, count in trending.items():
    print(f"{trend:20s}: {count:5,} companies ({count/len(df)*100:.1f}%)")


TRENDING CONCEPTS
sustainable         :   280 companies (1.1%)
decentralized       :   213 companies (0.8%)
web3                :   162 companies (0.6%)
autonomous          :   139 companies (0.5%)
agentic             :   116 companies (0.4%)
carbon              :    85 companies (0.3%)
green               :    69 companies (0.3%)
renewable           :    65 companies (0.2%)
quantum             :    55 companies (0.2%)
metaverse           :    19 companies (0.1%)


In [17]:
# ============================================================================
# 9. DESCRIPTION QUALITY METRICS
# ============================================================================

def analyze_description_quality(df):
    """Analyze description completeness and quality"""
    df['desc_length'] = df['short_description'].str.len()
    df['desc_word_count'] = df['short_description'].str.split().str.len()
    df['desc_sentence_count'] = df['short_description'].str.count(r'[.!?]')
    
    # Check for vague descriptions
    vague_terms = r'\bcompany\b|\bbusiness\b|\borganization\b'
    df['has_vague_terms'] = df['short_description'].str.contains(vague_terms, case=False, na=False, regex=True)
    
    # Check for specific details
    specific_indicators = r'\bplatform\b|\bsoftware\b|\bservice\b|\bsolution\b|\bproduct\b'
    df['has_specific_terms'] = df['short_description'].str.contains(specific_indicators, case=False, na=False, regex=True)
    
    quality_stats = {
        'avg_length': df['desc_length'].mean(),
        'median_length': df['desc_length'].median(),
        'avg_word_count': df['desc_word_count'].mean(),
        'median_word_count': df['desc_word_count'].median(),
        'very_short_descriptions': (df['desc_length'] < 30).sum(),
        'vague_descriptions': df['has_vague_terms'].sum(),
        'specific_descriptions': df['has_specific_terms'].sum(),
    }
    
    return quality_stats

quality = analyze_description_quality(df)
print("=" * 60)
print("DESCRIPTION QUALITY METRICS")
print("=" * 60)
for metric, value in quality.items():
    if isinstance(value, float):
        print(f"{metric:30s}: {value:.2f}")
    else:
        print(f"{metric:30s}: {value:,}")


DESCRIPTION QUALITY METRICS
avg_length                    : 89.84
median_length                 : 99.00
avg_word_count                : 12.63
median_word_count             : 14.00
very_short_descriptions       : 2,073
vague_descriptions            : 3,802
specific_descriptions         : 6,163


In [18]:
# ============================================================================
# 10. INDUSTRY-SPECIFIC TERMINOLOGY CLUSTERS
# ============================================================================

def analyze_industry_terminology(df):
    """Extract industry-specific terminology"""
    industry_terms = {
        'healthcare': {
            'therapeutics': r'\btherapeutic\w*\b',
            'diagnostics': r'\bdiagnostic\w*\b',
            'clinical': r'\bclinical\b',
            'patient': r'\bpatient\b',
            'drug_discovery': r'\bdrug discovery\b',
        },
        'fintech': {
            'payments': r'\bpayment\w*\b',
            'lending': r'\blending\b',
            'trading': r'\btrading\b',
            'defi': r'\bdefi\b|\bdecentralized finance\b',
            'crypto': r'\bcryptocurrency\b|\bcrypto\b',
        },
        'enterprise': {
            'erp': r'\berp\b',
            'crm': r'\bcrm\b',
            'workflow': r'\bworkflow\b',
            'enterprise_software': r'\benterprise software\b',
        },
    }
    
    results = {}
    for industry, terms in industry_terms.items():
        industry_results = {}
        for term, pattern in terms.items():
            mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
            industry_results[term] = mask.sum()
        results[industry] = industry_results
    
    return results

industry_terms = analyze_industry_terminology(df)
print("=" * 60)
print("INDUSTRY-SPECIFIC TERMINOLOGY")
print("=" * 60)
for industry, terms in industry_terms.items():
    print(f"\n{industry.upper()}:")
    for term, count in sorted(terms.items(), key=lambda x: x[1], reverse=True):
        print(f"  {term:20s}: {count:5,} companies")


INDUSTRY-SPECIFIC TERMINOLOGY

HEALTHCARE:
  patient             :    94 companies
  therapeutics        :    91 companies
  clinical            :    86 companies
  diagnostics         :    56 companies
  drug_discovery      :    21 companies

FINTECH:
  payments            :   361 companies
  crypto              :   235 companies
  trading             :   201 companies
  defi                :    78 companies
  lending             :    45 companies

ENTERPRISE:
  crm                 :   148 companies
  workflow            :    77 companies
  erp                 :    49 companies
  enterprise_software :    11 companies


In [19]:
# ============================================================================
# 11. N-GRAMS AND PHRASE PATTERNS
# ============================================================================

def extract_ngrams(df, n=2, min_freq=10):
    """Extract common n-grams (phrases) from descriptions"""
    all_ngrams = []
    
    for desc in df['short_description'].dropna():
        # Clean and tokenize
        words = desc.lower().translate(str.maketrans('', '', string.punctuation)).split()
        # Create n-grams
        for i in range(len(words) - n + 1):
            ngram = ' '.join(words[i:i+n])
            if len(ngram) > 3:  # Filter very short phrases
                all_ngrams.append(ngram)
    
    ngram_counts = Counter(all_ngrams)
    # Filter by minimum frequency
    common_ngrams = {phrase: count for phrase, count in ngram_counts.items() if count >= min_freq}
    
    return pd.Series(common_ngrams).sort_values(ascending=False)

# Extract bigrams (2-word phrases)
print("=" * 60)
print("TOP 30 COMMON PHRASES (BIGRAMS)")
print("=" * 60)
bigrams = extract_ngrams(df, n=2, min_freq=20)
for phrase, count in bigrams.head(30).items():
    print(f"{phrase:40s}: {count:4,} occurrences")

# Extract trigrams (3-word phrases)
print("\n" + "=" * 60)
print("TOP 20 COMMON PHRASES (TRIGRAMS)")
print("=" * 60)
trigrams = extract_ngrams(df, n=3, min_freq=10)
for phrase, count in trigrams.head(20).items():
    print(f"{phrase:50s}: {count:4,} occurrences")


TOP 30 COMMON PHRASES (BIGRAMS)
is a                                    : 5,734 occurrences
is an                                   : 2,190 occurrences
company that                            : 1,352 occurrences
platform that                           : 1,157 occurrences
that offers                             :  834 occurrences
platform for                            :  824 occurrences
that provides                           :  566 occurrences
in the                                  :  525 occurrences
specializes in                          :  518 occurrences
an ai                                   :  507 occurrences
solutions for                           :  460 occurrences
digital marketing                       :  459 occurrences
a platform                              :  455 occurrences
real estate                             :  432 occurrences
an aipowered                            :  430 occurrences
designed to                             :  406 occurrences
is the              

In [24]:
# ============================================================================
# 12. DESCRIPTION COMPLETENESS SCORE
# ============================================================================
"""
def calculate_completeness_score(df):
   
    scores = []
    
    for idx, desc in df['short_description'].items():
        score = 0
        
        # Length score (0-3 points)
        if pd.notna(desc):
            length = len(desc)
            if length > 100:
                score += 3
            elif length > 50:
                score += 2
            elif length > 20:
                score += 1
        
        # Specificity score (0-2 points)
        specific_terms = ['platform', 'software', 'service', 'solution', 'product', 'system']
        if any(term in desc.lower() for term in specific_terms):
            score += 2
        elif any(term in desc.lower() for term in ['company', 'business', 'organization']):
            score += 1
        
        # Action verb score (0-2 points)
        action_verbs = ['develops', 'builds', 'creates', 'offers', 'provides', 'enables']
        if any(verb in desc.lower() for verb in action_verbs):
            score += 2
        
        # Technology mention score (0-2 points)
        tech_terms = ['ai', 'blockchain', 'cloud', 'api', 'software', 'technology']
        if any(term in desc.lower() for term in tech_terms):
            score += 2
        
        # Target market score (0-1 point)
        market_terms = ['enterprise', 'consumer', 'business', 'customer']
        if any(term in desc.lower() for term in market_terms):
            score += 1
        
        scores.append(score)
    
    return pd.Series(scores, index=df.index)

df['completeness_score'] = calculate_completeness_score(df)

print("=" * 60)
print("DESCRIPTION COMPLETENESS SCORES")
print("=" * 60)
print(f"Average completeness score: {df['completeness_score'].mean():.2f}/10")
print(f"Median completeness score: {df['completeness_score'].median():.2f}/10")
print(f"\nScore distribution:")
print(df['completeness_score'].value_counts().sort_index())

# Companies with low completeness
low_completeness = df[df['completeness_score'] <= 3]
print(f"\nCompanies with low completeness (≤3): {len(low_completeness):,}")
print("\nSample low-completeness descriptions:")
for name, desc in zip(low_completeness['name'].head(10), low_completeness['short_description'].head(10)):
    print(f"  {name}: {desc[:80]}...")
"""

'\ndef calculate_completeness_score(df):\n\n    scores = []\n\n    for idx, desc in df[\'short_description\'].items():\n        score = 0\n\n        # Length score (0-3 points)\n        if pd.notna(desc):\n            length = len(desc)\n            if length > 100:\n                score += 3\n            elif length > 50:\n                score += 2\n            elif length > 20:\n                score += 1\n\n        # Specificity score (0-2 points)\n        specific_terms = [\'platform\', \'software\', \'service\', \'solution\', \'product\', \'system\']\n        if any(term in desc.lower() for term in specific_terms):\n            score += 2\n        elif any(term in desc.lower() for term in [\'company\', \'business\', \'organization\']):\n            score += 1\n\n        # Action verb score (0-2 points)\n        action_verbs = [\'develops\', \'builds\', \'creates\', \'offers\', \'provides\', \'enables\']\n        if any(verb in desc.lower() for verb in action_verbs):\n           

In [25]:
# ============================================================================
# 13. COMPREHENSIVE FEATURE EXTRACTION
# ============================================================================

def extract_all_features(df):
    """Create a comprehensive feature set from descriptions"""
    features = pd.DataFrame(index=df.index)
    
    # Basic metrics
    features['desc_length'] = df['short_description'].str.len()
    features['desc_word_count'] = df['short_description'].str.split().str.len()
    features['desc_sentence_count'] = df['short_description'].str.count(r'[.!?]')
    
    # Business model
    features['is_platform'] = df['short_description'].str.contains(r'\bplatform\b', case=False, na=False, regex=True)
    features['is_saas'] = df['short_description'].str.contains(r'\bsaas\b', case=False, na=False, regex=True)
    features['is_api'] = df['short_description'].str.contains(r'\bapi\b', case=False, na=False, regex=True)
    features['is_marketplace'] = df['short_description'].str.contains(r'\bmarketplace\b', case=False, na=False, regex=True)
    
    # Technology
    features['mentions_ai'] = df['short_description'].str.contains(r'\bai\b|\bartificial intelligence\b', case=False, na=False, regex=True)
    features['mentions_blockchain'] = df['short_description'].str.contains(r'\bblockchain\b', case=False, na=False, regex=True)
    features['mentions_cloud'] = df['short_description'].str.contains(r'\bcloud\b', case=False, na=False, regex=True)
    features['mentions_ml'] = df['short_description'].str.contains(r'\bmachine learning\b', case=False, na=False, regex=True)
    
    # Value props
    features['mentions_automation'] = df['short_description'].str.contains(r'\bautomate\w*\b', case=False, na=False, regex=True)
    features['mentions_security'] = df['short_description'].str.contains(r'\bsecure\b|\bsecurity\b', case=False, na=False, regex=True)
    features['mentions_efficiency'] = df['short_description'].str.contains(r'\befficien\w*\b', case=False, na=False, regex=True)
    
    # Target market
    features['targets_enterprise'] = df['short_description'].str.contains(r'\benterprise\b', case=False, na=False, regex=True)
    features['targets_consumer'] = df['short_description'].str.contains(r'\bconsumer\b', case=False, na=False, regex=True)
    features['targets_healthcare'] = df['short_description'].str.contains(r'\bhealthcare\b|\bhealth care\b', case=False, na=False, regex=True)
    
    # Positioning
    features['uses_leading'] = df['short_description'].str.contains(r'\bleading\b', case=False, na=False, regex=True)
    features['uses_innovative'] = df['short_description'].str.contains(r'\binnovative\b', case=False, na=False, regex=True)
    features['uses_revolutionary'] = df['short_description'].str.contains(r'\brevolutionary\b', case=False, na=False, regex=True)
    
    # Action verbs
    features['uses_develops'] = df['short_description'].str.contains(r'\bdevelop\w*\b', case=False, na=False, regex=True)
    features['uses_builds'] = df['short_description'].str.contains(r'\bbuild\w*\b', case=False, na=False, regex=True)
    features['uses_offers'] = df['short_description'].str.contains(r'\boffer\w*\b', case=False, na=False, regex=True)
    features['uses_provides'] = df['short_description'].str.contains(r'\bprovid\w*\b', case=False, na=False, regex=True)
    features['uses_enables'] = df['short_description'].str.contains(r'\benable\w*\b', case=False, na=False, regex=True)
    
    # Compliance
    features['mentions_secure'] = df['short_description'].str.contains(r'\bsecure\b|\bsecurity\b', case=False, na=False, regex=True)
    features['mentions_privacy'] = df['short_description'].str.contains(r'\bprivacy\b', case=False, na=False, regex=True)
    features['mentions_compliant'] = df['short_description'].str.contains(r'\bcompliant\b', case=False, na=False, regex=True)
    
    # Convert boolean to int
    bool_cols = features.select_dtypes(include=['bool']).columns
    features[bool_cols] = features[bool_cols].astype(int)
    
    return features

description_features = extract_all_features(df)
print("=" * 60)
print("COMPREHENSIVE FEATURE EXTRACTION")
print("=" * 60)
print(f"Extracted {len(description_features.columns)} features for {len(description_features)} companies")
print("\nFeature summary:")
print(description_features.sum().sort_values(ascending=False))


COMPREHENSIVE FEATURE EXTRACTION
Extracted 28 features for 26012 companies

Feature summary:
desc_length            2336666.0
desc_word_count         328447.0
desc_sentence_count      20506.0
mentions_ai               6585.0
is_platform               3912.0
uses_offers               3178.0
uses_provides             3050.0
uses_develops             2267.0
is_saas                   1287.0
uses_builds                912.0
mentions_automation        630.0
mentions_security          607.0
mentions_secure            607.0
targets_healthcare         434.0
is_marketplace             403.0
uses_innovative            390.0
mentions_cloud             349.0
mentions_efficiency        347.0
uses_enables               327.0
mentions_blockchain        259.0
uses_leading               225.0
targets_enterprise         214.0
targets_consumer           112.0
is_api                     102.0
mentions_ml                 74.0
mentions_privacy            69.0
mentions_compliant          41.0
uses_revolutiona

In [26]:
# ============================================================================
# 14. SECTOR-SPECIFIC DESCRIPTION ANALYSIS
# ============================================================================

def analyze_sector_descriptions(df):
    """Compare description patterns across different sectors"""
    sectors = {
        'AI': df['categories'].str.contains('Artificial Intelligence', na=False, case=False),
        'Blockchain': df['categories'].str.contains('Blockchain', na=False, case=False),
        'Healthcare': df['categories'].str.contains('Health Care', na=False, case=False),
        'FinTech': df['categories'].str.contains('FinTech', na=False, case=False),
    }
    
    sector_stats = {}
    for sector_name, mask in sectors.items():
        sector_df = df[mask]
        if len(sector_df) > 0:
            sector_stats[sector_name] = {
                'count': len(sector_df),
                'avg_length': sector_df['short_description'].str.len().mean(),
                'avg_words': sector_df['short_description'].str.split().str.len().mean(),
                'mentions_ai': sector_df['short_description'].str.contains(r'\bai\b', case=False, na=False, regex=True).sum(),
                'mentions_platform': sector_df['short_description'].str.contains(r'\bplatform\b', case=False, na=False, regex=True).sum(),
                'avg_completeness': sector_df['completeness_score'].mean() if 'completeness_score' in sector_df.columns else 0,
            }
    
    return pd.DataFrame(sector_stats).T

sector_comparison = analyze_sector_descriptions(df)
print("=" * 60)
print("SECTOR-SPECIFIC DESCRIPTION PATTERNS")
print("=" * 60)
print(sector_comparison)


SECTOR-SPECIFIC DESCRIPTION PATTERNS
             count  avg_length  avg_words  mentions_ai  mentions_platform  \
AI          5036.0   94.620258  13.245680       3586.0             1092.0   
Blockchain   620.0   92.285484  12.900000        114.0              129.0   
Healthcare  1302.0   96.559140  13.218894        333.0              196.0   
FinTech      814.0   93.750614  12.993857        191.0              204.0   

            avg_completeness  
AI                       0.0  
Blockchain               0.0  
Healthcare               0.0  
FinTech                  0.0  


In [27]:
# ============================================================================
# 15. EXPORT RESULTS
# ============================================================================

# Save comprehensive features
description_features.to_csv('description_features.csv', index=False)
print("Description features saved to 'description_features.csv'")

# Create summary statistics
summary_data = {
    'Analysis_Type': [],
    'Metric': [],
    'Count': [],
    'Percentage': []
}

# Business Models
for model, row in business_models.iterrows():
    summary_data['Analysis_Type'].append('Business Model')
    summary_data['Metric'].append(model)
    summary_data['Count'].append(int(row['count']))
    summary_data['Percentage'].append(row['percentage'])

# Value Propositions
for prop, count in value_props.items():
    summary_data['Analysis_Type'].append('Value Proposition')
    summary_data['Metric'].append(prop)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# Technology Mentions
for tech, row in tech_mentions.iterrows():
    summary_data['Analysis_Type'].append('Technology')
    summary_data['Metric'].append(tech)
    summary_data['Count'].append(int(row['count']))
    summary_data['Percentage'].append(row['percentage'])

# Target Markets
for market, count in target_markets.items():
    summary_data['Analysis_Type'].append('Target Market')
    summary_data['Metric'].append(market)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# AI Terminology
for term, count in ai_terms.items():
    summary_data['Analysis_Type'].append('AI Terminology')
    summary_data['Metric'].append(term)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# Positioning
for pos, count in positioning.items():
    summary_data['Analysis_Type'].append('Positioning')
    summary_data['Metric'].append(pos)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# Action Verbs
for verb, count in action_verbs.items():
    summary_data['Analysis_Type'].append('Action Verb')
    summary_data['Metric'].append(verb)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# Compliance
for comp, count in compliance.items():
    summary_data['Analysis_Type'].append('Compliance')
    summary_data['Metric'].append(comp)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

# Trending Concepts
for trend, count in trending.items():
    summary_data['Analysis_Type'].append('Trending Concept')
    summary_data['Metric'].append(trend)
    summary_data['Count'].append(int(count))
    summary_data['Percentage'].append((count / len(df)) * 100)

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('description_analysis_summary.csv', index=False)
print("Summary statistics saved to 'description_analysis_summary.csv'")

print("\n" + "=" * 60)
print("ANALYSIS COMPLETE!")
print("=" * 60)
print(f"Analyzed {len(df):,} company descriptions")
print("Results exported to CSV files:")
print("  - description_features.csv (comprehensive feature matrix)")
print("  - description_analysis_summary.csv (summary statistics)")


Description features saved to 'description_features.csv'
Summary statistics saved to 'description_analysis_summary.csv'

ANALYSIS COMPLETE!
Analyzed 26,012 company descriptions
Results exported to CSV files:
  - description_features.csv (comprehensive feature matrix)
  - description_analysis_summary.csv (summary statistics)


In [28]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"

In [None]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"

In [None]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"

In [None]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"

In [None]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"

In [None]:
# ============================================================================
# AI MENTIONS ANALYSIS - FOCUSED ANALYSIS
# ============================================================================

def analyze_ai_mentions_focused(df):
    """Comprehensive analysis of AI mentions in descriptions"""
    
    # Define AI-related patterns
    ai_patterns = {
        'ai_direct': r'\bai\b',
        'artificial_intelligence': r'\bartificial intelligence\b',
        'machine_learning': r'\bmachine learning\b|\bml\b',
        'deep_learning': r'\bdeep learning\b',
        'neural_network': r'\bneural network\b',
        'llm': r'\bllm\b|\blarge language model\b',
        'generative_ai': r'\bgenerative ai\b|\bgenai\b',
        'computer_vision': r'\bcomputer vision\b',
        'nlp': r'\bnlp\b|\bnatural language processing\b',
        'ai_powered': r'\bai.powered\b|\bai.driven\b',
        'ai_native': r'\bai.native\b',
        'ai_agent': r'\bai agent\b',
    }
    
    # Check for any AI mention (broad)
    broad_ai_pattern = r'\bai\b|\bartificial intelligence\b|\bmachine learning\b|\bdeep learning\b|\bneural\b|\bllm\b|\bgenerative ai\b|\bgenai\b'
    df['mentions_ai'] = df['short_description'].str.contains(broad_ai_pattern, case=False, na=False, regex=True)
    
    # Check for AI category
    df['has_ai_category'] = df['categories'].str.contains('Artificial Intelligence', na=False, case=False)
    
    # Detailed pattern matching
    pattern_counts = {}
    for pattern_name, pattern in ai_patterns.items():
        mask = df['short_description'].str.contains(pattern, case=False, na=False, regex=True)
        pattern_counts[pattern_name] = mask.sum()
    
    return df, pattern_counts

# Run analysis
df_ai, pattern_counts = analyze_ai_mentions_focused(df)

# Calculate statistics
total_companies = len(df_ai)
ai_in_description = df_ai['mentions_ai'].sum()
ai_in_category = df_ai['has_ai_category'].sum()
both_ai = (df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()
either_ai = (df_ai['mentions_ai'] | df_ai['has_ai_category']).sum()

print("=" * 80)
print("AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nGenerated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

print("OVERALL STATISTICS")
print("-" * 80)
print(f"Total Companies Analyzed: {total_companies:,}")
print(f"Companies Mentioning AI in Description: {ai_in_description:,} ({ai_in_description/total_companies*100:.2f}%)")
print(f"Companies with AI Category: {ai_in_category:,} ({ai_in_category/total_companies*100:.2f}%)")
print(f"Companies with Both Description & Category: {both_ai:,}")
print(f"Companies with Either Description OR Category: {either_ai:,} ({either_ai/total_companies*100:.2f}%)")
print(f"Description Only (no AI category): {(df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")
print(f"Category Only (no AI in description): {(~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum():,}")
print(f"Neither Description nor Category: {(~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum():,}")

print("\nAI TERMINOLOGY BREAKDOWN")
print("-" * 80)
sorted_patterns = sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)
for pattern, count in sorted_patterns:
    pct = (count / total_companies) * 100
    print(f"  {pattern.replace('_', ' ').title():30s}: {count:6,} companies ({pct:5.2f}%)")

# Company size analysis for AI companies
ai_companies = df_ai[df_ai['mentions_ai']]
if len(ai_companies) > 0 and 'employees_midpoint' in ai_companies.columns:
    print("\nAI COMPANY SIZE ANALYSIS")
    print("-" * 80)
    print(f"Average Employees (AI Companies): {ai_companies['employees_midpoint'].mean():.1f}")
    print(f"Median Employees (AI Companies): {ai_companies['employees_midpoint'].median():.1f}")
    print(f"Small Companies (1-10 employees): {(ai_companies['employees_min'] <= 10).sum():,}")
    print(f"Medium Companies (11-50 employees): {((ai_companies['employees_min'] > 10) & (ai_companies['employees_max'] <= 50)).sum():,}")
    print(f"Large Companies (51+ employees): {(ai_companies['employees_min'] > 50).sum():,}")

# Sector analysis
print("\nSECTOR-SPECIFIC AI MENTIONS")
print("-" * 80)
sectors = {
    'Software': df_ai['categories'].str.contains('Software', na=False, case=False),
    'SaaS': df_ai['categories'].str.contains('SaaS', na=False, case=False),
    'Health Care': df_ai['categories'].str.contains('Health Care', na=False, case=False),
    'FinTech': df_ai['categories'].str.contains('FinTech', na=False, case=False),
    'Blockchain': df_ai['categories'].str.contains('Blockchain', na=False, case=False),
    'Robotics': df_ai['categories'].str.contains('Robotics', na=False, case=False),
}

sector_stats = []
for sector_name, mask in sectors.items():
    sector_df = df_ai[mask]
    if len(sector_df) > 0:
        sector_ai = sector_df['mentions_ai'].sum()
        pct_ai = (sector_ai / len(sector_df)) * 100
        sector_stats.append({
            'Sector': sector_name,
            'Total': len(sector_df),
            'Mentions AI': sector_ai,
            'Percentage': pct_ai
        })
        print(f"{sector_name}:")
        print(f"  Total Companies: {len(sector_df):,}")
        print(f"  Mentioning AI: {sector_ai:,} ({pct_ai:.2f}%)")
        print()

# Export summary to CSV
summary_data = {
    'Metric': [
        'Total Companies',
        'Mentions AI in Description',
        'Has AI Category',
        'Both Description and Category',
        'Either Description or Category',
        'Description Only',
        'Category Only',
        'Neither'
    ],
    'Count': [
        total_companies,
        ai_in_description,
        ai_in_category,
        both_ai,
        either_ai,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum(),
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()
    ],
    'Percentage': [
        100.0,
        ai_in_description/total_companies*100,
        ai_in_category/total_companies*100,
        both_ai/total_companies*100,
        either_ai/total_companies*100,
        (df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & df_ai['has_ai_category']).sum()/total_companies*100,
        (~df_ai['mentions_ai'] & ~df_ai['has_ai_category']).sum()/total_companies*100
    ]
}

summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('ai_mentions_summary.csv', index=False)
print("=" * 80)
print("Summary saved to: ai_mentions_summary.csv")

# Export pattern breakdown
pattern_df = pd.DataFrame([
    {'Pattern': pattern.replace('_', ' ').title(), 'Count': count, 'Percentage': count/total_companies*100}
    for pattern, count in sorted_patterns
])
pattern_df.to_csv('ai_patterns_breakdown.csv', index=False)
print("Pattern breakdown saved to: ai_patterns_breakdown.csv")

# Export sector breakdown
sector_df = pd.DataFrame(sector_stats)
sector_df.to_csv('ai_sector_breakdown.csv', index=False)
print("Sector breakdown saved to: ai_sector_breakdown.csv")

# Export list of AI companies
ai_companies_export = df_ai[df_ai['mentions_ai']][['name', 'short_description', 'categories', 'founded_date']].copy()
if 'employees_midpoint' in df_ai.columns:
    ai_companies_export['employees_midpoint'] = df_ai[df_ai['mentions_ai']]['employees_midpoint']
ai_companies_export = ai_companies_export.sort_values('name')
ai_companies_export.to_csv('companies_mentioning_ai.csv', index=False)
print(f"List of {len(ai_companies_export):,} companies mentioning AI saved to: companies_mentioning_ai.csv")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE!")
print("=" * 80)


AI MENTIONS IN COMPANY DESCRIPTIONS - ANALYSIS SUMMARY

Generated: 2025-11-15 12:09:32

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total Companies Analyzed: 26,012
Companies Mentioning AI in Description: 6,693 (25.73%)
Companies with AI Category: 5,036 (19.36%)
Companies with Both Description & Category: 3,790
Companies with Either Description OR Category: 7,939 (30.52%)
Description Only (no AI category): 2,903
Category Only (no AI in description): 1,246
Neither Description nor Category: 18,073

AI TERMINOLOGY BREAKDOWN
--------------------------------------------------------------------------------
  Ai Direct                     :  6,338 companies (24.37%)
  Ai Powered                    :  1,699 companies ( 6.53%)
  Artificial Intelligence       :    367 companies ( 1.41%)
  Generative Ai                 :    201 companies ( 0.77%)
  Machine Learning              :    126 companies ( 0.48%)
  Ai Native                     :   

KeyError: "['founded_date'] not in index"