# 1. TOP INDUSTRIES BY NUMBER OF COMPANIES

In [3]:
import pandas as pd 
import plotly.express as px

# Load the INC 5000 (2019) dataset; the cells below all read from `df`.
df = pd.read_csv("INC 5000 Companies 2019.csv")

# Count companies per industry, keep the ten most frequent industries, and
# expose the result as a two-column frame for plotting.
industry_counts = (
    df['industry']
    .value_counts()
    .nlargest(10)
    .rename_axis('industry')
    .reset_index(name='Company Count')
)

# Horizontal bar chart: one bar per industry, bar length = company count.
fig = px.bar(
    industry_counts,
    x='Company Count',
    y='industry',
    orientation='h',
    title='Top 10 Industries by Number of Companies',
)
fig.show()


# 2. STATE-WISE COMPANY DISTRIBUTION

In [35]:
# Companies per state as a two-column frame (state code, count).
state_counts = (
    df['state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='Company Count')
)

# Colour each US state by its company count; `state` values are matched
# against plotly's "USA-states" location mode.
fig = px.choropleth(
    state_counts,
    locations='state',
    locationmode="USA-states",
    color='Company Count',
    scope="usa",
    title='Number of Companies by state',
)
fig.show()

# 3. INDUSTRY-WISE AVERAGE GROWTH RATES

In [36]:
# Mean of `growth_%` per industry, reduced to the ten highest averages.
mean_growth_by_industry = df.groupby('industry')['growth_%'].mean()
industry_growth = mean_growth_by_industry.nlargest(10).reset_index()

fig = px.bar(
    industry_growth,
    x='industry',
    y='growth_%',
    title='Top 10 Industries by Average Growth Rate',
)
# Rotate the industry tick labels by 45 degrees.
fig.update_xaxes(tickangle=45)
fig.show()

#  4. REVENUE vs. GROWTH RATE RELATIONSHIP 

In [37]:
# At this point in the notebook `revenue` is still text such as "12.3 Million"
# or "1.2 Billion" (it is only parsed in the Z-test cell further down), so
# plotting it directly would treat every distinct string as a category.
# Parse it into a number expressed in millions, without modifying `df`.
revenue_text = df['revenue'].astype(str)
revenue_millions = pd.to_numeric(
    revenue_text
    .str.replace(' Million', '', regex=False)
    .str.replace(' Billion', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # unparseable entries become NaN and are simply not plotted
)
in_billions = revenue_text.str.contains('Billion', regex=False)
revenue_millions = revenue_millions.where(~in_billions, revenue_millions * 1000)

fig = px.scatter(df.assign(revenue=revenue_millions), x='revenue', y='growth_%',
                 hover_data=['name'],
                 labels={'revenue': 'revenue (millions)'},
                 title='Revenue vs Growth Rate')
fig.show()

# 5. TOP 10 COMPANIES BY REVENUE

In [38]:
# `revenue` is still text such as "12.3 Million" / "1.2 Billion" when this cell
# runs top-to-bottom, and sorting text ranks it alphabetically ("99.9 Million"
# above "1.2 Billion"). Convert to a number in millions before ranking; `df`
# itself is left untouched.
revenue_text = df['revenue'].astype(str)
revenue_millions = pd.to_numeric(
    revenue_text
    .str.replace(' Million', '', regex=False)
    .str.replace(' Billion', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce',  # unparseable entries become NaN and sort last
)
in_billions = revenue_text.str.contains('Billion', regex=False)
revenue_millions = revenue_millions.where(~in_billions, revenue_millions * 1000)

top_revenue = (
    df[['name']]
    .assign(revenue=revenue_millions)
    .sort_values(by='revenue', ascending=False)
    .head(10)
)

fig = px.bar(top_revenue, x='revenue', y='name', orientation='h',
             labels={'revenue': 'revenue (millions)'},
             title='Top 10 Companies by Revenue')
fig.show()


# 6. YEAR FOUNDED TREND

In [39]:
# Number of companies per founding year, ordered by year.
founded_counts = df['founded'].value_counts().sort_index()

axis_labels = {'x': 'founded in year', 'y': 'Number of Companies'}
fig = px.line(
    x=founded_counts.index,
    y=founded_counts.values,
    labels=axis_labels,
    title='Companies by Year Founded',
)

# Pin the visible x-range to 1850-2050 with a tick every 50 years.
fig.update_xaxes(range=[1850, 2050], dtick=50)

fig.show()

# 1. T-Test: Are younger companies growing faster than older ones?


In [40]:
from scipy.stats import ttest_ind

# Age is measured relative to 2019; this adds a `Company Age` column to `df`.
df['Company Age'] = 2019 - df['founded']

# Split growth rates at an age of 5 years. Rows with a missing age satisfy
# neither comparison and therefore end up in neither group.
is_young = df['Company Age'] <= 5
is_old = df['Company Age'] > 5
young = df.loc[is_young, 'growth_%']
old = df.loc[is_old, 'growth_%']

# Two-sample t-test on the two groups; missing growth values are ignored.
t_stat, p_val = ttest_ind(young, old, nan_policy='omit')

print(f"T-statistic: {t_stat:.3f}, P-value: {p_val:.3f}")


T-statistic: 22.485, P-value: 0.000


# 2. Chi-Square Goodness-of-Fit Test: Are companies evenly distributed across states?

In [41]:
from scipy.stats import chisquare

# Observed number of companies per state (a Series indexed by state).
state_counts = df['state'].value_counts()

# Expected counts under an even split: the same share of the total for
# every state that appears in the data.
n_states = len(state_counts)
even_share = state_counts.sum() / n_states
expected = [even_share] * n_states

chi_stat, p_val = chisquare(f_obs=state_counts.values, f_exp=expected)

print(f"Chi-square Statistic: {chi_stat:.3f}, P-value: {p_val:.3f}")



Chi-square Statistic: 9152.528, P-value: 0.000


# 3. Z-Test: Is the average revenue of Health companies higher than the dataset average?

In [42]:
def convert_revenue(value):
    """Parse a revenue string into a float expressed in millions.

    Accepted forms:
      * "<number> Million" -> number
      * "<number> Billion" -> number * 1000
      * "<number>"         -> number (taken as-is)
    Thousands separators (",") are removed before parsing.

    Parameters
    ----------
    value : str
        The raw revenue text.

    Returns
    -------
    float or None
        The parsed amount, or None when `value` is not a string-like object
        or does not contain a parseable number.
    """
    try:
        if 'Million' in value:
            return float(value.replace(' Million', '').replace(',', ''))
        elif 'Billion' in value:
            return float(value.replace(' Billion', '').replace(',', '')) * 1000
        else:
            return float(value.replace(',', ''))
    except (TypeError, ValueError, AttributeError):
        # Only swallow the errors that mean "this value cannot be parsed";
        # a bare `except:` would also hide KeyboardInterrupt / SystemExit.
        return None


# Replace the textual `revenue` column with numeric values (in millions).
# Going through `astype(str)` keeps this cell re-runnable: already-converted
# numbers are turned back into text and parsed again to the same value.
df['revenue'] = df['revenue'].astype(str).apply(convert_revenue)

from statsmodels.stats.weightstats import ztest

# Revenue of companies in the 'Health' industry. Values that could not be
# parsed are missing here; drop them so they cannot turn the test statistic
# into NaN.
health_revenue = df[df['industry'] == 'Health']['revenue'].dropna()
overall_mean = df['revenue'].mean()

# One-sided Z-test: is the Health mean larger than the overall mean?
z_stat, p_val = ztest(health_revenue, value=overall_mean, alternative='larger')

print(f"Z-statistic: {z_stat:.3f}, P-value: {p_val:.3f}")


Z-statistic: 1.026, P-value: 0.152
