In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import opendatasets as od
# Make pandas' built-in .plot() calls render with plotly rather than the default backend.
pd.options.plotting.backend = "plotly"


## Download & Import the dataset

* Downloading the dataset requires a Kaggle API key.

In [13]:
# Dataset download from Kaggle. Left commented out -- presumably so that re-running the
# notebook does not fetch the files again; uncomment for the first run. The load cell
# expects the CSV under 'top-50-us-tech-companies-2022-2023-dataset/'.
# od.download('https://www.kaggle.com/datasets/lamiatabassum/top-50-us-tech-companies-2022-2023-dataset')

In [63]:
# Load the downloaded CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    'top-50-us-tech-companies-2022-2023-dataset/'
    'Top 50 US Tech Companies 2022 - 2023.csv'
)

## What does the dataset contain?

The dataset covers the top 50 US tech companies and provides some basic descriptive and financial figures for each of them.

Descriptive: Company Name, Industry, Sector, HQ State, Stock Name, Founding Year

Size of the company/Financials: Annual Revenue 2022-2023 (USD in Billions), Market Cap (USD in Trillions), Annual Income Tax in 2022-2023 (USD in Billions), Employee Size

In [64]:
# (number of rows, number of columns) of the loaded frame
df.shape

(50, 10)

In [65]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Company Name,Industry,Sector,HQ State,Founding Year,Annual Revenue 2022-2023 (USD in Billions),Market Cap (USD in Trillions),Stock Name,Annual Income Tax in 2022-2023 (USD in Billions),Employee Size
0,Apple Inc.,Technology,Consumer Electronics,California,1976,387.53,2.52,AAPL,18.314,164000
1,Microsoft Corporation,Technology,Software Infrastructure,Washington,1975,204.09,2.037,MSFT,15.139,221000
2,Alphabet (Google),Technology,Software Infrastructure,California,1998,282.83,1.35,GOOG,11.356,190234
3,Amazon,Technology,Software Application,Washington,1994,513.98,1.03,AMZN,-3.217,1541000
4,NVIDIA Corporation,Technology,Semiconductors,California,1993,26.97,0.653,NVDA,0.189,22473


## Basic Statistics

In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Founding Year,Annual Revenue 2022-2023 (USD in Billions),Market Cap (USD in Trillions),Annual Income Tax in 2022-2023 (USD in Billions),Employee Size
count,50.0,50.0,50.0,50.0,50.0
mean,1984.14,51.2044,0.25216,1.38678,83249.62
std,24.988985,97.41288,0.490377,3.687916,220586.9
min,1890.0,2.06,0.028,-3.217,2993.0
25%,1977.25,7.6525,0.05125,0.09875,14150.0
50%,1988.5,17.665,0.0825,0.2805,24725.0
75%,1999.75,40.815,0.16025,0.945,70155.75
max,2012.0,513.98,2.52,18.314,1541000.0


In [67]:
# Summary (count, unique, top, freq) for the object-typed (text) columns
df.describe(include=[object])

Unnamed: 0,Company Name,Industry,Sector,HQ State,Stock Name
count,50,50,50,50,50
unique,50,1,8,13,50
top,Micron Technology,Technology,Software Application,California,KLAC
freq,1,50,15,33,1


In [69]:
# Count the missing values in each column
df.isna().sum()

Company Name                                        0
Industry                                            0
Sector                                              0
HQ State                                            0
Founding Year                                       0
Annual Revenue 2022-2023 (USD in Billions)          0
Market Cap (USD in Trillions)                       0
Stock Name                                          0
Annual Income Tax in 2022-2023 (USD in Billions)    0
Employee Size                                       0
dtype: int64

## Initial EDA

In [72]:
# Number of companies in each sector
Sector = df['Sector'].value_counts()

# One slice per sector, annotated with its label, share and absolute count
fig = (
    px.pie(values=Sector.values, names=Sector.index)
      .update_traces(title='Sector', textinfo='label+percent+value')
)
fig.show()

In [75]:
# Lookup from full state name (plus the District of Columbia) to its
# two-letter postal abbreviation, in alphabetical order of the name.
state_code = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

In [78]:
# Translate each company's full HQ state name into its two-letter code via `state_code`.
df['HQ State Code'] = df['HQ State'].map(state_code)

# Series.map yields NaN for any name missing from the lookup (typo, territory, ...),
# and a groupby on this column drops NaN keys by default, so an unmapped state
# would vanish from the per-state counts without notice. Fail loudly instead.
assert df['HQ State Code'].notna().all(), (
    "HQ State values missing from state_code: "
    f"{df.loc[df['HQ State Code'].isna(), 'HQ State'].unique().tolist()}"
)

In [83]:
# Number of companies headquartered in each state: one row per
# (state code, state name) pair, with the count stored in 'Volume'.
volume_per_area = (
    df.groupby(['HQ State Code', 'HQ State'], as_index=False)
      .agg(Volume=('Company Name', 'count'))
)

In [113]:
# US map with each state shaded by the number of companies headquartered there.
fig = px.choropleth(
    data_frame=volume_per_area,
    locations='HQ State Code',
    locationmode='USA-states',
    scope='usa',
    color='Volume',
    color_continuous_scale='spectral_r',
    hover_name='HQ State',
    # NOTE(review): this relabels the 'HQ State' column as 'Volume'; the colour
    # column is already called 'Volume' -- confirm this mapping is intended.
    labels={'HQ State':'Volume'},
)

# Overlay the two-letter state code as a text label on every state that has data.
fig.add_scattergeo(
    mode='text',
    text=volume_per_area['HQ State Code'],
    locations=volume_per_area['HQ State Code'],
    locationmode='USA-states',
)

# Centre the figure title horizontally, anchored at the top.
fig.update_layout(
    title_text='Volume of Companies per State',
    title_x=0.5,
    title_xanchor='center',
    title_yanchor='top',
)
fig.show()

In [94]:
# Display the frame to check the newly added 'HQ State Code' column
df

Unnamed: 0,Company Name,Industry,Sector,HQ State,Founding Year,Annual Revenue 2022-2023 (USD in Billions),Market Cap (USD in Trillions),Stock Name,Annual Income Tax in 2022-2023 (USD in Billions),Employee Size,HQ State Code
0,Apple Inc.,Technology,Consumer Electronics,California,1976,387.53,2.52,AAPL,18.314,164000,CA
1,Microsoft Corporation,Technology,Software Infrastructure,Washington,1975,204.09,2.037,MSFT,15.139,221000,WA
2,Alphabet (Google),Technology,Software Infrastructure,California,1998,282.83,1.35,GOOG,11.356,190234,CA
3,Amazon,Technology,Software Application,Washington,1994,513.98,1.03,AMZN,-3.217,1541000,WA
4,NVIDIA Corporation,Technology,Semiconductors,California,1993,26.97,0.653,NVDA,0.189,22473,CA
5,Tesla,Technology,Software Infrastructure,Texas,2003,81.46,0.625,TSLA,1.132,127855,TX
6,Meta Platforms,Technology,Software Infrastructure,California,2004,116.6,0.524,META,5.619,86482,CA
7,Broadcom Inc.,Technology,Semiconductors,California,1961,34.41,0.266,AVGO,0.939,20000,CA
8,Oracle Corporation,Technology,Software Infrastructure,Texas,1977,46.07,0.236,ORCL,0.932,143000,TX
9,Cisco Systems Inc.,Technology,Communication Equipments,California,1984,53.16,0.208,CSCO,2.665,83300,CA
