In [36]:
!pip install -q pandas matplotlib seaborn numpy plotly.express
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tabulate import tabulate
import plotly.express as px
import statsmodels.api as sm

In [3]:
# Directory that holds the project's raw CSV files. Both inputs live under it,
# so this is the single place to edit when running the notebook elsewhere.
DATA_DIR = '/Users/annamezhlauk/Desktop/ECO225Project/Data'

# Salary observations.
tech_salaries = pd.read_csv(f'{DATA_DIR}/Tech_salaries.csv')

# Startup records. low_memory=False makes pandas infer each column's dtype from
# the whole file rather than chunk by chunk.
startups_main = pd.read_csv(f'{DATA_DIR}/Startups/objects.csv', low_memory=False)

In [4]:
# Clean the salary data in a single pass. The row labels below refer to the
# index of the freshly loaded CSV, so this cell must run exactly once after
# the load cell (the final reset_index renumbers the rows).
tech_salaries = (
    tech_salaries
    # observations with "spam" position names
    .drop(index=[44, 53, 267, 281, 522, 586, 646, 789, 1303, 1422, 1423, 1424, 1555, 1595, 1640, 1642])
    # observations where compensation isn't in USD
    .drop(index=[1448, 1132, 860, 766, 689, 677, 387, 21])
    # improbable salaries that could be data entry mistakes
    # (keep 5000 <= annual_base_pay <= 3000000, bounds inclusive)
    .loc[lambda frame: frame['annual_base_pay'].between(5000, 3000000)]
    .reset_index(drop=True)
)

In [5]:
# Align the startup table's column names with the salary table:
# 'employer_name' is the merge key, 'startup_id' identifies each startup.
startups_main.rename(
    columns={
        "normalized_name": "employer_name",
        "entity_id": "startup_id",
    },
    inplace=True,
)

In [6]:
# Store ids with pandas' nullable integer dtype so they stay integers even
# when some rows have no id.
startups_main['startup_id'] = startups_main['startup_id'].astype('Int64')

# Left join: every salary row is kept; startup columns are missing where no
# startup shares the employer name.
# NOTE(review): if 'employer_name' is not unique in startups_main, salary rows
# are duplicated by this merge -- confirm against the data.
startups_and_salaries = pd.merge(
    tech_salaries,
    startups_main,
    how="left",
    on="employer_name",
)

In [7]:
# Number of employees in the data at each startup: the count of non-missing
# salary_id values per startup_id, broadcast back onto every row of that startup.
startups_and_salaries['num_employees'] = (
    startups_and_salaries
    .groupby('startup_id')['salary_id']
    .transform('count')
)

In [8]:
def distance_formula(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    The points are (lat1, lon1) and (lat2, lon2), given in degrees (they are
    converted with np.radians). Each argument may be a scalar or a NumPy
    array; arrays are combined element-wise, so one point can be compared
    against many in a single call.

    The result uses the haversine formula with an Earth radius of 3960 miles.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # this is the haversine formula (from community.esri.com)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2

    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair outside that range (e.g. for antipodal points), which would make
    # sqrt(1 - a) return NaN. Clamp it back into range first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    radius_earth_miles = 3960
    return radius_earth_miles * c


def count_competitor_employees(df, radius):
    """Add a 'competitors' column with the nearby competitor headcount per row.

    For each row whose 'job_title_category' is neither missing nor "Other" and
    whose latitude/longitude are present, sum 'num_employees' over every other
    row in the same category that
      * has a non-missing 'startup_id' different from this row's,
      * has a non-missing latitude/longitude, and
      * lies within `radius` miles according to distance_formula.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'job_title_category', 'location_latitude',
        'location_longitude', 'startup_id' and 'num_employees'.
        It is modified in place.
    radius : float
        Maximum distance, in miles, for another row to count as a competitor.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. 'competitors' is NaN for rows in the "Other"
        category, with a missing category, or with missing coordinates, and
        0 for located rows that have no 'startup_id'.

    NOTE(review): the sum runs over rows, not over distinct startups, so a
    competitor with k rows in this category contributes k * num_employees.
    Confirm this is the intended measure.
    """
    df['competitors'] = np.nan

    for category in df['job_title_category'].dropna().unique():
        # employees within the 'Other' category aren't counted as competitor employees
        if category == "Other":
            continue

        category_df = df[df['job_title_category'] == category]

        # Pull the needed columns out once as plain arrays so each row can be
        # compared against the whole category without a Python-level inner loop.
        lats = category_df['location_latitude'].to_numpy(dtype=float, na_value=np.nan)
        lons = category_df['location_longitude'].to_numpy(dtype=float, na_value=np.nan)
        num_employees = category_df['num_employees'].to_numpy(dtype=float, na_value=np.nan)

        # Integer code per distinct startup_id; missing ids are coded as -1.
        startup_codes, _ = pd.factorize(category_df['startup_id'])

        has_location = ~(np.isnan(lats) | np.isnan(lons))
        has_startup = startup_codes >= 0
        # Rows that are allowed to be counted as somebody's competitor.
        countable = has_location & has_startup

        for pos, label in enumerate(category_df.index):
            # rows without location data keep competitors = NaN
            if not has_location[pos]:
                continue

            # a row without a startup_id cannot be told apart from any other
            # startup, so nothing is counted for it
            if not has_startup[pos]:
                df.at[label, 'competitors'] = 0
                continue

            within_radius = distance_formula(lats[pos], lons[pos], lats, lons) <= radius
            rivals = countable & within_radius & (startup_codes != startup_codes[pos])
            df.at[label, 'competitors'] = num_employees[rivals].sum()

    return df

In [9]:
# Add the 'competitors' column: employees at other startups in the same job
# category within a radius of 300 (miles, per distance_formula).
startups_and_salaries = count_competitor_employees(df=startups_and_salaries, radius=300)

# Mean annual base pay within each startup, repeated on every row of that startup.
avg_salary_by_startup = (
    startups_and_salaries
    .groupby('startup_id')['annual_base_pay']
    .transform('mean')
)

startups_and_salaries['avg_salary_per_startup'] = avg_salary_by_startup

In [28]:
# One 0/1 indicator column per value of 'status'. Every category is kept here
# (drop_first=True would drop the alphabetically first one to avoid the dummy
# variable trap).
dummies = pd.get_dummies(
    startups_and_salaries['status'],
    drop_first=False,
    dtype=int,
)

In [30]:
# Attach the status indicators, then remove the original 'status' column and
# the 'operating' indicator, leaving the other status indicators in place.
# NOTE(review): rows with a missing status have 0 in every indicator, so they
# are indistinguishable from 'operating' rows once that column is dropped.
data = (
    pd.concat([startups_and_salaries, dummies], axis=1)
    .drop(columns=['status', 'operating'])
)
data.head()

Unnamed: 0,index,salary_id,employer_name,location_name,location_state,location_country,location_latitude,location_longitude,job_title,job_title_category,...,updated_at,num_employees,competitors,avg_salary_per_startup,acquired,beta,closed,ipo,live,private
0,0,1,opower,"san francisco, ca",CA,US,37.77,-122.41,systems engineer,Engineering,...,2013-07-25 20:10:17,1.0,74.0,125000.0,0,0,0,0,0,0
1,1,3,walmart,"bentonville, ar",AR,US,36.36,-94.2,senior developer,Software,...,2013-10-30 21:48:04,3.0,1622.0,113000.0,0,0,0,1,0,0
2,2,4,vertical knowledge,"cleveland, oh",OH,US,41.47,-81.67,software engineer,Software,...,,,0.0,,0,0,0,0,0,0
3,3,6,netapp,waltham,,,,,mts,Other,...,2009-09-24 06:26:32,2.0,,131000.0,0,0,0,1,0,0
4,4,12,apple,cupertino,,,,,software engineer,Software,...,2013-12-04 06:16:19,16.0,,131625.0,0,0,0,1,0,0


In [37]:
# Regress annual base pay on the startup-status indicators.
y = data['annual_base_pay']

# Regressors: the six status indicators plus an intercept column.
X = sm.add_constant(
    data[['acquired', 'beta', 'live', 'ipo', 'closed', 'private']]
)

# Fit by ordinary least squares.
model = sm.OLS(y, X).fit()

# Show the full regression table.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        annual_base_pay   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     4.180
Date:                Tue, 04 Mar 2025   Prob (F-statistic):           0.000348
Time:                        14:48:24   Log-Likelihood:                -27766.
No. Observations:                2130   AIC:                         5.555e+04
Df Residuals:                    2123   BIC:                         5.559e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.106e+05   2818.469     39.244      0.0