In [5]:
import pandas as pd
import numpy as np
import matplotlib as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pprint


In [4]:
# Load the CEO salary data set from the Stata file in the working directory.
DATA_FILE = 'CEOSAL2.DTA'
df = pd.read_stata(DATA_FILE)

# Bare last expression: the notebook renders the frame as a rich table.
df

Unnamed: 0,salary,age,college,grad,comten,ceoten,sales,profits,mktval,lsalary,lsales,lmktval,comtensq,ceotensq,profmarg
0,1161,49,1,1,9,2,6200.0,966,23200.0,7.057037,8.732305,10.051908,81,4,15.580646
1,600,43,1,1,10,10,283.0,48,1100.0,6.396930,5.645447,7.003066,100,100,16.961130
2,379,51,1,1,9,3,169.0,40,1100.0,5.937536,5.129899,7.003066,81,9,23.668638
3,651,55,1,0,22,22,1100.0,-54,1000.0,6.478509,7.003066,6.907755,484,484,-4.909091
4,497,44,1,1,8,6,351.0,28,387.0,6.208590,5.860786,5.958425,64,36,7.977208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,264,63,1,0,42,3,334.0,43,480.0,5.575949,5.811141,6.173786,1764,9,12.874251
173,185,58,1,0,39,1,766.0,49,560.0,5.220356,6.641182,6.327937,1521,1,6.396867
174,387,71,1,1,32,13,432.0,28,477.0,5.958425,6.068426,6.167517,1024,169,6.481482
175,2220,63,1,1,18,18,277.0,-80,540.0,7.705263,5.624018,6.291569,324,324,-28.880867


In [7]:
# Print the descriptive label attached to each variable in the Stata file.
# StataReader holds an open file handle, so it is used as a context manager
# to guarantee the handle is closed as soon as the labels have been read
# (recent pandas versions warn when the reader is used any other way).
with pd.io.stata.StataReader('CEOSAL2.DTA') as reader:
    pprint.pprint(reader.variable_labels())

{'age': 'in years',
 'ceoten': 'years as ceo with company',
 'ceotensq': 'ceoten^2',
 'college': '=1 if attended college',
 'comten': 'years with company',
 'comtensq': 'comten^2',
 'grad': '=1 if attended graduate school',
 'lmktval': 'log(mktval)',
 'lsalary': 'log(salary)',
 'lsales': 'log(sales)',
 'mktval': 'market value, end 1990, mills.',
 'profits': '1990 profits, millions',
 'profmarg': 'profits as % of sales',
 'salary': '1990 compensation, $1000s',
 'sales': '1990 firm sales, millions'}


In [13]:
# Baseline specification: log salary on log sales and log market value.
model = smf.ols(
    formula='lsalary ~ lsales + lmktval',
    data=df,
)
baseline_fit = model.fit()
print(baseline_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     37.13
Date:                Tue, 02 Sep 2025   Prob (F-statistic):           3.73e-14
Time:                        16:20:48   Log-Likelihood:                -130.56
No. Observations:                 177   AIC:                             267.1
Df Residuals:                     174   BIC:                             276.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.6209      0.254     18.163      0.0

In [14]:
# Add profits in levels (not logs) to the baseline specification.
model = smf.ols(
    formula='lsalary ~ lsales + lmktval+ profits',
    data=df,
)
profits_fit = model.fit()
print(profits_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     24.64
Date:                Tue, 02 Sep 2025   Prob (F-statistic):           2.53e-13
Time:                        16:21:00   Log-Likelihood:                -130.53
No. Observations:                 177   AIC:                             269.1
Df Residuals:                     173   BIC:                             281.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.6869      0.380     12.343      0.0

ii. Profits cannot be included in logarithmic form because the log is only defined for positive values, and some firms in the sample have negative profits. Together, these firm performance variables explain only about 30 percent (R-squared = 0.299) of the sample variation in log(salary).

In [15]:
# Replace profits with CEO tenure (years as CEO with the company).
model = smf.ols(
    formula='lsalary ~ lsales + lmktval+ ceoten',
    data=df,
)
tenure_fit = model.fit()
print(tenure_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                lsalary   R-squared:                       0.318
Model:                            OLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     26.91
Date:                Tue, 02 Sep 2025   Prob (F-statistic):           2.47e-14
Time:                        16:24:52   Log-Likelihood:                -128.12
No. Observations:                 177   AIC:                             264.2
Df Residuals:                     173   BIC:                             276.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.5038      0.257     17.509      0.0

iii. Holding lsales and lmktval fixed, each additional year as CEO (ceoten) is associated with a predicted salary that is about 1 percent higher.

In [21]:
# Sample correlation between log market value and profits.
log_market_value = df['lmktval']
firm_profits = df['profits']
log_market_value.corr(firm_profits)


np.float64(0.7768975920312072)

iv. The sample correlation coefficient is 0.7769, which means these variables are highly correlated. The OLS estimators are still unbiased, but the high correlation reduces the precision with which each variable's separate effect can be estimated.