## Importing needed libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


## Data downloading

In [2]:
data_all = pd.read_csv("https://osf.io/download/4ay9x/")

  data_all = pd.read_csv("https://osf.io/download/4ay9x/")


## Selecting occupations: Computer programmers; Software developers, applications and systems software; Web developers

In [3]:
data = data_all[data_all['occ2012'].isin([1010, 1020, 1030])]

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,ownchild,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov,lfsr94
155,346,673032906039520,January,AK,411.5571,1346.15,40,39,3,,...,0,0,"Native, Born In US",94,Executive offices and legislative bodies (9211...,1030,Government - Local,No,No,Employed-At Work
293,651,207004430306994,January,AZ,3410.8853,2500.0,40,44,4,,...,0,0,"Foreign Born, US Cit By Naturalization",86,Computer systems design and related services (...,1020,"Private, For Profit",No,No,Employed-At Work
296,657,236096309400800,January,AZ,3916.3279,2500.0,40,43,4,,...,2,10,"Foreign Born, US Cit By Naturalization",86,Business support services (5614),1020,"Private, For Profit",No,No,Employed-At Work
324,724,914299270769003,January,AZ,5115.4707,1250.0,45,43,1,,...,0,0,"Native, Born In US",86,Computer systems design and related services (...,1020,"Private, For Profit",No,No,Employed-At Work
404,913,138098329500878,January,AR,1786.4459,700.0,40,43,1,,...,0,0,"Native, Born In US",71,Offices of physicians (6211),1030,Government - State,No,No,Employed-At Work


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2067 entries, 155 to 149221
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2067 non-null   int64  
 1   hhid        2067 non-null   int64  
 2   intmonth    2067 non-null   object 
 3   stfips      2067 non-null   object 
 4   weight      2067 non-null   float64
 5   earnwke     2067 non-null   float64
 6   uhours      2067 non-null   int64  
 7   grade92     2067 non-null   int64  
 8   race        2067 non-null   int64  
 9   ethnic      90 non-null     float64
 10  age         2067 non-null   int64  
 11  sex         2067 non-null   int64  
 12  marital     2067 non-null   int64  
 13  ownchild    2067 non-null   int64  
 14  chldpres    2067 non-null   int64  
 15  prcitshp    2067 non-null   object 
 16  state       2067 non-null   object 
 17  ind02       2067 non-null   object 
 18  occ2012     2067 non-null   int64  
 19  class       2067 non-null   

### Ethnic variable has only 90 non-null values, and unioncov 2012 out of 2067. All other variables do not have missing values

In [6]:
data['grade92'].value_counts()

grade92
43    1098
44     505
40     184
42      98
39      71
41      60
46      37
45       9
37       2
38       2
36       1
Name: count, dtype: int64

In [7]:
data['class'].value_counts()

class
Private, For Profit     1796
Private, Nonprofit        92
Government - Federal      81
Government - State        74
Government - Local        24
Name: count, dtype: int64

## Creating variables

In [8]:
# Creating basic variables such as gender, age, and log of the dependent variable
data["female"] = (data["sex"] == 2)
data["w"] = data["earnwke"] / data["uhours"]
data["lnw"] = np.log(data["w"])
data["agesq"] = np.power(data["age"], 2)

# Creating education variables
data["ed_MA"] = (data["grade92"] == 44).astype(int)
data["ed_Phd"] = (data["grade92"] == 46).astype(int)

# Creating interaction terms between gender and education levels
data["female_ed_MA"] = data["female"] * data["ed_MA"]
data["female_ed_Phd"] = data["female"] * data["ed_Phd"]

# Creating the 'priv' column for private classes
data["priv"] = data["class"].isin(['Private, For Profit', 'Private, Nonprofit']).astype(int)
# Creating the 'gover' column for government classes
data["gover"] = data["class"].isin(['Government - Federal', 'Government - State', 'Government - Local']).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["female"] = (data["sex"] == 2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["w"] = data["earnwke"] / data["uhours"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["lnw"] = np.log(data["w"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexe

## Distribution of earnings

In [9]:
data.loc[:, ["earnwke", "uhours", "w"]].describe()

Unnamed: 0,earnwke,uhours,w
count,2067.0,2067.0,2067.0
mean,1554.762545,41.37494,37.508938
std,679.6485,6.525637,15.908843
min,2.0,4.0,0.05
25%,1040.0,40.0,25.0
50%,1500.0,40.0,36.05
75%,2000.0,40.0,48.07675
max,2884.61,80.0,120.192083


In [10]:
data.loc[data["w"] >= 1, ["earnwke", "uhours", "w"]].describe()

Unnamed: 0,earnwke,uhours,w
count,2066.0,2066.0,2066.0
mean,1555.514124,41.375605,37.527069
std,678.953328,6.527147,15.891319
min,37.0,4.0,2.3075
25%,1042.0175,40.0,25.0
50%,1500.0,40.0,36.05
75%,2000.0,40.0,48.07675
max,2884.61,80.0,120.192083


In [11]:
data["female"].value_counts()

female
False    1578
True      489
Name: count, dtype: int64

In [12]:
data.groupby(["occ2012", "female"]).size()

occ2012  female
1010     False      409
         True       107
1020     False     1044
         True       291
1030     False      125
         True        91
dtype: int64

### We see that the majority of samples is male 