In [1]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd

df = pd.read_csv("salaries_modified.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,city,state,country,city_state,distance_NY,distance_SF,distance_seattle
0,745,2018-06-05 14:06:30,LinkedIn,5.0,Data Scientist,233.0,4.0,0.0,220.0,10.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072
1,772,2018-06-08 00:29:47,Amazon,4.0,Data Scientist,140.0,2.0,2.0,48.0,0.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798
2,776,2018-06-08 09:49:25,Microsoft,64.0,Data Scientist,218.0,11.0,11.0,28.0,23.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798
3,782,2018-06-08 17:55:09,ebay,26.0,Data Scientist,180.0,10.0,5.0,0.0,0.0,San Jose,CA,US,San Jose%20CCA,4733129,75992,1349713
4,796,2018-06-10 19:39:35,Twitter,4.0,Data Scientist,500.0,4.0,4.0,280.0,20.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2056 entries, 0 to 2055
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               2056 non-null   int64  
 1   timestamp                2056 non-null   object 
 2   company                  2056 non-null   object 
 3   level                    2056 non-null   float64
 4   title                    2056 non-null   object 
 5   totalyearlycompensation  2056 non-null   float64
 6   yearsofexperience        2056 non-null   float64
 7   yearsatcompany           2056 non-null   float64
 8   stockgrantvalue          2056 non-null   float64
 9   bonus                    2056 non-null   float64
 10  city                     2056 non-null   object 
 11  state                    2056 non-null   object 
 12  country                  2056 non-null   object 
 13  city_state               2056 non-null   object 
 14  distance_NY             

In [3]:
def clean_dist(dist):
    if dist == 'None':
        return None
    try:
        return int(dist)
    except:
        return None

In [4]:
for col in ['distance_NY', 'distance_SF', 'distance_seattle']:
    df[col]=df[col].apply(lambda x: clean_dist(x))

In [5]:
for col in ['distance_NY','distance_SF','distance_seattle']:
    df = df[df[col].notnull()].copy()
    

In [6]:
# Set features. This will also be used as your x values.
selected_features = df[['level', 'yearsofexperience', 'yearsatcompany', 'distance_NY', 'distance_SF', 'distance_seattle']]

In [7]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [8]:
selected_features = selected_features[selected_features.notnull()].copy()
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [9]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [10]:
import numpy as np
inf_df = selected_features.isin([np.inf, -np.inf])
len(inf_df)

2055

In [11]:
selected_features.head()

Unnamed: 0,level,yearsofexperience,yearsatcompany,distance_NY,distance_SF,distance_seattle
0,5.0,4.0,0.0,4680488.0,110.0,1297072.0
1,4.0,2.0,2.0,4606061.0,1296231.0,798.0
2,64.0,11.0,11.0,4606061.0,1296231.0,798.0
3,26.0,10.0,5.0,4733129.0,75992.0,1349713.0
4,4.0,4.0,4.0,4680488.0,110.0,1297072.0


In [12]:
df.describe()

Unnamed: 0.1,Unnamed: 0,level,totalyearlycompensation,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,distance_NY,distance_SF,distance_seattle
count,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0
mean,31790.884672,14.421898,221.609732,5.027251,2.036667,46.851582,18.511436,3493419.0,1605568.0,1932465.0
std,18164.814512,48.636267,105.692401,4.026233,2.237007,66.050664,20.766348,1824317.0,1788747.0,1625972.0
min,745.0,0.0,16.0,0.0,0.0,0.0,0.0,10007.0,110.0,798.0
25%,16258.5,4.0,150.0,2.0,0.1,0.0,3.0,2076857.0,52394.0,1297072.0
50%,31591.0,5.0,200.0,4.0,2.0,28.0,15.0,4606061.0,1296231.0,1348731.0
75%,48258.0,6.0,260.0,7.0,3.0,60.0,25.0,4680488.0,3100688.0,3363231.0
max,62610.0,603.0,1000.0,34.0,20.0,700.0,350.0,4796489.0,4977621.0,5305593.0


In [13]:

X = selected_features
y = df.stockgrantvalue
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, test_size=0.2)

In [14]:
X.shape

(2055, 6)

In [15]:
y.shape

(2055,)

In [16]:
y.head

<bound method NDFrame.head of 0       220.0
1        48.0
2        28.0
3         0.0
4       280.0
        ...  
2051    100.0
2052      0.0
2053      0.0
2054      0.0
2055     60.0
Name: stockgrantvalue, Length: 2055, dtype: float64>

In [17]:
X.dtypes

level                float64
yearsofexperience    float64
yearsatcompany       float64
distance_NY          float64
distance_SF          float64
distance_seattle     float64
dtype: object

In [18]:

sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)

In [19]:
np.where(np.isnan(X_trainscaled))

(array([], dtype=int64), array([], dtype=int64))

In [20]:
reg = MLPRegressor(hidden_layer_sizes=(200,100,50),activation="relu" ,
                   random_state=42, learning_rate='adaptive', max_iter=2000, solver='adam', early_stopping=True).fit(X_trainscaled, y_train)


In [23]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, 0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [24]:
from sklearn.metrics import mean_absolute_error
y_pred=reg.predict(X_testscaled)
print("The Score with ", (mean_absolute_error(y_pred, y_test)))

The Score with  38.14863048471355


In [25]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(reg, param_distributions = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (200,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, 0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
    'max_iter': [1000,2000]},
    random_state=42
)

result = rs.fit(X_train, y_train)

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.ap

 -7.22458192e+01 -1.03254125e+02  6.99146720e-02  8.55490621e-02
  8.55490621e-02  6.99146720e-02]


In [26]:
best_params = result.best_params_

In [27]:
best_params

{'solver': 'adam',
 'max_iter': 1000,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (200, 100, 50),
 'alpha': 0.001,
 'activation': 'tanh'}

In [28]:
y_test_preds = rs.predict(X_test)
mean_absolute_error(y_test_preds, y_test)

39.39257674331105

In [29]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(reg, param_distributions = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (200,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, .1],
    'learning_rate': ['constant','adaptive'],
    'max_iter': [1000,2000]},
    random_state=42
)

result = rs.fit(X_trainscaled, y_train)

  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\thepi\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.append(self.score(X_val, y_val))
  File "C:\Users\thepi\anaco

In [30]:
y_test_preds = rs.predict(X_testscaled)
mean_absolute_error(y_test_preds, y_test)

38.249605834852986

# model is result
# scalar is sc_X

In [32]:
from pickle import dump
equity_model = 'model_artifacts/equity_model.pkl'
equity_scalar = 'model_artifacts/equity_scalar.pkl'
dump(result, open(equity_model, 'wb'))
dump(sc_X, open(equity_scalar, 'wb'))