In [29]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd

df = pd.read_csv("salaries_modified.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,city,state,country,city_state,distance_NY,distance_SF,distance_seattle
0,745,2018-06-05 14:06:30,LinkedIn,5.0,Data Scientist,233.0,4.0,0.0,220.0,10.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072
1,776,2018-06-08 09:49:25,Microsoft,64.0,Data Scientist,218.0,11.0,11.0,28.0,23.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798
2,796,2018-06-10 19:39:35,Twitter,4.0,Data Scientist,500.0,4.0,4.0,280.0,20.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072
3,822,2018-06-12 20:54:06,Google,6.0,Data Scientist,685.0,22.0,2.0,296000.0,55000.0,Kirkland,WA,US,Kirkland%20CWA,2566387,2714350,4479753
4,858,2018-06-17 11:39:38,Facebook,5.0,Data Scientist,370.0,8.0,3.0,140.0,40.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1934 entries, 0 to 1933
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               1934 non-null   int64  
 1   timestamp                1934 non-null   object 
 2   company                  1934 non-null   object 
 3   level                    1934 non-null   float64
 4   title                    1934 non-null   object 
 5   totalyearlycompensation  1934 non-null   float64
 6   yearsofexperience        1934 non-null   float64
 7   yearsatcompany           1934 non-null   float64
 8   stockgrantvalue          1934 non-null   float64
 9   bonus                    1934 non-null   float64
 10  city                     1934 non-null   object 
 11  state                    1934 non-null   object 
 12  country                  1934 non-null   object 
 13  city_state               1934 non-null   object 
 14  distance_NY             

In [31]:
def clean_dist(dist):
    if dist == 'None':
        return None
    try:
        return int(dist)
    except:
        return None

In [32]:
for col in ['distance_NY', 'distance_SF', 'distance_seattle']:
    df[col]=df[col].apply(lambda x: clean_dist(x))

In [33]:
for col in ['distance_NY','distance_SF','distance_seattle']:
    df = df[df[col].notnull()].copy()
    

In [34]:
# Set features. This will also be used as your x values.
selected_features = df[['level', 'yearsofexperience', 'yearsatcompany', 'distance_NY', 'bonus', 'stockgrantvalue', 'distance_SF', 'distance_seattle']]

In [35]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1934 entries, 0 to 1933
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              1934 non-null   float64
 1   yearsofexperience  1934 non-null   float64
 2   yearsatcompany     1934 non-null   float64
 3   distance_NY        1934 non-null   int64  
 4   bonus              1934 non-null   float64
 5   stockgrantvalue    1934 non-null   float64
 6   distance_SF        1934 non-null   int64  
 7   distance_seattle   1934 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 136.0 KB


In [36]:
selected_features = selected_features[selected_features.notnull()].copy()
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1934 entries, 0 to 1933
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              1934 non-null   float64
 1   yearsofexperience  1934 non-null   float64
 2   yearsatcompany     1934 non-null   float64
 3   distance_NY        1934 non-null   int64  
 4   bonus              1934 non-null   float64
 5   stockgrantvalue    1934 non-null   float64
 6   distance_SF        1934 non-null   int64  
 7   distance_seattle   1934 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 136.0 KB


In [37]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1934 entries, 0 to 1933
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              1934 non-null   float64
 1   yearsofexperience  1934 non-null   float64
 2   yearsatcompany     1934 non-null   float64
 3   distance_NY        1934 non-null   int64  
 4   bonus              1934 non-null   float64
 5   stockgrantvalue    1934 non-null   float64
 6   distance_SF        1934 non-null   int64  
 7   distance_seattle   1934 non-null   int64  
dtypes: float64(5), int64(3)
memory usage: 136.0 KB


In [38]:
import numpy as np
inf_df = selected_features.isin([np.inf, -np.inf])
len(inf_df)

1934

In [39]:
selected_features.head()

Unnamed: 0,level,yearsofexperience,yearsatcompany,distance_NY,bonus,stockgrantvalue,distance_SF,distance_seattle
0,5.0,4.0,0.0,4680488,10.0,220.0,110,1297072
1,64.0,11.0,11.0,4606061,23.0,28.0,1296231,798
2,4.0,4.0,4.0,4680488,20.0,280.0,110,1297072
3,6.0,22.0,2.0,2566387,55000.0,296000.0,2714350,4479753
4,5.0,8.0,3.0,4606061,40.0,140.0,1296231,798


In [40]:
df.describe()

Unnamed: 0.1,Unnamed: 0,level,totalyearlycompensation,yearsofexperience,yearsatcompany,stockgrantvalue,bonus,distance_NY,distance_SF,distance_seattle
count,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0,1934.0
mean,33393.588935,14.643226,222.000517,5.050672,2.043873,211.790589,78.555843,3473487.0,1623933.0,1953982.0
std,17503.726027,49.932431,105.199972,4.05517,2.274708,6745.015178,1497.653081,1832508.0,1796548.0,1630193.0
min,745.0,0.0,16.0,0.0,0.0,0.0,0.0,10007.0,110.0,798.0
25%,19355.25,4.0,152.0,2.0,0.0,0.0,7.0,1929914.0,52394.0,1297072.0
50%,33569.5,5.0,200.0,4.0,2.0,30.0,15.0,4606061.0,1296231.0,1348731.0
75%,48975.75,6.0,260.0,7.0,3.0,63.75,27.0,4680488.0,3174687.0,3363231.0
max,62610.0,603.0,1000.0,34.0,20.0,296000.0,55000.0,4796489.0,4977621.0,5305593.0


In [41]:

X = selected_features
y = df.totalyearlycompensation
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, test_size=0.2)

In [42]:
X.shape

(1934, 8)

In [43]:
y.shape

(1934,)

In [44]:
y.head

<bound method NDFrame.head of 0       233.0
1       218.0
2       500.0
3       685.0
4       370.0
        ...  
1929    250.0
1930    145.0
1931    110.0
1932    120.0
1933    233.0
Name: totalyearlycompensation, Length: 1934, dtype: float64>

In [45]:
X.dtypes

level                float64
yearsofexperience    float64
yearsatcompany       float64
distance_NY            int64
bonus                float64
stockgrantvalue      float64
distance_SF            int64
distance_seattle       int64
dtype: object

In [46]:

sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)

In [47]:
np.where(np.isnan(X_trainscaled))

(array([], dtype=int64), array([], dtype=int64))

In [48]:
reg = MLPRegressor(hidden_layer_sizes=(200,100,50),activation="relu" ,
                   random_state=42, learning_rate='adaptive', max_iter=2000, solver='adam', early_stopping=True).fit(X_trainscaled, y_train)


In [49]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [50]:
from sklearn.metrics import mean_absolute_error
y_pred=reg.predict(X_testscaled)
print("The Score with ", (mean_absolute_error(y_pred, y_test)))

The Score with  251.9634507563064


In [51]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(reg, param_distributions = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (200,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, .1],
    'learning_rate': ['constant','adaptive'],
    'max_iter': [1000,2000]},
    random_state=42
)

result = rs.fit(X_train, y_train)

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.ap

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.ap

 -6.80214610e-03 -5.83544129e-03             nan  4.83789078e-02
 -8.74979085e-03 -8.75016507e-03]


In [52]:
best_params = result.best_params_

In [53]:
best_params

{'solver': 'adam',
 'max_iter': 1000,
 'learning_rate': 'constant',
 'hidden_layer_sizes': (200, 100, 50),
 'alpha': 0.1,
 'activation': 'tanh'}

In [54]:
y_test_preds = rs.predict(X_test)
mean_absolute_error(y_test_preds, y_test)

80.20064890502901

In [55]:
y_testscaled_preds = rs.predict(X_testscaled)
mean_absolute_error(y_testscaled_preds, y_testscaled)

from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(reg, param_distributions = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (200,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, .1],
    'learning_rate': ['constant','adaptive'],
    'max_iter': [1000,2000]},
    random_state=42
)

result = rs.fit(X_trainscaled, y_trainscaled)

NameError: name 'y_testscaled' is not defined

In [None]:
mean_absolute_error(df.totalyearlycompensation)

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)

In [None]:
actual_vals = [220, 100, 45, 350]
mean_vals = [150, 150, 150, 150]