In [47]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import pandas as pd

df = pd.read_csv("salaries_modified.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,city,state,country,city_state,distance_NY,distance_SF,distance_seattle
0,745,2018-06-05 14:06:30,LinkedIn,5.0,Data Scientist,233.0,4.0,0.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072
1,772,2018-06-08 00:29:47,Amazon,4.0,Data Scientist,140.0,2.0,2.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798
2,776,2018-06-08 09:49:25,Microsoft,64.0,Data Scientist,218.0,11.0,11.0,Seattle,WA,US,Seattle%20CWA,4606061,1296231,798
3,782,2018-06-08 17:55:09,ebay,26.0,Data Scientist,180.0,10.0,5.0,San Jose,CA,US,San Jose%20CCA,4733129,75992,1349713
4,796,2018-06-10 19:39:35,Twitter,4.0,Data Scientist,500.0,4.0,4.0,San Francisco,CA,US,San Francisco%20CCA,4680488,110,1297072


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2056 entries, 0 to 2055
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               2056 non-null   int64  
 1   timestamp                2056 non-null   object 
 2   company                  2056 non-null   object 
 3   level                    2056 non-null   float64
 4   title                    2056 non-null   object 
 5   totalyearlycompensation  2056 non-null   float64
 6   yearsofexperience        2056 non-null   float64
 7   yearsatcompany           2056 non-null   float64
 8   city                     2056 non-null   object 
 9   state                    2056 non-null   object 
 10  country                  2056 non-null   object 
 11  city_state               2056 non-null   object 
 12  distance_NY              2056 non-null   object 
 13  distance_SF              2056 non-null   object 
 14  distance_seattle        

In [49]:
def clean_dist(dist):
    if dist == 'None':
        return None
    try:
        return int(dist)
    except:
        return None

In [50]:
for col in ['distance_NY', 'distance_SF', 'distance_seattle']:
    df[col]=df[col].apply(lambda x: clean_dist(x))

In [51]:
for col in ['distance_NY','distance_SF','distance_seattle']:
    df = df[df[col].notnull()].copy()
    

In [52]:
# Set features. This will also be used as your x values.
selected_features = df[['level', 'yearsofexperience', 'yearsatcompany', 'distance_NY', 'distance_SF', 'distance_seattle']]

In [53]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [54]:
selected_features = selected_features[selected_features.notnull()].copy()
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [55]:
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2055 entries, 0 to 2055
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   level              2055 non-null   float64
 1   yearsofexperience  2055 non-null   float64
 2   yearsatcompany     2055 non-null   float64
 3   distance_NY        2055 non-null   float64
 4   distance_SF        2055 non-null   float64
 5   distance_seattle   2055 non-null   float64
dtypes: float64(6)
memory usage: 112.4 KB


In [56]:
import numpy as np
inf_df = selected_features.isin([np.inf, -np.inf])
len(inf_df)

2055

In [57]:
selected_features.head()

Unnamed: 0,level,yearsofexperience,yearsatcompany,distance_NY,distance_SF,distance_seattle
0,5.0,4.0,0.0,4680488.0,110.0,1297072.0
1,4.0,2.0,2.0,4606061.0,1296231.0,798.0
2,64.0,11.0,11.0,4606061.0,1296231.0,798.0
3,26.0,10.0,5.0,4733129.0,75992.0,1349713.0
4,4.0,4.0,4.0,4680488.0,110.0,1297072.0


In [58]:
df.describe()

Unnamed: 0.1,Unnamed: 0,level,totalyearlycompensation,yearsofexperience,yearsatcompany,distance_NY,distance_SF,distance_seattle
count,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0,2055.0
mean,31790.884672,14.421898,221.609732,5.027251,2.036667,3493419.0,1605568.0,1932465.0
std,18164.814512,48.636267,105.692401,4.026233,2.237007,1824317.0,1788747.0,1625972.0
min,745.0,0.0,16.0,0.0,0.0,10007.0,110.0,798.0
25%,16258.5,4.0,150.0,2.0,0.1,2076857.0,52394.0,1297072.0
50%,31591.0,5.0,200.0,4.0,2.0,4606061.0,1296231.0,1348731.0
75%,48258.0,6.0,260.0,7.0,3.0,4680488.0,3100688.0,3363231.0
max,62610.0,603.0,1000.0,34.0,20.0,4796489.0,4977621.0,5305593.0


In [59]:

X = selected_features
y = df.totalyearlycompensation
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1, test_size=0.2)

In [60]:
X.shape

(2055, 6)

In [61]:
y.shape

(2055,)

In [62]:
y.head

<bound method NDFrame.head of 0       233.0
1       140.0
2       218.0
3       180.0
4       500.0
        ...  
2051    250.0
2052    145.0
2053    110.0
2054    120.0
2055    233.0
Name: totalyearlycompensation, Length: 2055, dtype: float64>

In [63]:
X.dtypes

level                float64
yearsofexperience    float64
yearsatcompany       float64
distance_NY          float64
distance_SF          float64
distance_seattle     float64
dtype: object

In [64]:

sc_X = StandardScaler()
X_trainscaled=sc_X.fit_transform(X_train)
X_testscaled=sc_X.transform(X_test)

In [65]:
np.where(np.isnan(X_trainscaled))

(array([], dtype=int64), array([], dtype=int64))

In [66]:
reg = MLPRegressor(hidden_layer_sizes=(200,100,50),activation="relu" ,
                   random_state=42, learning_rate='adaptive', max_iter=2000, solver='adam', 'alpha': [0.0001, 0.05], early_stopping=True).fit(X_trainscaled, y_train)


In [None]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

In [67]:
from sklearn.metrics import mean_absolute_error
y_pred=reg.predict(X_testscaled)
print("The Score with ", (mean_absolute_error(y_pred, y_test)))

The Score with  59.1747030322278


In [74]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(reg, param_distributions = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (200,100,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.01, 0.001, .1],
    'learning_rate': ['constant','adaptive'],
    'max_iter': [1000,2000]},
    random_state=42
)

result = rs.fit(X_train, y_train)

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.ap

  return ((y_true - y_pred) ** 2).mean() / 2
  ret = a @ b
  ret = a @ b
Traceback (most recent call last):
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 673, in fit
    return self._fit(X, y, incremental=False)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 399, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 586, in _fit_stochastic
    self._update_no_improvement_count(early_stopping, X_val, y_val)
  File "C:\Users\richk\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 629, in _update_no_improvement_count
    self.validation_scores_.ap

 -4.33157561e-03 -4.32848959e-03             nan  3.03659878e-02
 -4.27274126e-03 -4.27327194e-03]


In [75]:
best_params = result.best_params_

In [76]:
best_params

{'solver': 'adam',
 'max_iter': 2000,
 'learning_rate': 'constant',
 'hidden_layer_sizes': (200, 100, 50),
 'alpha': 0.01,
 'activation': 'tanh'}

In [79]:
y_test_preds = rs.predict(X_test)
mean_absolute_error(y_test_preds, y_test)

74.12749323331276

In [68]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)

NameError: name 'your_model' is not defined

In [None]:
actual_vals = [220, 100, 45, 350]
mean_vals = [150, 150, 150, 150]