In [12]:
# Detect whether the notebook runs on Google Colab by probing for the
# `google.colab` package; a failed import is treated as "local kernel".
try:
  import google.colab  # imported only to test availability
  IN_COLAB = True
except ImportError:
  # Catch only the missing-package case so unrelated failures (and
  # KeyboardInterrupt) are not silently interpreted as "not on Colab".
  IN_COLAB = False

In [13]:
if IN_COLAB: 
    !pip install pandas
    !pip install numpy
    !pip install tensorflow

In [14]:
if IN_COLAB:
  # Shell out to nvidia-smi; `-L` asks it to list the GPUs it can see, which
  # shows whether the Colab runtime has an accelerator attached.
  !nvidia-smi -L

In [15]:
if IN_COLAB:
  from google.colab import drive
  import sys

  # Mount Google Drive, then put the Drive root and the project folder on the
  # import path (the later `from common import DataLoader` presumably relies
  # on one of these locations -- confirm where common.py lives).
  drive.mount('/content/drive')
  sys.path.extend([
      '/content/drive/MyDrive',
      '/content/drive/MyDrive/swe-salary-predictor',
  ])

In [1]:
import pandas as pd
from common import DataLoader
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# from scipy import stats
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import seaborn as sns

# Render floats with two decimals in pandas output and activate seaborn's
# theme with color codes enabled.
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(color_codes=True)

In [2]:
# Pick the survey CSV location for the current environment: the Drive copy on
# Colab, the repository's data folder otherwise.
path = (
    './drive/MyDrive/swe-salary-predictor/survey_results_public.csv'
    if IN_COLAB
    else './data/stack-overflow-developer-survey-2021/survey_results_public.csv'
)

# DataLoader exposes its result as `.df`; the log printed below the cell shows
# the pipeline steps it ran.
dl = DataLoader(path)
df = dl.df
df.head()

_start_pipeline:
  runtime=0:00:00.015991, end shape=(83439, 48)
_select:
  runtime=0:00:00.047674, end shape=(40627, 9)
_clean:
  runtime=0:00:11.640552, end shape=(40627, 238)
_remove_outliers:
  runtime=0:00:00.060732, end shape=(31610, 238)
_handle_missing:
  runtime=0:00:00.074087, end shape=(31610, 238)


Unnamed: 0,ConvertedCompYearly,EdLevel,Age1stCode,YearsCode,YearsCodePro,Age,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,...,United Kingdom of Great Britain and Northern Ireland,United Republic of Tanzania,United States of America,Uruguay,Uzbekistan,"Venezuela, Bolivarian Republic of...",Viet Nam,Yemen,Zambia,Zimbabwe
0,51552.0,3.0,14.0,7.0,4.0,29.5,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,46482.0,2.0,14.0,12.0,5.0,29.5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,77290.0,3.0,14.0,15.0,6.0,29.5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17748.0,2.0,7.5,6.0,2.0,29.5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,46135.0,2.0,14.0,9.0,6.0,29.5,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Count missing values per column; the loader log lists a `_handle_missing`
# step, so every count is expected to be 0.
df.isnull().sum()

ConvertedCompYearly                     0
EdLevel                                 0
Age1stCode                              0
YearsCode                               0
YearsCodePro                            0
                                       ..
Venezuela, Bolivarian Republic of...    0
Viet Nam                                0
Yemen                                   0
Zambia                                  0
Zimbabwe                                0
Length: 237, dtype: int64

In [6]:
# Target column for the regression; every remaining column is a feature.
label = 'ConvertedCompYearly'
labels = df[label]
train = df.drop(columns=[label])

# Baseline linear model, fitted in a later cell.
reg = LinearRegression()

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.2,
    random_state=1776,
)

In [8]:
# Fit the baseline model on the training split (the fitted estimator is the
# cell's displayed value).
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [9]:
def performance(x, y, name, model=None):
    """Print RMSE and R^2 of a fitted regressor on one data split.

    Args:
        x: Feature matrix passed to ``model.predict``.
        y: True target values aligned with ``x``.
        name: Heading printed above the metrics (e.g. 'train' or 'test').
        model: Fitted estimator exposing ``predict``. Defaults to the
            notebook-level ``reg`` so existing three-argument calls behave
            exactly as before.

    Returns:
        None. The metrics are only printed.
    """
    if model is None:
        # Resolved at call time so a re-fitted `reg` is picked up.
        model = reg

    y_predict = model.predict(x)
    rmse = np.sqrt(mean_squared_error(y, y_predict))
    r2 = r2_score(y, y_predict)

    print(name)
    print(f'  rmse={rmse}, r2={r2}')


# Report the metrics on both splits so the training fit can be compared with
# the held-out result.
performance(x=x_train, y=y_train, name='train')
performance(x=x_test, y=y_test, name='test')

train
  rmse=33758.308134338076, r2=0.575034508370031
test
  rmse=34421.421519381955, r2=0.5526288983693404


In [None]:
if IN_COLAB:
    # TensorFlow is only installed on the Colab path, so it is imported here
    # rather than in the notebook's main import cell.
    import tensorflow as tf
    from tensorflow import keras
    from IPython.display import display

    # Reload the dataset through the same loader used for the linear model.
    dl = DataLoader(path)
    df = dl.df

    # 80/20 split by random sampling with a fixed seed; the test set is every
    # row that was not sampled into the training set.
    train_dataset = df.sample(frac=0.8, random_state=0)
    test_dataset = df.drop(train_dataset.index)

    # Separate the target column from the features (pop mutates the copies,
    # leaving train_dataset / test_dataset intact).
    train_features = train_dataset.copy()
    test_features = test_dataset.copy()
    train_labels = train_features.pop('ConvertedCompYearly')
    test_labels = test_features.pop('ConvertedCompYearly')

    # A bare expression inside an `if` block is never rendered by the
    # notebook, so the per-column mean/std table is displayed explicitly.
    display(train_dataset.describe().transpose()[['mean', 'std']])

    # Normalization layer whose statistics are learned from the training
    # features only.
    normalizer = tf.keras.layers.Normalization(axis=-1)
    normalizer.adapt(np.array(train_features))
    print(normalizer.mean.numpy())

    # Show one raw row next to its normalized form as a sanity check.
    first = np.array(train_features[:1])

    with np.printoptions(precision=2, suppress=True):
        print('First example:', first)
        print()
        print('Normalized:', normalizer(first).numpy())

In [None]:
def build_and_compile_model(norm):
  """Return a compiled Keras regression network that starts with `norm`.

  The stack is `norm`, two 64-unit ReLU dense layers and a single-unit output
  layer. It is compiled with mean-absolute-error loss and an Adam optimizer
  at a learning rate of 0.001.

  Relies on the notebook-level `keras` and `tf` names, which are imported
  only on the Colab path.
  """
  layers = [
      norm,
      keras.layers.Dense(64, activation='relu'),
      keras.layers.Dense(64, activation='relu'),
      keras.layers.Dense(1),
  ]
  model = keras.Sequential(layers)

  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  model.compile(loss='mean_absolute_error', optimizer=optimizer)
  return model

if IN_COLAB:
    dnn_model = build_and_compile_model(normalizer)
    # `summary()` prints the layer table itself and returns None, so wrapping
    # it in print() only appended a stray "None" line.
    dnn_model.summary()

In [None]:
if IN_COLAB:
    import time

    # `%%time` is a cell magic and is only recognised on the first line of a
    # cell; inside this `if` block it is parsed as an unknown line magic and
    # raises before training starts. Time the fit explicitly instead.
    fit_start = time.perf_counter()
    history = dnn_model.fit(
        train_features,
        train_labels,
        validation_split=0.2,
        verbose=0, epochs=50)
    print(f'Wall time: {time.perf_counter() - fit_start:.1f} s')

    # Loss on the held-out rows (mean absolute error, as configured at
    # compile time in build_and_compile_model).
    print(dnn_model.evaluate(test_features, test_labels, verbose=0))