In [94]:
# imports necessary for preprocessing
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

# NLP Imports
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
import re
# Code to download corpora
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# Imports to create Neural Net and metrics associated
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
import random
import tensorflow as tf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cristallobo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cristallobo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cristallobo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cristallobo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [95]:
# Load data in with read_csv
file_path = "./data_science_jobs_indeed_usa.csv"
df = pd.read_csv(file_path)
df.head()

# check the basic information of the data set, dtypes, null values, column names
df.info()

df.isnull().sum()

# Drop the columns that we won't need, Company, ContractType, ContractTime, SalaryRaw, LocationRaw, ID
new_df = df.drop(columns = ["Company", "Rating", "Date", "Location", "Links", "Descriptions"])
new_df.head()

# check datatypes again
new_df.dtypes

# drop the null values of this dataframe
new_df.dropna(inplace = True)

new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1200 non-null   int64  
 1   Title         1200 non-null   object 
 2   Company       1200 non-null   object 
 3   Location      1200 non-null   object 
 4   Rating        745 non-null    float64
 5   Date          1200 non-null   object 
 6   Salary        582 non-null    object 
 7   Description   1199 non-null   object 
 8   Links         1200 non-null   object 
 9   Descriptions  1200 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 93.9+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 582 entries, 1 to 1198
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   582 non-null    int64 
 1   Title        582 non-null    object
 2   Salary       582 non-null    object
 3  

In [96]:
# create a varialbe for wordnetlemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(article):
    sw = set(stopwords.words('english'))
    sw_addons = {"k", "uk","also"} 
    # Substitute everything that is not a letter with an empty string
    regex = re.compile("[^a-zA-Z ]")
    # we sub in an extra character for anything that is not a character from the
    # above line of code
    re_clean = regex.sub('', article)
    # tokenize each word in the sentence
    words = word_tokenize(re_clean)
    # obtain the root word for each word 
    lem = [lemmatizer.lemmatize(word) for word in words]
    # obtain an output that is all lowercase and not in the stop words
    output = [word.lower() for word in lem if word.lower() not in sw.union(sw_addons)]
    output = ' '.join(output)
    return output

# test function on sliced df to make sure it is correct
print(new_df["Description"][1])
clean_text(new_df["Description"][1])

Preferred candidates will have prior experience in implementing Cloud-hosted business process migration in Software as a Service (SAAS) implementations‚Ä¶


'preferred candidate prior experience implementing cloudhosted business process migration software service saas implementation'

In [97]:
# create new column that has the clean description of the job
new_df['CleanDescription'] = new_df['Description'].apply(clean_text)
new_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Salary,Description,CleanDescription
1,1,Business Analyst,$80 - $120 an hour,Preferred candidates will have prior experienc...,preferred candidate prior experience implement...
3,3,Data Engineer,"$90,000 - $110,000 a year",Incorporate core data management competencies ...,incorporate core data management competency in...
4,4,Network Administrator/dba developer,"$50,000 - $70,000 a year",The Network Administrator provides 2nd level e...,network administrator provides nd level enduse...
8,8,Senior Manager-Data Science,$75 - $90 an hour,Stay aware of emerging data science techniques...,stay aware emerging data science technique tec...
10,10,Data Engineer,From $50 an hour,Should have strong data analysis.\nProven expe...,strong data analysisproven experience design i...


In [98]:
# Calculating the COUNT for the working corpus.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words="english", min_df=.12)
count_vectorizer = vectorizer.fit_transform(new_df["CleanDescription"])
words_df = pd.DataFrame(count_vectorizer.toarray(), columns=vectorizer.get_feature_names_out())
words_df.head()



Unnamed: 0,business,data,database,experience,learning,machine,year
0,1,0,0,1,0,0,0
1,0,5,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,2,0,0,0,0,0
4,0,2,0,1,0,0,0


In [99]:
# since there are words that were missed with the stop words, I want to give each word an equal weight of one. I want to do this because
# I do not want the word "said" to out way the word "engineer" as an example
# Filter the dataframe so each word has a weight of 1 
filtered_df_2 = words_df.replace(list(range(1,100)),1)
filtered_df_2.head()

Unnamed: 0,business,data,database,experience,learning,machine,year
0,1,0,0,1,0,0,0
1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,1,0,0,0


In [100]:
import pandas as pd
# combine the two dataframes
combined_df = pd.concat([new_df, filtered_df_2], axis = 1)

# drop the null values from the new dataframe
combined_df.dropna(inplace = True)

# drop the 2 description columns as we no longer need them
combined_df = combined_df.drop(columns = ["Description", "CleanDescription","Unnamed: 0"])
combined_df.head()

Unnamed: 0,Title,Salary,business,data,database,experience,learning,machine,year
1,Business Analyst,$80 - $120 an hour,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Data Engineer,"$90,000 - $110,000 a year",0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Network Administrator/dba developer,"$50,000 - $70,000 a year",0.0,1.0,0.0,1.0,0.0,0.0,0.0
8,Senior Manager-Data Science,$75 - $90 an hour,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10,Data Engineer,From $50 an hour,0.0,1.0,0.0,1.0,1.0,1.0,0.0


In [101]:
import re

def normalize_salary(salary_str):
    """
    Normalizes salary string to hourly rate in USD.
    """
    
    salary_str = salary_str.lower().strip()
    if 'hour' in salary_str:
        # hourly rate
        rate_range = re.findall('\d+\.?\d*', salary_str)
        low_rate = float(rate_range[0])
        high_rate = float(rate_range[-1])
        avg_rate = (low_rate + high_rate) / 2
        #rate = float(re.search('\d+\.?\d*', salary_str).group(0))
        return avg_rate
    else:
        # annual rate
        rate_range = re.findall('\d+\.?\d*', salary_str)
        low_rate = float(rate_range[0].replace(',', ''))
        high_rate = float(rate_range[-1].replace(',', ''))
        avg_rate = (low_rate + high_rate) / 2
        # convert annual rate to hourly rate
        #avg_rate /= 52
        #avg_rate /= 40
        return avg_rate

# Example usage:
print(normalize_salary('80-120 an hour'))  # Output: 100.0
print(normalize_salary('90,000-110,000 a year'))  # Output: 42.30769230769231
print(normalize_salary('50,000-70,000 a year'))  # Output: 21.634615384615383
print(normalize_salary('75-90 an hour'))  # Output: 82.5
print(normalize_salary('From $50 an hour'))  # Output: 50.0


100.0
45.0
25.0
82.5
50.0


In [102]:
combined_df['Salary'] = combined_df['Salary'].astype(str).apply(normalize_salary)
print(combined_df)

                                               Title  Salary  business  data  \
1                                   Business Analyst   100.0       0.0   1.0   
3                                      Data Engineer    45.0       0.0   1.0   
4                Network Administrator/dba developer    25.0       0.0   1.0   
8                        Senior Manager-Data Science    82.5       0.0   1.0   
10                                     Data Engineer    50.0       0.0   1.0   
..                                               ...     ...       ...   ...   
572           Senior Business Intelligence Developer    42.5       0.0   0.0   
574                                 Business Analyst   417.0       0.0   1.0   
575                                    Data Engineer    54.0       0.0   1.0   
577                      Manager of Data Engineering    82.5       1.0   1.0   
581  Director, Team Lead - Data & Analytics Delivery   304.0       0.0   1.0   

     database  experience  learning  ma

In [103]:
# use get dummies to turn the category columns into number columns
encoded_df = pd.get_dummies(combined_df)
print(encoded_df)

     Salary  business  data  database  experience  learning  machine  year  \
1     100.0       0.0   1.0       0.0         0.0       0.0      0.0   0.0   
3      45.0       0.0   1.0       0.0         0.0       0.0      0.0   0.0   
4      25.0       0.0   1.0       0.0         1.0       0.0      0.0   0.0   
8      82.5       0.0   1.0       0.0         0.0       0.0      0.0   0.0   
10     50.0       0.0   1.0       0.0         1.0       1.0      1.0   0.0   
..      ...       ...   ...       ...         ...       ...      ...   ...   
572    42.5       0.0   0.0       1.0         0.0       0.0      0.0   0.0   
574   417.0       0.0   1.0       0.0         0.0       0.0      0.0   0.0   
575    54.0       0.0   1.0       0.0         0.0       0.0      0.0   0.0   
577    82.5       1.0   1.0       0.0         1.0       0.0      0.0   1.0   
581   304.0       0.0   1.0       0.0         0.0       0.0      0.0   0.0   

     Title_AI Data Annotation & Deployment Manager  \
1        

In [104]:
# split the dataset into X and y
X = encoded_df.drop(columns = ["Salary"])
y = encoded_df["Salary"].values.reshape(-1,1)

# look at the shape of each data set
X.shape
y.shape

# lets import train test split to split the data up
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   random_state=78)
# use MinMaxScaler to scale the date
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

# scale the training data
x_scaler.fit(X_train)
y_scaler.fit(y_train)
X_train_scaled = x_scaler.transform(X_train)
X_test_scaled = x_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled =y_scaler.transform(y_test)

In [105]:
#salary_df = pd.read_csv("salary_final.csv")

In [106]:
#salary_df.info()

In [107]:
########################### Neural Network Model #################################

In [117]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

# Set random seed for reproducibility
seed_value = 0
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# Load data
#salary_data = pd.read_csv("https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/salary_data.csv")

# Preprocess data
#X = salary_data.iloc[:, :-1].values
#y = salary_data.iloc[:, -1].values.reshape(-1, 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed_value)

X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)


# Define the neural network architecture
number_input_columns = X.shape[1]
number_hidden_nodes = X.shape[1] * 2

neural_network = Sequential()

neural_network.add(Dense(units=number_input_columns, input_dim=number_input_columns, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dropout(0.5))
neural_network.add(Dense(units=1, activation="linear"))

# Define the learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.1
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate

opt = SGD(learning_rate=0.01, momentum=0.8)

# Compile the model
neural_network.compile(loss="mean_absolute_error", optimizer=opt, metrics=["mean_absolute_error"])

# Define the callbacks
lrate = LearningRateScheduler(step_decay)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
callbacks_list = [lrate, es]

# Train the model
nn_model = neural_network.fit(X_train_scaled, y_train_scaled, validation_split=0.2, epochs=100, batch_size=28, callbacks=callbacks_list)

# Evaluate the model
train_loss, train_mae = neural_network.evaluate(X_train_scaled, y_train_scaled, verbose=0)
test_loss, test_mae = neural_network.evaluate(X_test_scaled, y_test_scaled, verbose=0)

print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")

# Make predictions on X_test_scaled
predictions = neural_network.predict(X_test_scaled)
predicted_salaries = y_scaler.inverse_transform(predictions)
real_salaries = y_scaler.inverse_transform(y_test_scaled)

# Evaluate the model using R-squared and correlation coefficient
r2 = np.corrcoef(predicted_salaries.ravel(), real_salaries.ravel())


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# make predictions on X_test_scaled
predictions = neural_network.predict(X_test_scaled)

salaries = pd.DataFrame({
    "Real": real_salaries.ravel(),
    "Predicted": predicted_salaries.ravel()
})
#print(salaries)

# Calculate metrics
mse = mean_squared_error(real_salaries, predicted_salaries)
mae = mean_absolute_error(real_salaries, predicted_salaries)
r2 = r2_score(real_salaries, predicted_salaries)
corr_coef = np.corrcoef(real_salaries.ravel(), predicted_salaries.ravel())[0, 1]

#print("Mean squared error: {:.2f}".format(mse))
print("Mean absolute error: {:.2f}".format(mae))
print("R-squared: {:.2f}".format(r2))
print("Correlation coefficient: {:.2f}".format(corr_coef))

#### test sample values
###
print(new_df.iloc[65:66])
X_train_sample = X_train.iloc[65:66]
y_train_sample = y_train[65:66]
#print(X_train_sample)
#print(y_train_sample)
X_scaled_sample = X_scaler.fit_transform(X_train_sample)
y_scaled_sample  = y_scaler.fit_transform(y_train_sample)
###
prediction_sample = neural_network.predict(X_scaled_sample)

predicted_salary_sample = y_scaler.inverse_transform(prediction_sample)
real_salary_sample = y_scaler.inverse_transform(y_scaled_sample)
salary_sample = pd.DataFrame({
    "Real": real_salary_sample.ravel(),
    "Predicted": predicted_salary_sample.ravel()
})
print(salary_sample)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 27: early stopping
Train MAE: 0.33
Test MAE: 0.40
Mean absolute error: 56.86
R-squared: -0.19
Correlation coefficient: -0.01
     Unnamed: 0                   Title                      Salary  \
134         134  Director, Data Science  $150,000 - $225,000 a year   

                                           Description  \
134  Experience in developing advanced data science...   

                                      CleanDescription  
134  experience developing advanced data science ml...  
   Real  Predicted
0  45.0  44.569637


In [63]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset
#salary_data = pd.read_csv("salary_data.csv")

# Split the dataset into input features (X) and target variable (y)
#X = salary_data.iloc[:, :-1].values
#y = salary_data.iloc[:, -1].values.reshape(-1, 1)

# Split the dataset into training and testing sets
seed_value = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed_value)

# Scale the input features and target variable
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

# Define the neural network architecture
number_input_columns = X.shape[1]
number_hidden_nodes = X.shape[1] * 2

neural_network = Sequential()

neural_network.add(Dense(units=number_input_columns, input_dim=number_input_columns, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dense(units=number_hidden_nodes, activation="relu"))
neural_network.add(Dropout(0.5))
neural_network.add(Dense(units=1, activation="linear"))

# Define the learning rate schedule
def step_decay(epoch):
    initial_lrate = 0.1
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate

# Define the optimizer and compile the model
opt = SGD(learning_rate=0.01, momentum=0.8)
neural_network.compile(loss="mean_absolute_error", optimizer=opt, metrics=["mean_absolute_error"])

# Define the callbacks for the learning rate schedule and early stopping
lrate = LearningRateScheduler(step_decay)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
callbacks_list = [lrate, es]

# Train the model
nn_model = neural_network.fit(X_train_scaled, y_train_scaled, validation_split=0.2, epochs=100, batch_size=28, callbacks=callbacks_list)

# Evaluate the model on the training and testing sets
train_loss, train_mae = neural_network.evaluate(X_train_scaled, y_train_scaled, verbose=0)
test_loss, test_mae = neural_network.evaluate(X_test_scaled, y_test_scaled, verbose=0)

print(f"Train MAE: {train_mae:.2f}")
print(f"Test MAE: {test_mae:.2f}")

# Make predictions on the testing set and inverse transform the scaled predictions and target variable
predictions = neural_network.predict(X_test_scaled)
predicted_salaries = y_scaler.inverse_transform(predictions)
real_salaries = y_scaler.inverse_transform(y_test_scaled)

# Print the
# Evaluate the model using R-squared and correlation coefficient
r2 = np.corrcoef(predicted_salaries.ravel(), real_salaries.ravel())


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# make predictions on X_test_scaled
predictions = neural_network.predict(X_test_scaled)

salaries = pd.DataFrame({
    "Real": real_salaries.ravel(),
    "Predicted": predicted_salaries.ravel()
})
print(salaries)

# Calculate metrics
mse = mean_squared_error(real_salaries, predicted_salaries)
mae = mean_absolute_error(real_salaries, predicted_salaries)
r2 = r2_score(real_salaries, predicted_salaries)
corr_coef = np.corrcoef(real_salaries.ravel(), predicted_salaries.ravel())[0, 1]

print("Mean squared error: {:.2f}".format(mse))
print("Mean absolute error: {:.2f}".format(mae))
print("R-squared: {:.2f}".format(r2))
print("Correlation coefficient: {:.2f}".format(corr_coef))




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: early stopping
Train MAE: 0.33
Test MAE: 0.58
      Real   Predicted
0   390.00  282.962952
1    47.50   75.963806
2    62.50   55.525646
3    41.00  119.779243
4   386.50   57.057747
5    90.00   60.365604
6    54.00   56.468586
7    47.00   61.247841
8    65.00   94.369225
9    60.00   53.782459
10   70.00   57.930050
11   80.00   84.190987
12   70.00   65.539917
13   62.50   61.549480
14  172.50   61.290668
15   50.00   57.906506
16  250.50   65.539917
17  230.00   54.475750
18   30.00   58.631615
19   90.00   62.469444
20   67.50   62.558018
21  391.50   61.839989
22   29.00   52.123669
23   35.00   50.737148
24   45.00   54.776745
25  154.50   57.363373
26   55.00   53.838581
27   50.00   61.856682
28   37.50   66.607735
29  410.50   57.642387
30  389.00   58.385674
31   47.16   59.936432
32   80.00   71.790359
33