In [75]:
import json

from bayes_opt import BayesianOptimization
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Preprocess Data

In [2]:
def json2df(json_path):
    """Loads scraped json data into a Pandas DF
    
    """
    # load data from json
    with open(json_path) as f:
        data = json.load(f)

    # preprocess json data and convert to df
    all_postings = []
    for search_term in data:
        all_postings.extend(data[search_term])
    df = pd.DataFrame(all_postings)
    
    # drop duplicate postings by link
    df = df.drop_duplicates(subset='url')
    return df

In [3]:
def process_location(df):
    df["city"] = np.where(df.location.str.contains(","), df.location.str.split(",").str.get(0).str.strip(), None)
    df["province"] = np.where(df.location.str.contains(","), df.location.str.split(",").str.get(1).str.strip(), None)
    df["country"] = np.where(df.location == "Canada", "Canada", None)
    province_list = ["ON", "BC", "QC", "AB", "SK", "MB", "NS", "NB", "PE", "YT", "NL"]
    df.loc[df.province.isin(province_list), "country"] = "Canada"
    df.loc[(df.province == "Canada") | ~(df.province.isin(province_list)), "city"] = None
    df.loc[(df.province == "Canada") | ~(df.province.isin(province_list)), "province"] = None
    df.loc[(df.province == "Canada") | ~(df.province.isin(province_list)), "country"] = "Canada"
    df = df.drop("location", axis=1)
    return df

In [4]:
def process_num_applicants(df):
    df.num_applicants = np.where(df.num_applicants.str.split(" ").str.get(0).str.isdigit(), 
        df.num_applicants.str.split(" ").str.get(0), df.num_applicants)
    df.num_applicants = np.where(df.num_applicants.str.split(" ").str.get(0) == "Over", 100, df.num_applicants)
    df.num_applicants = df.num_applicants.astype("float64")
    return df

In [5]:
def salary_string2annual(salary):
    try:
        amount = salary.split("/")[0].split("$")[-1].replace("$", "")
        if "K" in amount:
            amount = amount.replace("K", "")
            amount = float(amount)*1000
        amount = float(amount)
        term = salary.split("/")[1].split(" ")[0]
        if term == 'hr':
            amount = amount*2000
    except Exception as e:
        #print(e)
        amount = salary
    return amount

In [6]:
def process_salary(df):
    df.salary = np.where(df.salary.str.contains("$", regex=False), df.salary, None)
    salaries = []
    for i in range(len(df)):
        salary = df.iloc[i].salary
        if salary is not None:
            try:
                if salary.count("$") == 1:
                    # the salary is not a range
                    salaries.append(salary_string2annual(salary))
                else:
                    split_salary_string = salary.split(" ")
                    lower, upper = "", ""
                    for word in split_salary_string:
                        if "$" in word:
                            if lower == "":
                                lower = word
                            else:
                                upper = word
                                break
                    mean_amount = (salary_string2annual(lower) + salary_string2annual(upper))/2
                    salaries.append(mean_amount)
            except Exception as e:
                print(e)
                pass
        else:
            salaries.append(None)
    df["annual_salary"] = salaries
    df = df.drop("salary", axis=1)
    return df

In [7]:
def add_glassdoor_salaries(df, json_path):
    with open(json_path) as f:
        glassdoor_salaries = json.load(f)
    annual_salaries = df.annual_salary.to_list()
    for i in range(len(df)):
        if pd.isnull(annual_salaries[i]):
            url = df.iloc[i].url
            if url in glassdoor_salaries:
                annual_salaries[i] = glassdoor_salaries[url]
    df.annual_salary = annual_salaries
    df.annual_salary = df.annual_salary.astype('float64')
    return df

In [8]:
def preprocess_scraped_data(linkedin_json_path, glassdoor_json_path):
    df = json2df(linkedin_json_path)
    df = process_location(df)
    df = process_num_applicants(df)
    df = process_salary(df)
    df = add_glassdoor_salaries(df, glassdoor_json_path)
    df["job_title"] = df.title
    df = df.drop("title", axis=1)
    df = df[["job_title", "employer", "num_applicants", "annual_salary", "city", "province", "country", "description", "url"]]
    df.to_pickle("linkedin_scraped_posts_0407_processed.pkl")

In [9]:
preprocess_scraped_data("data/linkedin_scraped_posts_0407.json", "data/glassdoor_scraped_salaries_0407.json")

In [10]:
df = pd.read_pickle("linkedin_scraped_posts_0407_processed.pkl")

In [11]:
# drop title + employer duplicates
df = df.drop_duplicates(subset=["job_title", "employer"])

In [12]:
# check jobs missing salaries
df[~(df.annual_salary > 0)]

Unnamed: 0,job_title,employer,num_applicants,annual_salary,city,province,country,description,url
387,Volunteer: Financial Analyst (Remote/Unpaid Vo...,VolunteerMatch,86.0,0.0,Calgary,AB,Canada,About the job\nEmpowered Futures is a non-prof...,https://www.linkedin.com/jobs/view/3817534547/...
949,Software Engineer (Python) - Up to CAD$180k + ...,Hunter Bond,37.0,,,,Canada,About the job\nClient: \nA prestigious technol...,https://www.linkedin.com/jobs/view/3882857588/...
975,Growth Hacker,Let's Roam,36.0,,,,Canada,About the job\nGrowth Marketing & Product Inno...,https://www.linkedin.com/jobs/view/3862563221/...
1021,MLOps Software Engineer - Elite AI Team - Up t...,Hunter Bond,27.0,,Montreal,QC,Canada,About the job\nJob Title: MLOps Software Engin...,https://www.linkedin.com/jobs/view/3879743623/...


In [13]:
display(df.loc[949].job_title)
display(df.loc[975].job_title)
display(df.loc[1021].job_title)

'Software Engineer (Python) - Up to CAD$180k + Huge Bonus Montreal'

'Growth Hacker'

'MLOps Software Engineer - Elite AI Team - Up to $140k CAD + Bonus'

In [14]:
# update salaries based on title and glassdoor search
df.loc[949, "annual_salary"] = 180000
df.loc[975, "annual_salary"] = 55000
df.loc[1021, "annual_salary"] = 140000

In [15]:
# remove volunteer position to avoid outliers
df = df.drop(387)

In [16]:
df[~(df.annual_salary > 0)]

Unnamed: 0,job_title,employer,num_applicants,annual_salary,city,province,country,description,url


In [17]:
df = df.reset_index()
df = df.drop('index', axis=1)
print(len(df))

596


# Analysis and Feature Engineering

In [18]:
df.head()

Unnamed: 0,job_title,employer,num_applicants,annual_salary,city,province,country,description,url
0,Senior Data Scientist,Akkodis,76.0,118000.0,Toronto,ON,Canada,About the job\nHi Candidate\nI hope you are do...,https://www.linkedin.com/jobs/view/3881928212/...
1,Lead Data Scientist/Machine Learning Engineer ...,Agoda,41.0,132000.0,Toronto,ON,Canada,About the job\nAbout Agoda\n\nAgoda is an onli...,https://www.linkedin.com/jobs/view/3839799162/...
2,"Practice Lead Data Scientist, Data Driven Mark...",Cogeco Connexion,41.0,120000.0,Montreal,QC,Canada,About the job\nOur culture lifts you up—there ...,https://www.linkedin.com/jobs/view/3848292659/...
3,Senior Data Scientist,Clio - Cloud-Based Legal Technology,100.0,165000.0,,,Canada,About the job\nClio is more than just a tech c...,https://www.linkedin.com/jobs/view/3827050664/...
4,Lead Data Scientist,Logikk,100.0,132000.0,Montreal,QC,Canada,About the job\nLead Data Scientist - Leading A...,https://www.linkedin.com/jobs/view/3865773248/...


In [19]:
def cross_with_annual_salary(df, other_column, num_cuts):
    percentiles = []
    for i in range(num_cuts):
        next_percentile = round(np.nanpercentile(df[other_column], i*100/(num_cuts)), 2)
        if next_percentile not in percentiles:
            percentiles.append(next_percentile)
    percentiles.append(max(df[other_column]))

    formatted_ranges, mean_salaries = [], []
    for i in range(1, len(percentiles)):
        if i != len(percentiles)-1:
            formatted_ranges.append(f"[{percentiles[i-1]}-{percentiles[i]})")
            mean_salaries.append(round(df[(df[other_column] >= percentiles[i-1]) & (df[other_column] < percentiles[i])].annual_salary.mean(), 2))
        else:
            formatted_ranges.append(f"[{percentiles[i-1]}-{percentiles[i]}]")
            mean_salaries.append(round(df[(df[other_column] >= percentiles[i-1]) & (df[other_column] <= percentiles[i])].annual_salary.mean(), 2))
    
    display_df = pd.DataFrame()
    display_df[other_column] = formatted_ranges
    display_df["Mean Annual Salary"] = mean_salaries
    return display_df

In [20]:
# salary by number of applicants
cross_with_annual_salary(df, "num_applicants", 10)

Unnamed: 0,num_applicants,Mean Annual Salary
0,[0.0-14.0),87381.36
1,[14.0-29.0),87416.43
2,[29.0-53.0),87218.55
3,[53.0-81.0),83020.83
4,[81.0-100.0),94984.38
5,[100.0-100.0],90964.22


In [21]:
# salary by city
df.groupby("city").annual_salary.agg(['count', 'min', 'max', 'mean']).sort_values(by='count', ascending=False)[:10]

Unnamed: 0_level_0,count,min,max,mean
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toronto,129,50000.0,200000.0,91430.232558
Vancouver,76,35500.0,209000.0,87375.0
Montreal,64,40000.0,154000.0,89071.09375
Calgary,34,37500.0,137500.0,82220.588235
Winnipeg,18,55000.0,128000.0,84805.555556
Mississauga,15,26000.0,200000.0,84766.666667
Markham,11,59000.0,123000.0,90909.090909
Ottawa,10,59000.0,200000.0,98000.0
Regina,10,50000.0,91000.0,70300.0
Edmonton,9,55000.0,130500.0,84333.333333


In [22]:
# salary by province
df.groupby("province").annual_salary.agg(['count', 'min', 'max', 'mean']).sort_values(by='mean', ascending=False)[:10]

Unnamed: 0_level_0,count,min,max,mean
province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ON,205,26000.0,200000.0,89404.878049
QC,86,31000.0,154000.0,85587.790698
BC,104,35500.0,209000.0,85167.019231
MB,18,55000.0,128000.0,84805.555556
NB,3,59000.0,132000.0,83333.333333
AB,45,37500.0,137500.0,81422.222222
NS,3,57000.0,107000.0,79666.666667
SK,11,50000.0,91000.0,71409.090909
YT,1,36500.0,36500.0,36500.0


In [25]:
df.city.unique()

array(['Toronto', 'Montreal', None, 'Calgary', 'Waterloo', 'Oakville',
       'Ottawa', 'Aurora', 'Surrey', 'Mississauga', 'Woodbridge',
       'Winnipeg', 'Brampton', 'Vancouver', 'Markham', 'Bedford',
       'Chiasson Office', 'Whitehorse', 'Regina', 'Dorval', 'Vaughan',
       'Kelowna', 'Québec', 'Chalk River', 'Hull', 'Shawinigan',
       'Hamilton', 'North Vancouver', 'Sainte-Anne-de-Bellevue',
       'Edmonton', 'Metro Vancouver A', 'Halifax', 'Burlington',
       'Boucherville', 'Acheson', 'Alberta Beach', 'Brunswick',
       'Greater Vancouver', 'Dartmouth', 'Victoria', 'Gatineau',
       'Burnaby', 'Sherbrooke', 'St. Jacobs', 'Guelph',
       'Dollard-des-Ormeaux', 'Montréal-Est', 'Longueuil',
       'Montréal-Ouest', 'Saskatoon', 'Richmond', 'New Westminster',
       'Nelson', 'Blacks Harbour', 'Pointe-Claire', 'Bowmanville',
       'Kanata', 'Scarborough', 'St-Hyacinthe', 'Laval'], dtype=object)

In [39]:
# convert city and province into one hot encoding
province_mapping = {}
for p in df.province.unique():
    if p is not None:
        province_mapping[p] = len(province_mapping)
city_mapping = {}
for c in df.city.unique():
    if c is not None:
        city_mapping[c] = len(city_mapping)

# save encodings
with open("data/province_encoding.json", "w") as f:
    json.dump(province_mapping, f, indent=4)
with open("data/city_encoding.json", "w") as f:
    json.dump(city_mapping, f, indent=4)

# update df
#df["province_one_hot"] = df.province.map(province_mapping)
#df["city_one_hot"] = df.city.map(city_mapping)

In [43]:
# check how different title pronouns impact salary
def compare_titles(titles):
    title_names, min, mean, max = [], [], [], []
    for title in titles:
        title_names.append(title)
        subset = df[df.job_title.str.lower().str.contains('|'.join(titles[title]))]
        min.append("$" + str(subset.annual_salary.min()))
        mean.append("$" + str(round(subset.annual_salary.mean(), 2)))
        max.append("$" + str(subset.annual_salary.max()))
    display_df = pd.DataFrame()
    display_df["Job Title"] = title_names
    display_df["Min Annual Salary"] = min
    display_df["Mean Annual Salary"] = mean
    display_df["Max Annual Salary"] = max
    return display_df
    
titles = {
    "Junior": ["jr", "junior"],
    "Intermediate": ["intermediate"],
    "Average": [""],
    "Senior": ["sr", "senior"],
    "Manager": ["manage"],
    "Lead": ["lead"],
    "Director": ["director"],
    "Vice President": ["vp"]
}
compare_titles(titles)

Unnamed: 0,Job Title,Min Annual Salary,Mean Annual Salary,Max Annual Salary
0,Junior,$45500.0,$66500.0,$115000.0
1,Intermediate,$65000.0,$71166.67,$76000.0
2,Average,$26000.0,$89302.72,$210000.0
3,Senior,$46000.0,$96432.34,$209000.0
4,Manager,$54500.0,$102409.09,$200000.0
5,Lead,$37500.0,$120108.7,$200000.0
6,Director,$137500.0,$137500.0,$137500.0
7,Vice President,$154000.0,$164750.0,$175500.0


In [44]:
# check for keywords in title impact summary
titles = {
    "Average": [""],
    "AI": ["ai", "ml", "machine learning"],
    "Engineer": ["engineer"],
    "Scientist": ["scientist"],
    "Analyst": ["analyst"],
    "Finance": ["financial", "finance"]
}
compare_titles(titles)

Unnamed: 0,Job Title,Min Annual Salary,Mean Annual Salary,Max Annual Salary
0,Average,$26000.0,$89302.72,$210000.0
1,AI,$45000.0,$106700.53,$209000.0
2,Engineer,$37500.0,$106133.66,$205000.0
3,Scientist,$50000.0,$104835.16,$187500.0
4,Analyst,$26000.0,$76643.63,$162000.0
5,Finance,$35500.0,$75705.49,$175500.0


In [45]:
# add title pronouns and keywords as features
df["is_junior"] = np.where((df.job_title.str.lower().str.contains("jr")) | (df.job_title.str.lower().str.contains("junior")), 1, 0)
df["is_intermediate"] = np.where(df.job_title.str.lower().str.contains("intermediate"), 1, 0)
df["is_senior"] = np.where((df.job_title.str.lower().str.contains("sr")) | (df.job_title.str.lower().str.contains("senior")), 1, 0)
df["is_manager"] = np.where(df.job_title.str.lower().str.contains("manage"), 1, 0)
df["is_lead"] = np.where(df.job_title.str.lower().str.contains("lead"), 1, 0)
df["is_director"] = np.where(df.job_title.str.lower().str.contains("director"), 1, 0)
df["is_vp"] = np.where(df.job_title.str.lower().str.contains("vp"), 1, 0)
df["is_ai"] = np.where((df.job_title.str.lower().str.contains("ai")) | (df.job_title.str.lower().str.contains("ml")) | (df.job_title.str.lower().str.contains("machine learning")), 1, 0)
df["is_engineer"] = np.where(df.job_title.str.lower().str.contains("engineer"), 1, 0)
df["is_scientist"] = np.where(df.job_title.str.lower().str.contains("scientist"), 1, 0)
df["is_analyst"] = np.where(df.job_title.str.lower().str.contains("analyst"), 1, 0)
df["is_finance"] = np.where((df.job_title.str.lower().str.contains("finance")) | (df.job_title.str.lower().str.contains("financial")), 1, 0)

In [46]:
# find maximum number of years experience in description
# and also number of different experiences required
def parse_int(word):
    word = word.split('-')[-1]
    final_int = ""
    for char in word:
        if char.isnumeric():
            final_int += char
        elif len(final_int):
            break
    if not len(final_int):
        final_int = -1
    return int(final_int)

years_experience, number_of_experiences = [], []
for desc in df.description.tolist():
    highest_years = None
    num_experiences = 0
    split = desc.lower().split('year')
    if len(split) > 1:
        for i, phrase in enumerate(split):
            if 'experience' in phrase[-100:] or (i < len(split) - 1 and 'experience' in split[i+1][:100]):
                split_phrase = phrase.strip().split(' ')
                num_years = parse_int(split_phrase[-1])
                if num_years > 0:
                    num_experiences += 1
                    if highest_years is None or num_years > highest_years:
                        highest_years = num_years
    years_experience.append(highest_years)
    number_of_experiences.append(num_experiences)
df['years_experience'] = years_experience
df['number_of_experiences'] = number_of_experiences
df.loc[df.years_experience > 15, 'years_experience'] = None

In [47]:
cross_with_annual_salary(df, "years_experience", 5)

Unnamed: 0,years_experience,Mean Annual Salary
0,[1.0-3.0),90220.0
1,[3.0-4.0),88035.26
2,[4.0-5.0),93885.71
3,[5.0-6.0),92848.66
4,[6.0-14.0],98686.71


In [48]:
cross_with_annual_salary(df, "number_of_experiences", 10)

Unnamed: 0,number_of_experiences,Mean Annual Salary
0,[0.0-1.0),84227.92
1,[1.0-2.0),90112.68
2,[2.0-5],98721.05


In [49]:
df.head()

Unnamed: 0,job_title,employer,num_applicants,annual_salary,city,province,country,description,url,is_junior,...,is_lead,is_director,is_vp,is_ai,is_engineer,is_scientist,is_analyst,is_finance,years_experience,number_of_experiences
0,Senior Data Scientist,Akkodis,76.0,118000.0,Toronto,ON,Canada,About the job\nHi Candidate\nI hope you are do...,https://www.linkedin.com/jobs/view/3881928212/...,0,...,0,0,0,0,0,1,0,0,7.0,3
1,Lead Data Scientist/Machine Learning Engineer ...,Agoda,41.0,132000.0,Toronto,ON,Canada,About the job\nAbout Agoda\n\nAgoda is an onli...,https://www.linkedin.com/jobs/view/3839799162/...,0,...,1,0,0,1,1,1,0,0,4.0,1
2,"Practice Lead Data Scientist, Data Driven Mark...",Cogeco Connexion,41.0,120000.0,Montreal,QC,Canada,About the job\nOur culture lifts you up—there ...,https://www.linkedin.com/jobs/view/3848292659/...,0,...,1,0,0,0,0,1,0,0,10.0,1
3,Senior Data Scientist,Clio - Cloud-Based Legal Technology,100.0,165000.0,,,Canada,About the job\nClio is more than just a tech c...,https://www.linkedin.com/jobs/view/3827050664/...,0,...,0,0,0,0,0,1,0,0,5.0,1
4,Lead Data Scientist,Logikk,100.0,132000.0,Montreal,QC,Canada,About the job\nLead Data Scientist - Leading A...,https://www.linkedin.com/jobs/view/3865773248/...,0,...,1,0,0,0,0,1,0,0,,0


# Model Training

In [50]:
df.dtypes

job_title                 object
employer                  object
num_applicants           float64
annual_salary            float64
city                      object
province                  object
country                   object
description               object
url                       object
is_junior                  int32
is_intermediate            int32
is_senior                  int32
is_manager                 int32
is_lead                    int32
is_director                int32
is_vp                      int32
is_ai                      int32
is_engineer                int32
is_scientist               int32
is_analyst                 int32
is_finance                 int32
years_experience         float64
number_of_experiences      int64
dtype: object

In [53]:
# clean dataset and split into train and test
good_cols = []
for col in df:
    if df[col].dtype != "object":
        good_cols.append(col)
X = df[good_cols]
y = X.annual_salary
X = X.drop("annual_salary", axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=57)

In [54]:
X_train

Unnamed: 0,num_applicants,is_junior,is_intermediate,is_senior,is_manager,is_lead,is_director,is_vp,is_ai,is_engineer,is_scientist,is_analyst,is_finance,years_experience,number_of_experiences
312,100.0,0,0,0,0,0,0,0,0,0,0,1,0,5.0,1
361,100.0,0,0,0,0,0,0,0,0,0,0,1,1,,0
248,11.0,0,0,1,0,0,0,0,0,0,0,1,1,7.0,1
51,100.0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,1
553,100.0,0,0,1,0,0,0,0,1,1,0,0,0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,100.0,0,0,0,0,0,0,0,0,0,1,0,0,1.0,1
98,22.0,0,0,0,0,0,1,0,0,0,0,0,0,8.0,2
79,100.0,0,0,0,1,0,0,0,0,0,0,0,0,,3
406,48.0,0,0,0,0,0,0,0,0,0,0,1,0,,0


In [97]:
# fina optimal hyperparameters

def test_hyperparameters(**hyper_parameters):
    hyper_parameters["n_estimators"] = int(hyper_parameters["n_estimators"])
    hyper_parameters["max_depth"] = int(hyper_parameters["max_depth"])
    xgbc = XGBRegressor(**hyper_parameters, objective="reg:squarederror")
    xgbc.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)
    return -xgbc.evals_result()["validation_0"]["rmse"][-1]
    

pbounds = {
    "gamma": (0, 50), 
    "max_depth": (3, 6),
    "min_child_weight": (1, 50),
    "subsample": (0.5, 1), 
    "colsample_bytree": (0.5, 1),
    "colsample_bylevel": (0.5, 1),
    "lambda": (0, 50),
    "alpha": (0, 50),
    "n_estimators": (1, 200),
    "learning_rate": (0, 1)
}

optimizer = BayesianOptimization(
    test_hyperparameters,
    pbounds=pbounds,
    verbose=2,
    random_state=57,
)

optimizer.maximize(
    init_points=2,
    n_iter=30,
)

|   iter    |  target   |   alpha   | colsam... | colsam... |   gamma   |  lambda   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-2.65e+04[0m | [0m4.367    [0m | [0m0.6152   [0m | [0m0.7055   [0m | [0m15.54    [0m | [0m28.3     [0m | [0m0.5451   [0m | [0m5.421    [0m | [0m45.99    [0m | [0m104.9    [0m | [0m0.7123   [0m |
| [95m2        [0m | [95m-2.428e+0[0m | [95m3.59     [0m | [95m0.9493   [0m | [95m0.7103   [0m | [95m29.11    [0m | [95m10.71    [0m | [95m0.4475   [0m | [95m4.404    [0m | [95m5.931    [0m | [95m185.5    [0m | [95m0.5805   [0m |
| [0m3        [0m | [0m-2.796e+0[0m | [0m0.6921   [0m | [0m0.6559   [0m | [0m0.5947   [0m | [0m29.55    [0m | [0m12.97    [0m | [0m0.9002   [0m | [0m4.38     [0m | [0m7.41     [0m | [0m182.0 

In [101]:
# train final model
best_hyperparameters = optimizer.max["params"]
best_hyperparameters["n_estimators"] = int(best_hyperparameters["n_estimators"])
best_hyperparameters["max_depth"] = int(best_hyperparameters["max_depth"])
xgbc_final = XGBRegressor(**best_hyperparameters, objective="reg:squarederror")
xgbc_final.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

In [106]:
# test final model
predictions = xgbc_final.predict(X_test)
display_df = pd.DataFrame()
display_df["predictions"] = predictions
display_df["actual"] = y_test.tolist()

diff = display_df.predictions - display_df.actual
diff.agg

0      24076.742188
1      51882.734375
2      25697.000000
3      28786.773438
4     -12639.429688
           ...     
115    -6465.601562
116    10312.585938
117   -16934.148438
118   -37390.429688
119    32544.867188
Length: 120, dtype: float64