In [20]:
import pandas as pd
import numpy as np
import json
import Levenshtein
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import pickle

In [None]:
# Global Variables
# Path of the pickled model that is loaded and used for prediction below.
MODEL_FILE = 'best_model.pkl'
# Path of the CSV holding the rows to run through the pipeline and score.
SOURCE_FILE = 'X_test.csv'

In [21]:
# Deserialize the model object stored at MODEL_FILE.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so MODEL_FILE must only ever point at a trusted artifact.
with open(MODEL_FILE, mode='rb') as model_fh:
    best_model = pickle.load(model_fh)

In [22]:
# The transformers
class JobTitleTransformer(BaseEstimator, TransformerMixin):
    """Replace each raw 'Job Title' with the closest known job title.

    The known titles are read once, at construction time, from the
    'job-titles' key of ``json_file``. "Closest" means the candidate with the
    smallest Levenshtein edit distance; ties go to the candidate that appears
    first in the list.
    """
    # Path of the JSON file holding the reference titles.
    json_file = 'job-titles.json'

    def __init__(self):
        # Read the json of correct job titles. JSON is UTF-8 by specification,
        # so do not depend on the platform's default encoding.
        with open(self.json_file, 'r', encoding='utf-8') as f:
            job_titles = json.load(f)
        self.job_titles = job_titles['job-titles']

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose 'Job Title' column is normalised.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Job Title' column of strings.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame; the caller's ``X`` is left untouched. Missing titles
            stay missing instead of being sent to the distance function.
        """
        X = X.copy()
        # '|' is turned into a space before matching.
        cleaned = X['Job Title'].str.replace('|', ' ', regex=False)
        # The distance search is done once per distinct title, not once per row.
        closest_by_title = {
            title: self._get_closest_text(title, texts=self.job_titles)
            for title in cleaned.dropna().unique()
        }
        # Every non-missing title is a key of the mapping, so map() covers all
        # rows; NaN is not a key and therefore stays NaN.
        X['Job Title'] = cleaned.map(closest_by_title)
        return X

    @staticmethod
    def _get_closest_text(query_text, texts):
        """Return the element of ``texts`` with the smallest edit distance to ``query_text``.

        Raises
        ------
        ValueError
            If ``texts`` is empty.
        """
        if len(texts) == 0:
            raise ValueError('texts must contain at least one candidate title')
        return min(texts, key=lambda candidate: Levenshtein.distance(query_text, candidate))

class JobCategoryTransformer(BaseEstimator, TransformerMixin):
    """Add 0/1 job-category indicator columns derived from 'Job Title'.

    One column is added per key of ``job_categories_keywords`` plus a final
    'Other' column. A category is flagged when any of its keywords occurs as a
    plain substring of the lower-cased title, so a single title may flag
    several categories. 'Other' is 1 only when no category matched.
    """
    # Keyword lists are matched as substrings, in lower case.
    job_categories_keywords = {
        "Engineering": ['engineer', 'developer', 'software'],
        "Data": ['data', 'analyst', 'scientist'],
        # The original list named 'marketing' twice; the duplicate had no effect.
        "Marketing": ['marketing', 'sales'],
        "Human Resources": ['hr', 'human resources'],
        "Finance": ['finance', 'financial', 'accountant'],
        "Management": ['manager', 'director', 'lead', 'supervisor', 'head', 'operations', 'ceo'],
    }

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the category indicator columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Job Title' column of strings.

        Returns
        -------
        pandas.DataFrame
            New frame; the caller's ``X`` is left untouched. Missing titles
            match no keyword and are therefore flagged as 'Other'. An empty
            frame yields an empty frame that still has the new columns.
        """
        X = X.copy()
        lowered_titles = X['Job Title'].str.lower()
        categories = list(self.job_categories_keywords)

        # Boolean match matrix: one row per input row, one column per category.
        matches = np.zeros((len(X), len(categories)), dtype=bool)
        for column, keywords in enumerate(self.job_categories_keywords.values()):
            for keyword in keywords:
                # regex=False keeps this a literal substring test, the same as
                # `keyword in title.lower()`; na=False treats missing titles as
                # "no match".
                matches[:, column] |= lowered_titles.str.contains(
                    keyword, regex=False, na=False).to_numpy(dtype=bool)

        # Rows that matched no category at all fall into 'Other'.
        unmatched = ~matches.any(axis=1)
        one_hot_encodings = np.column_stack([matches, unmatched]).astype(int)

        X[[*categories, 'Other']] = one_hot_encodings

        return X

class SeniorityTransformer(BaseEstimator, TransformerMixin):
    """Add an integer 'Seniority' column derived from 'Job Title'.

    The value is the position, in ``seniority_keywords``, of the first level
    that has a keyword occurring as a substring of the lower-cased title.
    Titles that match no keyword get ``default_seniority``.
    """
    # Levels are checked in this order; the dict position is the encoded value.
    seniority_keywords = {
        "Junior": ['junior', 'jr', 'entry'],
        "Mid-Level": [],
        "Senior": ['senior', 'sr', 'lead'],
        "Executive": ['executive', 'chief', 'director', 'ceo'],
    }

    # Encoded value used when no keyword matches: the position of 'Mid-Level'.
    default_seniority = list(seniority_keywords.keys()).index('Mid-Level')

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _level_for(self, job_title):
        """Return the encoded seniority level for a single job title."""
        # Non-string values (e.g. NaN) cannot be keyword-matched; use the default.
        if not isinstance(job_title, str):
            return self.default_seniority
        lowered = job_title.lower()
        for level, keywords in enumerate(self.seniority_keywords.values()):
            if any(keyword in lowered for keyword in keywords):
                return level
        return self.default_seniority

    def transform(self, X):
        """Return a copy of ``X`` with the 'Seniority' column added.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a 'Job Title' column.

        Returns
        -------
        pandas.DataFrame
            New frame; the caller's ``X`` is left untouched.
        """
        X = X.copy()
        X['Seniority'] = [self._level_for(title) for title in X['Job Title']]
        return X

class CountryTransformer(BaseEstimator, TransformerMixin):
    """Repair known truncated or abbreviated values in the 'Country' column."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame whose 'Country' values have been corrected.

        Only cells whose whole value equals one of the keys below are replaced;
        every other value passes through unchanged. The caller's ``X`` is not
        modified.
        """
        corrected = X['Country'].replace({
            'australi': 'australia',
            'us': 'united states',
            'u': 'unknown',
            'chin': 'china',
            'canad': 'canada'
        })
        # assign() builds a new frame and keeps 'Country' in its original
        # position, so the input frame is never written to.
        return X.assign(Country=corrected)

class EducationTitleTransformer(BaseEstimator, TransformerMixin):
    """Map raw 'Education Level' spellings onto their canonical labels."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame whose 'Education Level' values are canonicalised.

        The keys below are compared against whole cell values as literal text
        (the '|' is an ordinary character here, not a regex alternation).
        Values that are not listed pass through unchanged. The caller's ``X``
        is not modified.
        """
        canonical = X['Education Level'].replace({
            "high|scho": "High School",
            "bachelor's|degr": "Bachelor's Degree",
            "bachelor": "Bachelor's Degree",
            "master's|degr": "Master's Degree",
            "master": "Master's Degree",
            "p": "Ph.D."
        })
        # The column name contains a space, so it is passed through ** rather
        # than as a keyword; assign() returns a new frame.
        return X.assign(**{'Education Level': canonical})

class OutlierTransformer(BaseEstimator, TransformerMixin):
    """Row filter that removes records with implausible or placeholder values."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None, years=40, bad_countries=['RazShmuelykingdom'],
                  bad_races=['NayaCollegegeek'], bad_genders=['O']):
        """Return a copy of the rows of ``X`` that pass every filter.

        A row is kept when 'Years of Experience' is at most ``years`` and its
        'Country', 'Race' and 'Gender' values are not in the respective
        ``bad_*`` collections. The list defaults are only read, never modified.
        """
        experience_ok = X['Years of Experience'] <= years
        # A row is rejected as soon as any one of the three columns is blacklisted.
        blacklisted = (
            X['Country'].isin(bad_countries)
            | X['Race'].isin(bad_races)
            | X['Gender'].isin(bad_genders)
        )
        kept_rows = X[experience_ok & ~blacklisted]
        return kept_rows.copy()

def drop_bad_columns(X: pd.DataFrame):
    """Return a copy of ``X`` without the rows that contain a missing value.

    Despite the name, columns are never removed: a row is discarded as soon as
    any one of its cells is NaN/None.
    """
    complete_rows = X.dropna(axis=0, how='any')
    return complete_rows.copy()

In [23]:
# The pipeline
def drop_categorical_features(X):
    """Return ``X`` without its 'Job Title' column; the input frame is not modified."""
    return X.drop(columns=['Job Title'])

# Category order handed to the OrdinalEncoder (lowest to highest level).
education_order = ['High School', "Bachelor's Degree", "Master's Degree", 'Ph.D.']


def _drop_excluded_columns(X):
    """Return ``X`` without any column whose name matches the exclusion regex."""
    excluded = X.filter(regex='Gender|Country|Race|height|weight', axis=1).columns
    keep_mask = ~X.columns.isin(excluded)
    return X.loc[:, keep_mask]


# Column-wise encoders and the scaler; columns not listed here pass through unchanged.
# set_output returns the transformer itself, so `ct` yields DataFrames from transform.
ct = ColumnTransformer(
    transformers=[
        # Despite its name, this step one-hot encodes Gender, Country and Race.
        ('GenderEncoder', OneHotEncoder(drop=None, sparse_output=False), ['Gender', 'Country', 'Race']),
        ('EducationLevelEncoder', OrdinalEncoder(categories=[education_order]), ['Education Level']),
        ('scaler', StandardScaler(), ['Age', 'Years of Experience', 'height', 'weight', 'professionalism']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Steps run in the listed order: row cleaning, text normalisation and derived
# features, outlier filtering, encoding/scaling, then column pruning.
pipeline = Pipeline(steps=[
    ('DropNaTransformer', FunctionTransformer(drop_bad_columns)),
    ('CountryTransformer', CountryTransformer()),
    ('JobTitleTransformer', JobTitleTransformer()),
    ('JobCategoryTransformer', JobCategoryTransformer()),
    ('SeniorityTransformer', SeniorityTransformer()),
    ('EducationTitleTransformer', EducationTitleTransformer()),
    ('OutlierTransformer', OutlierTransformer()),
    ('ColumnTransformers', ct),
    ('DropCategoricalTransformer', FunctionTransformer(drop_categorical_features)),
    # NOTE(review): this presumably also removes the one-hot columns created
    # above, since their names should contain Gender/Country/Race - confirm
    # that this matches the feature set the model expects.
    ('FinalStep', FunctionTransformer(_drop_excluded_columns)),
])

In [25]:
# Read the input dataset and remove the 'Unnamed: 0', 'Unnamed: 0.1' and 'id' columns.
X = pd.read_csv(SOURCE_FILE)
X = X.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'id'])

# Run the preprocessing pipeline.
# NOTE(review): fit_transform fits the scaler and encoders on this very file,
# not on the data the model was trained with - confirm this is intended.
transformed_X = pipeline.fit_transform(X)

# Predict, attach the predictions as a column, and save that column as a json file.
predictions = best_model.predict(transformed_X)
transformed_X['predicted_salary'] = predictions
predicted_salary = transformed_X['predicted_salary']
predicted_salary.to_json('predictions.json')