In [27]:
import pandas as pd
import numpy as np

In [28]:
df = pd.read_csv('../resources/train.csv', delimiter='|')
df.drop(['id'], axis=1, inplace=True)
df.drop(['job_description'], axis=1, inplace=True)
df = df[df.salary_currency != 'USD']

df_predict = pd.read_csv('../resources/predict_case.csv', delimiter='|')
df_predict.drop(['job_description'], axis=1, inplace=True)

In [29]:
mean_salary = df['salary'].mean()
mode_experience_level = df['experience_level'].mode()[0]
mode_employment_type = df['employment_type'].mode()[0]
mode_job_benefits = df['job_benefits'].mode()[0]
mode_company_process_time = df['company_process_time'].mode()[0]
mode_company_size = df['company_size'].mode()[0]
mode_company_indusry = df['company_industry'].mode()[0]


# impute empty salary based on currency
df['salary'] = df['salary'].fillna(mean_salary)
df['salary_currency'] = df['salary_currency'].fillna('IDR')
df['experience_level'] = df['experience_level'].fillna(mode_experience_level)
df['employment_type'] = df['employment_type'].fillna(mode_employment_type)
df['job_benefits'] = df['job_benefits'].fillna(mode_job_benefits)
df['company_process_time'] = df['company_process_time'].fillna(mode_company_process_time)
df['company_size'] = df['company_size'].fillna(mode_company_size)
df['company_industry'] = df['company_industry'].fillna(mode_company_indusry)

In [30]:
mode_experience_level = df_predict['experience_level'].mode()[0]
mode_job_benefits = df_predict['job_benefits'].mode()[0]
mode_company_process_time = df_predict['company_process_time'].mode()[0]
mode_company_size = df_predict['company_size'].mode()[0]
mode_company_indusry = df_predict['company_industry'].mode()[0]


df_predict['experience_level'] = df_predict['experience_level'].fillna(mode_experience_level)
df_predict['job_benefits'] = df_predict['job_benefits'].fillna(mode_job_benefits)
df_predict['company_process_time'] = df_predict['company_process_time'].fillna(mode_company_process_time)
df_predict['company_size'] = df_predict['company_size'].fillna(mode_company_size)
df_predict['company_industry'] = df_predict['company_industry'].fillna(mode_company_indusry)

df_predict.isnull().sum()

id                      0
job_title               0
location                0
salary_currency         0
career_level            0
experience_level        0
education_level         0
employment_type         0
job_function            0
job_benefits            0
company_process_time    0
company_size            0
company_industry        0
dtype: int64

In [40]:
df['job_title'] = df['job_title'].str.lower()
df_predict['job_title'] = df_predict['job_title'].str.lower()

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the columns to be encoded
categorical_cols = ['job_title', 'experience_level', 'job_benefits']

# Define the transformer for one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Define the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the pipeline on the training data
pipeline.fit(df[['job_title', 'location', 'career_level', 'experience_level', 'education_level', 'employment_type',
        'job_function',  'job_benefits', 'company_size', 'company_industry']], df['salary'])

In [43]:
Y_predict = pipeline.predict(df_predict[['job_title', 'location', 'career_level', 'experience_level', 'education_level', 'employment_type',
        'job_function',  'job_benefits', 'company_size', 'company_industry']])


In [None]:
df_predict['salary'] = Y_predict
mean_idr_salary = df_predict[df_predict['salary_currency'] == 'IDR']['salary'].mean()
mean_usd_salary = df_predict[df_predict['salary_currency'] == 'USD']['salary'].mean()
df_predict.loc[(df_predict['salary_currency'] == 'IDR') & (df_predict['salary'].isnull()), 'salary'] = mean_idr_salary
df_predict.loc[(df_predict['salary_currency'] == 'USD') & (df_predict['salary'].isnull()), 'salary'] = mean_usd_salary
df_predict[['id', 'salary']].to_csv('../resources/predicted_salary.csv', index=False)

In [32]:
# train model
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_categorical = df[['job_title',
        'location', 
        'career_level',
        'experience_level', 
        'education_level', 
        'employment_type',
        'job_function', 
        'job_benefits', 
        # 'company_process_time', 
        'company_size', 
        'company_industry']]

X_predict = df_predict[['job_title',
        'location',
        'career_level',
        'experience_level', 
        'education_level', 
        'employment_type',
        'job_function', 
        'job_benefits', 
        # 'company_process_time', 
        'company_size', 
        'company_industry']]

Y_train = df['salary']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['job_title'])
X_job_title = tokenizer.texts_to_sequences(df['job_title'])
X_job_title = pad_sequences(X_job_title, padding='post')
X = np.concatenate([X_categorical.to_numpy(), X_job_title], axis=1)

# X_train, X_predict = X_train.align(X_predict, join='left', axis=1, fill_value=0)

print(X)

ModuleNotFoundError: No module named 'tensorflow'

In [None]:

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

In [55]:
Y_predict = model.predict(X_predict)

In [45]:
# Y_pred = model.predict(X_test)
# mse = mean_squared_error(Y_test, Y_pred)
# accuracy = model.score(X_test, Y_test)
# print(f'Mean Squared Error: {mse}')
# print(f'Accuracy: {accuracy*100}%')

Mean Squared Error: 166945006.6086794
Accuracy: 99.1546671918086%


In [56]:
# write csv id and salary
df_predict['salary'] = Y_predict
mean_idr_salary = df_predict[df_predict['salary_currency'] == 'IDR']['salary'].mean()
mean_usd_salary = df_predict[df_predict['salary_currency'] == 'USD']['salary'].mean()
df_predict.loc[(df_predict['salary_currency'] == 'IDR') & (df_predict['salary'].isnull()), 'salary'] = mean_idr_salary
df_predict.loc[(df_predict['salary_currency'] == 'USD') & (df_predict['salary'].isnull()), 'salary'] = mean_usd_salary
df_predict[['id', 'salary']].to_csv('../resources/predicted_salary.csv', index=False)
