In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('../resources/train.csv', delimiter='|')
df_predict = pd.read_csv('../resources/predict_case.csv', delimiter='|')

In [3]:
df_train.drop(['id', 'job_description'], axis=1, inplace=True)
df_predict.drop(['job_description'], axis=1, inplace=True)

In [4]:
def cumulative_categorize(df, column, threshold):
    df[column] = df[column].astype(str)
    counts = df[column].value_counts()
    df[column] = df[column].apply(lambda x: x if counts[x] > threshold else 'Other')
    return df

In [5]:
# clean experience_level
mode_experience_level = df_train['experience_level'].mode()[0]
df_train['experience_level'] = df_train['experience_level'].fillna(mode_experience_level)

df_train = cumulative_categorize(df_train, 'experience_level', 50)

# clean education_levels
unique_values = set(df_train['education_level'].unique()) - set(df_predict['education_level'].unique())
df_train.loc[df_train['education_level'].isin(unique_values), 'education_level'] = 'Other'

# clean employment_type
mode_employment_type = df_train['employment_type'].mode()[0]
df_train['employment_type'] = df_train['employment_type'].fillna(mode_employment_type)

df_train = cumulative_categorize(df_train, 'employment_type', 50)

# clean job_function
unique_values = set(df_train['job_function'].unique()) - set(df_predict['job_function'].unique())
df_train.loc[df_train['job_function'].isin(unique_values), 'job_function'] = 'Other'

# clean job_benefits
mode_job_benefits = df_train['job_benefits'].mode()[0]
df_train['job_benefits'] = df_train['job_benefits'].fillna(mode_job_benefits)

unique_values = set(df_train['job_benefits'].unique()) - set(df_predict['job_benefits'].unique())
df_train.loc[df_train['job_benefits'].isin(unique_values), 'job_benefits'] = 'Other'

# clean company_process_time
mode_company_process_time = df_train['company_process_time'].mode()[0]
df_train['company_process_time'] = df_train['company_process_time'].fillna(mode_company_process_time)

unique_values = set(df_train['company_process_time'].unique()) - set(df_predict['company_process_time'].unique())
df_train.loc[df_train['company_process_time'].isin(unique_values), 'company_process_time'] = 'Other'

# clean company_size
mode_company_size = df_train['company_size'].mode()[0]
df_train['company_size'] = df_train['company_size'].fillna(mode_company_size)

unique_values = set(df_train['company_size'].unique()) - set(df_predict['company_size'].unique())
df_train.loc[df_train['company_size'].isin(unique_values), 'company_size'] = 'Other'

# clean company_industry
mode_company_industry = df_train['company_industry'].mode()[0]
df_train['company_industry'] = df_train['company_industry'].fillna(mode_company_industry)

unique_values = set(df_train['company_industry'].unique()) - set(df_predict['company_industry'].unique())
df_train.loc[df_train['company_industry'].isin(unique_values), 'company_industry'] = 'Other'

# clean salary and salary currency
mode_salary_currency = df_train['salary_currency'].mode()[0]
df_train['salary_currency'].fillna(mode_salary_currency, inplace=True)
mean_idr_salary = df_train[df_train['salary_currency'] == 'IDR']['salary'].mean()
mean_usd_salary = df_train[df_train['salary_currency'] == 'USD']['salary'].mean()

df_train.loc[(df_train['salary_currency'] == 'IDR') & (df_train['salary'].isnull()), 'salary'] = mean_idr_salary
df_train.loc[(df_train['salary_currency'] == 'USD') & (df_train['salary'].isnull()), 'salary'] = mean_usd_salary

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['salary_currency'].fillna(mode_salary_currency, inplace=True)


In [220]:
unique_values = set(df_train['job_title'].unique()) - set(df_predict['job_title'].unique())
df_train.loc[df_train['job_title'].isin(unique_values), 'job_title'] = 'Other'

df_train['job_title'].to_csv('../resources/train_cleaned.csv', index=False)

In [162]:
print(unique_values)

{'ANDROID APP DEVELOPER (MOBILE APP DEVELOPMENT)', 'Data Center Operator (Surabaya)', 'Regional Factory Manager', 'GENERAL AFFAIR COORDINATOR', 'Product Marketing Manager - Financial Services', 'Kepala Divisi Stock', 'MARKETING COMMUNICATION', 'Permit Process & Monitoring Staff', 'Manager Operasional & Produksi', 'MARKETING SUPERVISOR ( TRANSPORTASI/KARGO/LOGISTIK )', 'Sales Kemitraan (Indonesia Timur)', 'Telesales Online', 'Sr. People Ops Specialist', 'React JS Frontend Developer', 'Credit Analyst Coordinator', 'Mekanik 1 Articulated Dump Truck (ADT)', 'Telecom sales hunter', 'FP&A Analyst', 'Supervisor Civil Works', 'Production Leader Factory', 'Admin Staff Cabang Ujungberung', 'Staff Pelayanan', 'Staff Productivity (Service)', 'Buying Section Head', 'Rider Team Lead Shopee Express - Hub Ungaran (Kab. Semarang, Jawa Tengah)', 'MARKETING BUSINESS DEVELOPMENT (PENEMPATAN : BANDUNG, SEMARANG & PALEMBANG)', 'Backend Developer (PHP Laravel)', 'Senior Marketing', 'Creative Production Super

In [59]:
# clean job_title
import json
import re

with open('../resources/library.json') as f:
    library = json.load(f)

stopwords = set(['a', 'an', 'the', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'of', 'and', 'or', 'bidang'])

def clean_job_title(text):
    if (text == 'Other'):
        return text
    text = text.lower()
    text = re.sub(r'\W', ' ', text)

    words = text.split()
    new_title = ''
    prev_word = ''
    for word in words:
        if word in stopwords:
            continue
        if word == "front" or word == "back":
            prev_word = word
            continue
        if word == "end":
            if prev_word == "front" or prev_word == "back":
                new_title += prev_word.capitalize() + '' + word.capitalize() + ' '
                prev_word = ''
            continue
        for category, keywords in library.items():
            if word in keywords:
                if category not in new_title:
                    new_title += category + ' '
    # if not new_title:
    #     new_title = 'Other'
    return new_title.strip()

df_new = df_train['job_title'].apply(clean_job_title)
df_new.to_csv('../resources/job_title_category.csv', index=False)

df_new_2 = df_predict['job_title'].apply(clean_job_title)
df_new_2.to_csv('../resources/job_title_category_predict.csv', index=False)

In [230]:
df_new.head(5)

0                 Operations Manager
1                 Management Trainee
2                     ContentCreator
3    Supervisor Sales Administration
4                         Supervisor
Name: job_title, dtype: object

In [124]:
# clean experience_level
mode_experience_level = df_predict['experience_level'].mode()[0]
df_predict['experience_level'] = df_predict['experience_level'].fillna(mode_experience_level)

# clean job benefits
mode_job_benefits = df_predict['job_benefits'].mode()[0]
df_predict['job_benefits'] = df_predict['job_benefits'].fillna(mode_job_benefits)

# clean company_process_time
mode_company_process_time = df_predict['company_process_time'].mode()[0]
df_predict['company_process_time'] = df_predict['company_process_time'].fillna(mode_company_process_time)

# clean company_size
mode_company_size = df_predict['company_size'].mode()[0]
df_predict['company_size'] = df_predict['company_size'].fillna(mode_company_size)

# clean company_industry
mode_company_industry = df_predict['company_industry'].mode()[0]
df_predict['company_industry'] = df_predict['company_industry'].fillna(mode_company_industry)


# clean job_title
df_predict['job_title'] = df_predict['job_title'].apply(clean_job_title)

In [126]:
df_predict['job_title'].value_counts()

job_title
Other                           185
Marketing                       180
Manager                         106
Finance                          81
Marketing Manager                34
                               ... 
HRD Finance Manager               1
Quality Assurance Management      1
Manager Designer                  1
Finance HRD                       1
HRD Consultant                    1
Name: count, Length: 195, dtype: int64

In [163]:
# train model
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train = df_train[['job_title',
        'location',
        'salary_currency',
        'career_level',
        'experience_level', 
        'education_level', 
        'employment_type',
        'job_function', 
        'job_benefits', 
        'company_process_time', 
        'company_size', 
        'company_industry']]

X_train = pd.get_dummies(X_train)
Y_train = df_train['salary']

# split traun and test data
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)
print("mse: ", mean_squared_error(Y_test, Y_pred))
print("accuracy: ", model.score(X_test, Y_test) * 100)

mse:  4430429361378.733
accuracy:  21.218364976067296


In [None]:

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)

In [55]:
Y_predict = model.predict(X_predict)

In [45]:
# Y_pred = model.predict(X_test)
# mse = mean_squared_error(Y_test, Y_pred)
# accuracy = model.score(X_test, Y_test)
# print(f'Mean Squared Error: {mse}')
# print(f'Accuracy: {accuracy*100}%')

Mean Squared Error: 166945006.6086794
Accuracy: 99.1546671918086%


In [56]:
# write csv id and salary
df_predict['salary'] = Y_predict
mean_idr_salary = df_predict[df_predict['salary_currency'] == 'IDR']['salary'].mean()
mean_usd_salary = df_predict[df_predict['salary_currency'] == 'USD']['salary'].mean()
df_predict.loc[(df_predict['salary_currency'] == 'IDR') & (df_predict['salary'].isnull()), 'salary'] = mean_idr_salary
df_predict.loc[(df_predict['salary_currency'] == 'USD') & (df_predict['salary'].isnull()), 'salary'] = mean_usd_salary
df_predict[['id', 'salary']].to_csv('../resources/predicted_salary.csv', index=False)
