In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer

In [4]:
def cleanSalary(txt):
    if isinstance(txt, str):
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', txt)
        if numbers:
            max_salary = int(numbers[-1].replace(',', ''))
            return max_salary
        else:
            num = re.findall(r'\d{1,3}(?:\.\d{3})*', txt)
            if num:
                max_salary = int(num[-1].replace('.', ''))
                return max_salary
        return None
    return txt

In [5]:
df = pd.DataFrame()

for i in range(1, 14):
    file_name = f'scrapping_result/jobdata{i}.csv'
    dfx = pd.read_csv(file_name) 
    dfx['Salary'] = dfx['Salary'].apply(cleanSalary) 

    num_cols = dfx.select_dtypes(include=['float64']).columns
    imp = SimpleImputer(strategy='mean')
    dfx[num_cols] = imp.fit_transform(dfx[num_cols])

    df = pd.concat([df, dfx], ignore_index=True, sort=False)  


for i in range(1, 14):
    file_name = f'scrapping_result/jobstreet{i}.csv'
    dfx = pd.read_csv(file_name) 
    dfx.drop('Date', axis=1, inplace=True)
    dfx['Salary'] = dfx['Salary'].apply(cleanSalary) 

    num_cols = dfx.select_dtypes(include=['float64']).columns
    imp = SimpleImputer(strategy='mean')
    dfx[num_cols] = imp.fit_transform(dfx[num_cols])

    df = pd.concat([df, dfx], ignore_index=True, sort=False)


df.rename(str.lower, axis='columns', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5138 entries, 0 to 5137
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job title        5137 non-null   object 
 1   company name     5127 non-null   object 
 2   location         5137 non-null   object 
 3   salary           5138 non-null   float64
 4   job type         4176 non-null   object 
 5   job description  5136 non-null   object 
dtypes: float64(1), object(5)
memory usage: 241.0+ KB


In [4]:
df.drop_duplicates(inplace=True)
df = df.dropna(subset=['job title','company name', 'job description'])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3307 entries, 0 to 5137
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job title        3307 non-null   object 
 1   company name     3307 non-null   object 
 2   location         3307 non-null   object 
 3   salary           3307 non-null   float64
 4   job type         2866 non-null   object 
 5   job description  3307 non-null   object 
dtypes: float64(1), object(5)
memory usage: 180.9+ KB


In [5]:
def cleanTitle(txt):
    x = re.sub(r"\s*-\s*job post$", "", txt, flags=re.IGNORECASE).strip().lower()
    return x

df['job title'] = df['job title'].apply(cleanTitle)

df['job title']

0                                    full stack developer
1                                    full stack developer
2                             full stack junior developer
3         senior full-stack developer (angular/expressjs)
4                             front-end (react) developer
                              ...                        
5133                                 blockchain developer
5134                                 blockchain developer
5135                    integrator & operations (backend)
5136                                    symfony developer
5137    application support developer (for banking ind...
Name: job title, Length: 3307, dtype: object

In [6]:
language_df = pd.read_csv('nama.csv')
languages = language_df['title'].tolist()

def extract_languages(job_description):
    found_languages = []
    for language in languages:
        if re.search(fr'\b{language}\b', job_description, re.IGNORECASE):
            found_languages.append(language)
    return found_languages

df['programming language'] = df['job description'].apply(extract_languages)
df.head()

Unnamed: 0,job title,company name,location,salary,job type,job description,programming language
0,full stack developer,Edugate,Jakarta,7000000.0,Full-time,Software Developer with the following requirem...,"[Java, JavaScript, JS, PHP, Node.js, Dart, F*,..."
1,full stack developer,Techtiera Services Indonesia,Bekasi,8000000.0,Full-time,"As a Full-Stack Developer, you will be tasked ...","[JS, F*, JS++, P*, A+]"
2,full stack junior developer,PT Smart Milenium Effisiensi,Depok,5067381.0,Full-time,Kualifikasi\nLaki-laki\nUsia Max 30 Thn\nBelum...,"[JavaScript, JS, SQL, PHP, MySQL, PostgreSQL, ..."
3,senior full-stack developer (angular/expressjs),Worldly,Jakarta,11004710.0,Full-time,Senior Full-Stack Developer (Angular/ExpressJS...,"[JavaScript, JS, MySQL, PostgreSQL, TypeScript..."
4,front-end (react) developer,Bibit.id,Jakarta,11004710.0,Full-time,Company Description\n\nBibit.id is a platform ...,"[JavaScript, JS, F*, Plus, FOCUS, JS++, P*, A+]"


In [7]:
df = df[df['programming language'].str.len() > 0].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3307 entries, 0 to 3306
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   job title             3307 non-null   object 
 1   company name          3307 non-null   object 
 2   location              3307 non-null   object 
 3   salary                3307 non-null   float64
 4   job type              2866 non-null   object 
 5   job description       3307 non-null   object 
 6   programming language  3307 non-null   object 
dtypes: float64(1), object(6)
memory usage: 181.0+ KB


In [8]:
def basic_combine(lang_list):
    return ['Visual Basic' if lang in ['BASIC Programming', 'BASIC'] else lang for lang in lang_list]

def js_typofix(lang_list):
    return ['JavaScript' if lang in ['JS'] else lang for lang in lang_list]

def python_typofix(lang_list):
    return ['Python' if lang in ['Phyton'] else lang for lang in lang_list]

def remove_duplicates(lang_list):
    return list(set(lang_list))

df['programming language'] = df['programming language'].apply(basic_combine)
df['programming language'] = df['programming language'].apply(js_typofix)
df['programming language'] = df['programming language'].apply(python_typofix)
df['programming language'] = df['programming language'].apply(remove_duplicates)

In [9]:
d = {}

def note_numlanguage(languages):
    for lang_list in languages:
        for lang in lang_list:  
            if lang in d:
                d[lang] += 1  
            else:
                d[lang] = 1  

note_numlanguage(df['programming language'])

print(len(d))
for key, value in d.items():
    print(f"{key}: {value}")

203
PHP: 541
Java: 781
P*: 3307
JavaScript: 1183
JS++: 581
Node.js: 312
Dart: 34
F*: 3307
A+: 2327
Visual Basic: 253
PostgreSQL: 395
TypeScript: 228
.QL: 861
T: 185
SQL: 860
MySQL: 543
USD: 18
CHAIN: 65
D++: 112
Flow: 128
FOCUS: 337
Plus: 809
never: 71
video: 47
Python: 722
Link: 42
kernel: 1
Address: 148
S3: 35
ASP.NET: 91
C: 402
C++: 406
Go: 224
Ruby: 114
GraphQL: 65
Ada: 88
MariaDB: 27
GAME: 95
Scheme: 6
Low*: 58
Expect: 33
Copilot: 8
Solidity: 5
pipelines: 250
solid: 360
RAPID: 31
lain: 88
NPL: 1
april: 8
Nice: 75
Fun: 186
flagship: 11
Swift: 108
Small: 75
Kotlin: 130
Simplicity: 5
Perl: 19
Scratch: 27
R: 153
R++: 153
F: 14
Scala: 37
Groovy: 10
Rust: 28
Revolution: 61
Apex: 13
Click: 55
MATLAB: 17
Adept: 21
ten: 8
dss: 2
SPARK: 86
Pure: 3
PL/SQL: 24
GAP: 24
Falcon: 2
Alpha: 3
Plot: 46
Joy: 57
CONVERT: 23
Frank: 1
Objective-C: 21
lever: 1
P: 39
JADE: 5
Blue: 3
NATURAL: 69
Inform: 35
Reason: 11
ARM: 6
SR: 7
Crystal: 12
PowerHouse: 11
Conceptual: 24
Factor: 11
False: 2
RPL: 4
SQLite: 

In [13]:
remove_list = [
    'F*', 'P*', 'A+','T','S3','video','address','USD','Plus','never','Link','GAME','lain','Fun','small','click',
    'oil','ten','Nice'
]

def remove_unwanted_languages(lang_list):
    return [lang for lang in lang_list if lang not in remove_list]

df['programming language'] = df['programming language'].apply(remove_unwanted_languages)


In [14]:
df = df[df['salary'] > 0].reset_index(drop=True)
df.to_csv('job_cleaned.csv', index=False)

In [15]:
df.head()

Unnamed: 0,job title,company name,location,salary,job type,job description,programming language
0,full stack developer,Edugate,Jakarta,7000000.0,Full-time,Software Developer with the following requirem...,"[PHP, Java, JavaScript, JS++, Node.js, Dart]"
1,full stack developer,Techtiera Services Indonesia,Bekasi,8000000.0,Full-time,"As a Full-Stack Developer, you will be tasked ...","[JavaScript, JS++]"
2,full stack junior developer,PT Smart Milenium Effisiensi,Depok,5067381.0,Full-time,Kualifikasi\nLaki-laki\nUsia Max 30 Thn\nBelum...,"[PHP, Visual Basic, JavaScript, PostgreSQL, Ty..."
3,senior full-stack developer (angular/expressjs),Worldly,Jakarta,11004710.0,Full-time,Senior Full-Stack Developer (Angular/ExpressJS...,"[JavaScript, PostgreSQL, TypeScript, JS++, Nod..."
4,front-end (react) developer,Bibit.id,Jakarta,11004710.0,Full-time,Company Description\n\nBibit.id is a platform ...,"[FOCUS, JavaScript, JS++]"
