In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer

In [2]:
def cleanSalary(txt):
    if isinstance(txt, str):
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', txt)
        if numbers:
            max_salary = int(numbers[-1].replace(',', ''))
            return max_salary
        else:
            num = re.findall(r'\d{1,3}(?:\.\d{3})*', txt)
            if num:
                max_salary = int(num[-1].replace('.', ''))
                return max_salary
        return None
    return txt

In [3]:
df = pd.DataFrame()

for i in range(1, 14):
    file_name = f'scrapping_result/jobdata{i}.csv'
    dfx = pd.read_csv(file_name) 
    dfx['Salary'] = dfx['Salary'].apply(cleanSalary) 

    num_cols = dfx.select_dtypes(include=['float64']).columns
    imp = SimpleImputer(strategy='mean')
    dfx[num_cols] = imp.fit_transform(dfx[num_cols])

    df = pd.concat([df, dfx], ignore_index=True, sort=False)  


for i in range(1, 14):
    file_name = f'scrapping_result/jobstreet{i}.csv'
    dfx = pd.read_csv(file_name) 
    dfx.drop('Date', axis=1, inplace=True)
    dfx['Salary'] = dfx['Salary'].apply(cleanSalary) 

    num_cols = dfx.select_dtypes(include=['float64']).columns
    imp = SimpleImputer(strategy='mean')
    dfx[num_cols] = imp.fit_transform(dfx[num_cols])

    df = pd.concat([df, dfx], ignore_index=True, sort=False)


df.rename(str.lower, axis='columns', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5138 entries, 0 to 5137
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job title        5137 non-null   object 
 1   company name     5127 non-null   object 
 2   location         5137 non-null   object 
 3   salary           5138 non-null   float64
 4   job type         4176 non-null   object 
 5   job description  5136 non-null   object 
dtypes: float64(1), object(5)
memory usage: 241.0+ KB


In [4]:
df.drop_duplicates(inplace=True)
df = df.dropna(subset=['job title','company name', 'job description'])
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3307 entries, 0 to 5137
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job title        3307 non-null   object 
 1   company name     3307 non-null   object 
 2   location         3307 non-null   object 
 3   salary           3307 non-null   float64
 4   job type         2866 non-null   object 
 5   job description  3307 non-null   object 
dtypes: float64(1), object(5)
memory usage: 180.9+ KB


In [5]:
def cleanTitle(txt):
    x = re.sub(r"\s*-\s*job post$", "", txt, flags=re.IGNORECASE).strip().lower()
    return x

df['job title'] = df['job title'].apply(cleanTitle)

df['job title']

0                                    full stack developer
1                                    full stack developer
2                             full stack junior developer
3         senior full-stack developer (angular/expressjs)
4                             front-end (react) developer
                              ...                        
5133                                 blockchain developer
5134                                 blockchain developer
5135                    integrator & operations (backend)
5136                                    symfony developer
5137    application support developer (for banking ind...
Name: job title, Length: 3307, dtype: object

In [6]:
language_df = pd.read_csv('nama.csv')
languages = language_df['title'].tolist()
print(languages)

['Java', 'JavaScript', 'JS', 'C', 'Python', 'Phyton', 'SQL', 'C++', 'PHP', 'Perl', 'MATLAB', 'Ruby', 'C#', 'MySQL', 'Fortran', 'R', 'Go', 'Scala', 'Swift', 'Ada', 'COBOL', 'Rust', 'PostgreSQL', 'Kotlin', 'Arduino', 'PowerShell', 'Pascal', 'Haskell', 'TypeScript', 'SAS', 'ARM', 'Lisp', 'Lua', 'Julia', 'Prolog', 'Node.js', 'Objective-C', 'Clojure', 'Scheme', 'Assembly', 'Erlang', 'Bash', 'Mathematica', 'Elixir', 'CUDA', 'Dart', 'Modula-2', 'Visual Basic', 'Tcl', 'ActionScript', 'VBA', 'BASIC', 'Delphi', 'Solidity', 'F#', 'Regular Expressions', 'OCaml', 'Eiffel', 'Crystal', 'Modula-3', 'Racket', 'Groovy', 'Common Lisp', 'Reason', 'Sage', 'Maple', 'CoffeeScript', 'APL', 'OpenCL', 'XQuery', 'GraphQL', 'SPSS', 'Scratch', 'Visual Basic .NET', 'PureBasic', 'Haxe', 'VBScript', 'Oberon', 'LabVIEW G', 'Applescript', 'Oz', 'Lasso', 'ECMAScript', 'Red', 'Simula', 'Idris', 'SQLite', 'AMPL', 'BCPL', 'PL/SQL', 'Dylan', 'REBOL', 'PureScript', 'FLUX', 'Fantom', 'Squirrel', 'EuLisp', 'Monkey', 'Limbo', '

In [7]:

def extract_languages(job_description):
    found_languages = []
    for language in languages:
        if language == "C++":
            pattern = r'\bC\+\+(?!\w)'  # Match C++ only
        elif language == "R++":
            pattern = r'\bR\+\+(?!\w)'  # Match R++ only
        elif language == "D++":
            pattern = r'\bD\+\+(?!\w)'  # Match D++ only
        elif language == "Pro*C":
            pattern = r'\bPro\*C\b'  # Match Pro*C only
        elif language == "Low*":
            pattern = r'\bLow\*\b'  # Match Low* only
        elif language == "ASP.NET":
            pattern = r'\bASP[ .]?NET\b'  # Match ASP.NET or ASP .NET
        elif language == "Visual FoxPro":
            pattern = r'\bVisual\s?Fox\s?Pro\b'  
        elif language == "Node.js":
            pattern = r'\bNode[.\s]?js\b'  # Match "Node.js", "node js", "Nodejs"
        else:
            pattern = fr'\b{language}\b' if language.isalnum() else fr'\b{language}\B'
        
        if re.search(pattern, job_description, re.IGNORECASE):
            found_languages.append(language)
    return found_languages

df['programming language'] = df['job description'].apply(extract_languages)


csharp_jobs = df[df['programming language'].apply(lambda x: 'C#' in x)]
print("\nRows with C# found:")
print(csharp_jobs)





Rows with C# found:
                                              job title  \
11                                 full stack developer   
18    technical team lead, trilogy (remote) - $60,00...   
34                                   software developer   
37               it core banking (full stack developer)   
45                        software developer / engineer   
...                                                 ...   
5103                     global software engineer (c++)   
5118                                  software engineer   
5119                                  software engineer   
5124         fullstack .net developer - leasing company   
5127                                     odoo developer   

                                company name                    location  \
11                      Dnalytics Consulting                     Jakarta   
18                                 Crossover                   Indonesia   
34                 PT ALTERA CIPTA TEKNOLO

In [15]:
csharp_jobs = df[df['programming language'].apply(lambda x: 'C#' in x)]
print(csharp_jobs)

                                              job title  \
11                                 full stack developer   
18    technical team lead, trilogy (remote) - $60,00...   
34                                   software developer   
37               it core banking (full stack developer)   
45                        software developer / engineer   
...                                                 ...   
3012                     global software engineer (c++)   
3027                                  software engineer   
3028                                  software engineer   
3033         fullstack .net developer - leasing company   
3036                                     odoo developer   

                                company name                    location  \
11                      Dnalytics Consulting                     Jakarta   
18                                 Crossover                   Indonesia   
34                 PT ALTERA CIPTA TEKNOLOGI                   

In [16]:
df = df[df['programming language'].str.len() > 0].reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2718 entries, 0 to 2717
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   job title             2718 non-null   object 
 1   company name          2718 non-null   object 
 2   location              2718 non-null   object 
 3   salary                2718 non-null   float64
 4   job type              2318 non-null   object 
 5   job description       2718 non-null   object 
 6   programming language  2718 non-null   object 
dtypes: float64(1), object(6)
memory usage: 148.8+ KB


In [17]:
def basic_combine(lang_list):
    return ['Visual Basic' if lang in ['BASIC Programming', 'BASIC'] else lang for lang in lang_list]

def js_typofix(lang_list):
    return ['JavaScript' if lang in ['JS'] else lang for lang in lang_list]

def python_typofix(lang_list):
    return ['Python' if lang in ['Phyton'] else lang for lang in lang_list]

def remove_duplicates(lang_list):
    return list(set(lang_list))

df['programming language'] = df['programming language'].apply(basic_combine)
df['programming language'] = df['programming language'].apply(js_typofix)
df['programming language'] = df['programming language'].apply(python_typofix)
df['programming language'] = df['programming language'].apply(remove_duplicates)

In [18]:
d = {}

def note_numlanguage(languages):
    for lang_list in languages:
        for lang in lang_list:  
            if lang in d:
                d[lang] += 1  
            else:
                d[lang] = 1  

note_numlanguage(df['programming language'])

print(len(d))
for key, value in d.items():
    print(f"{key}: {value}")

178
Java: 728
JavaScript: 1111
Dart: 32
Node.js: 450
PHP: 505
Visual Basic: 234
MySQL: 511
TypeScript: 211
SQL: 793
PostgreSQL: 365
Flow: 120
CHAIN: 64
Z++: 99
FOCUS: 326
Python: 675
kernel: 1
Address: 141
ASP.NET: 84
C#: 225
JS++: 181
C: 362
GraphQL: 62
Go: 211
Ada: 71
Ruby: 112
MariaDB: 24
Scheme: 6
Expect: 33
Copilot: 8
Solidity: 4
solid: 338
pipelines: 244
RAPID: 31
NPL: 1
april: 3
flagship: 11
Swift: 96
Small: 72
Kotlin: 117
Simplicity: 5
Perl: 18
C++: 118
Scratch: 25
R: 148
F: 14
Rust: 27
Groovy: 10
Scala: 36
Revolution: 61
MATLAB: 17
Apex: 11
Click: 54
Adept: 21
.QL: 26
dss: 2
SPARK: 82
Pure: 3
GAP: 23
Falcon: 2
Alpha: 3
Plot: 46
Joy: 57
CONVERT: 23
Frank: 1
P: 38
lever: 1
JADE: 5
Blue: 3
THINK C: 11
Inform: 32
NATURAL: 68
Reason: 11
ARM: 6
SR: 6
Crystal: 11
PowerHouse: 11
Conceptual: 22
Factor: 10
False: 2
RPL: 4
SQLite: 12
fe: 7
Chrome: 3
Tao: 4
Flavors: 4
Cyber: 15
Bash: 42
DEMOS: 7
Q: 4
PROMETHEUS: 29
Gist: 3
PILOT: 6
REDUCE: 28
Monkey: 1
Z: 1
abs: 1
FaCT: 5
MACRO: 2
FLUX: 4

In [19]:
remove_list = [
    'F*', 'P*', 'A+','T','S3','video','address','USD','Plus','never','Link','GAME','lain','Fun','small','click',
    'oil','ten','Nice'
]

def remove_unwanted_languages(lang_list):
    return [lang for lang in lang_list if lang not in remove_list]

df['programming language'] = df['programming language'].apply(remove_unwanted_languages)


In [20]:
df = df[df['salary'] > 0].reset_index(drop=True)
df.to_csv('job_cleaned.csv', index=False)

In [21]:
df.head()

Unnamed: 0,job title,company name,location,salary,job type,job description,programming language
0,full stack developer,Edugate,Jakarta,7000000.0,Full-time,Software Developer with the following requirem...,"[Java, JavaScript, Dart, Node.js, PHP]"
1,full stack developer,Techtiera Services Indonesia,Bekasi,8000000.0,Full-time,"As a Full-Stack Developer, you will be tasked ...",[JavaScript]
2,full stack junior developer,PT Smart Milenium Effisiensi,Depok,5067381.0,Full-time,Kualifikasi\nLaki-laki\nUsia Max 30 Thn\nBelum...,"[Visual Basic, JavaScript, MySQL, TypeScript, ..."
3,senior full-stack developer (angular/expressjs),Worldly,Jakarta,11004710.0,Full-time,Senior Full-Stack Developer (Angular/ExpressJS...,"[Flow, JavaScript, MySQL, TypeScript, Node.js,..."
4,front-end (react) developer,Bibit.id,Jakarta,11004710.0,Full-time,Company Description\n\nBibit.id is a platform ...,"[FOCUS, JavaScript]"
