# **Data Cleaning**

In [116]:
import pandas as pd

In [117]:
dataset = pd.read_csv('dataset.csv')
dataset

Unnamed: 0,id,name,description,skills
0,1,Cloud Engineer Intern,- Must be undegraduate,
1,2,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,"['Komunikasi', 'Kreatif', 'Pemahaman Industri'..."
2,3,Software Architect Intern,<p>To help architect team on demonstration of ...,"['Komunikasi', 'Kreatif', 'Desain Grafis', 'Co..."


## **Combine Data**

In [118]:
import ast

In [119]:
dataset['skills'] = dataset['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else '')

dataset['skills'] = dataset['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

dataset['combined_text'] = dataset['name'] + ' ' + dataset['skills'].astype(str) + ' ' + dataset['description']

dataset['combined_text']

0        Cloud Engineer Intern  - Must be undegraduate
1    Website Developer Komunikasi, Kreatif, Pemaham...
2    Software Architect Intern Komunikasi, Kreatif,...
Name: combined_text, dtype: object

In [120]:
print(dataset['combined_text'][0])

Cloud Engineer Intern  - Must be undegraduate


## **Case Folding**

In [121]:
dataset['result_case_folding_data'] = dataset['combined_text'].apply(lambda x: x.casefold())
dataset[['id', 'combined_text','result_case_folding_data']]

Unnamed: 0,id,combined_text,result_case_folding_data
0,1,Cloud Engineer Intern - Must be undegraduate,cloud engineer intern - must be undegraduate
1,2,"Website Developer Komunikasi, Kreatif, Pemaham...","website developer komunikasi, kreatif, pemaham..."
2,3,"Software Architect Intern Komunikasi, Kreatif,...","software architect intern komunikasi, kreatif,..."


In [123]:
print(dataset['result_case_folding_data'][0])

cloud engineer intern  - must be undegraduate


## **Remove Html Tags**

In [124]:
from lxml import etree

In [125]:
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return etree.tostring(tree, encoding='unicode', method='text')

dataset['result_remove_html_tags'] = dataset['result_case_folding_data'].apply(remove_html_tags)
dataset[['id', 'result_case_folding_data' ,'result_remove_html_tags']]

Unnamed: 0,id,result_case_folding_data,result_remove_html_tags
0,1,cloud engineer intern - must be undegraduate,cloud engineer intern - must be undegraduate
1,2,"website developer komunikasi, kreatif, pemaham...","website developer komunikasi, kreatif, pemaham..."
2,3,"software architect intern komunikasi, kreatif,...","software architect intern komunikasi, kreatif,..."


In [132]:
dataset['result_remove_html_tags'][1] = """website developer ['komunikasi', 'kreatif', 'pemahaman industri', 'manajemen proyek', 'kreativitas', 'kerjasama tim', 'keahlian teknologi', 'pemasaran produk'] kualifikasi : wajib memiliki laptop sendiri. mampu membuat website dan landing page sesuai dengan kebutuhan perusahaan mampu  optimasi seo masa magang minimum 3 bulan dengan jam kerja normal harus menyertakan portofolio benefit : sertifikat magang konversi nilai sistem wfh"""

dataset['result_remove_html_tags'][2] = """software architect intern ['komunikasi', 'kreatif', 'desain grafis', 'coding'] to help architect team on demonstration of an improvement idea in which work is focused on determining whether an idea can be turned into a reality. the improvement might include:- test optimization- test automation- framework usage e.g test framework or design"""

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataset['result_remove_html_tags'][1] = """website developer ['komunikasi', 'kreatif', 'pemahaman industri', 'manajemen proyek', 'kreativitas', 'kerjasama tim', 'keahlian teknologi', 'pemasaran produk'] kualifikasi : wajib memiliki laptop sendiri. mampu membuat website d

In [134]:
print(dataset['result_remove_html_tags'][0])

cloud engineer intern  - must be undegraduate


## **Remove Non-Alphanumeric Character**

In [135]:
import re

In [136]:
dataset['result_remove_non_alphanumeric_character'] = dataset['result_remove_html_tags'].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
dataset[['id', 'result_remove_html_tags', 'result_remove_non_alphanumeric_character']]

Unnamed: 0,id,result_remove_html_tags,result_remove_non_alphanumeric_character
0,1,cloud engineer intern - must be undegraduate,cloud engineer intern must be undegraduate
1,2,"website developer ['komunikasi', 'kreatif', 'p...",website developer komunikasi kreatif pemahaman...
2,3,"software architect intern ['komunikasi', 'krea...",software architect intern komunikasi kreatif d...


In [141]:
print(dataset['result_remove_non_alphanumeric_character'][0])

cloud engineer intern   must be undegraduate


## **Tokenization**

In [142]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [143]:
dataset['result_tokenization'] = dataset['result_remove_non_alphanumeric_character'].apply(lambda x: word_tokenize(x))
dataset[['id', 'result_remove_non_alphanumeric_character', 'result_tokenization']]

Unnamed: 0,id,result_remove_non_alphanumeric_character,result_tokenization
0,1,cloud engineer intern must be undegraduate,"[cloud, engineer, intern, must, be, undegraduate]"
1,2,website developer komunikasi kreatif pemahaman...,"[website, developer, komunikasi, kreatif, pema..."
2,3,software architect intern komunikasi kreatif d...,"[software, architect, intern, komunikasi, krea..."


In [147]:
print(dataset['result_tokenization'][0])

['cloud', 'engineer', 'intern', 'must', 'be', 'undegraduate']


## **Remove Stopwords**

In [148]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [149]:
english_stopwords = stopwords.words('english')

stopword_factory = StopWordRemoverFactory()
list_stopwords = ArrayDictionary(stopword_factory.get_stop_words() + english_stopwords)


stopword = StopWordRemover(list_stopwords)

In [150]:
def remove_stopwords(text):
    filtered_words = [stopword.remove(word) for word in text]
    return filtered_words

# Apply remove_stopwords to the 'result_tokenization' column
dataset['result_remove_stopwords'] = dataset['result_tokenization'].apply(remove_stopwords)

dataset['result_remove_stopwords'] = dataset['result_remove_stopwords'].apply(lambda x: ' '.join(x))

dataset[['id', 'result_tokenization', 'result_remove_stopwords']]

Unnamed: 0,id,result_tokenization,result_remove_stopwords
0,1,"[cloud, engineer, intern, must, be, undegraduate]",cloud engineer intern must undegraduate
1,2,"[website, developer, komunikasi, kreatif, pema...",website developer komunikasi kreatif pemahaman...
2,3,"[software, architect, intern, komunikasi, krea...",software architect intern komunikasi kreatif d...


In [154]:
print(dataset['result_remove_stopwords'][0])

cloud engineer intern must  undegraduate


## **Stemming**

In [155]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [156]:
stemmer = StemmerFactory().create_stemmer()

dataset['result_stemming'] = dataset['result_remove_stopwords'].apply(lambda x: stemmer.stem(x))
dataset[['id', 'result_remove_stopwords', 'result_stemming']]

Unnamed: 0,id,result_remove_stopwords,result_stemming
0,1,cloud engineer intern must undegraduate,cloud engineer intern must undegraduate
1,2,website developer komunikasi kreatif pemahaman...,website developer komunikasi kreatif paham ind...
2,3,software architect intern komunikasi kreatif d...,software architect intern komunikasi kreatif d...


In [159]:
print(dataset['result_stemming'][1])

website developer komunikasi kreatif paham industri manajemen proyek kreativitas kerjasama tim ahli teknologi pasar produk kualifikasi wajib milik laptop sendiri mampu buat website landing page sesuai butuh usaha mampu optimasi seo masa magang minimum 3 bulan jam kerja normal serta portofolio benefit sertifikat magang konversi nilai sistem wfh


## **Vectorizer**

In [163]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [164]:
vectors = TfidfVectorizer().fit_transform(dataset['result_stemming'])
vectors.toarray()

vectors

<3x74 sparse matrix of type '<class 'numpy.float64'>'
	with 77 stored elements in Compressed Sparse Row format>

# **Modeling**

In [165]:
from sklearn.metrics.pairwise import cosine_similarity

In [168]:
similarity = cosine_similarity(vectors)
similarity[1][0:]

array([0.        , 1.        , 0.02343044])

In [169]:
def recommend_by_content_based_filtering(item):
  item_index = dataset[dataset['id']==item].index[0]
  distances = similarity[item_index]
  item_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[0:]

  recommended_items = []

  for i in item_list:
    recommended_items.append([dataset.iloc[i[0]].id]+[i[1]])

  return recommended_items

In [171]:
recommend_by_content_based_filtering(2)

[[2, 1.0000000000000002], [3, 0.023430435514537336], [1, 0.0]]

: 