In [85]:
!pip install pyspark
import pyspark
import numpy as np
import pandas as pd




In [86]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, IDF,HashingTF
from pyspark.sql.functions import regexp_replace
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, concat_ws, collect_list
from pyspark.sql.window import Window


In [87]:
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()


In [88]:
data = spark.read.csv("/content/drive/MyDrive/PKL-JTK-Eben/September/Datalearns247/datalearns_tags.csv", header=True, inferSchema=True)
data

DataFrame[nid: int, title: string, topic_name: string, tag_name: string, field_tags_target_id: int, field_topic_target_id: string]

In [89]:
data.show()

+---+--------------------+-----------------+--------+--------------------+---------------------+
|nid|               title|       topic_name|tag_name|field_tags_target_id|field_topic_target_id|
+---+--------------------+-----------------+--------+--------------------+---------------------+
| 12|Mengenal Machine ...| Machine Learning| Article|                  11|                  120|
| 63|  21 Tahun Solusi247|             NULL| Article|                  11|                 NULL|
| 69|Mengenal Feature ...| Machine Learning| Article|                  11|                  120|
| 69|Mengenal Feature ...|Feature Selection| Article|                  11|                  121|
| 71|Feature Selection...|           Pandas| Article|                  11|                  124|
| 71|Feature Selection...|           Python| Article|                  11|                   85|
| 71|Feature Selection...|Feature Selection| Article|                  11|                  121|
| 71|Feature Selection...|    

In [90]:
data = data.withColumn("topic_name", data["topic_name"].cast("string"))
grouped_data = data.groupBy("nid").agg(concat_ws(", ", collect_list("topic_name")).alias("topic_name_grouped"))
data = data.join(grouped_data, on="nid", how="inner")
window_spec = Window.partitionBy("title").orderBy("nid")
data = data.withColumn("row_number", F.row_number().over(window_spec))
data = data.filter(data["row_number"] == 1).drop("row_number")
data = data.withColumn("features", lower(concat_ws(" ", data["title"], data["topic_name_grouped"], data["tag_name"])))


In [93]:
tokenizer = Tokenizer(inputCol="features", outputCol="words")
words_data = tokenizer.transform(data)

In [94]:
# words_data.show()

In [108]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)
tf_data = hashingTF.transform(words_data)

In [109]:
idf = IDF(inputCol="rawFeatures", outputCol="tfidf_features")
idf_model = idf.fit(tf_data)
tfidf_data = idf_model.transform(tf_data)

In [110]:
tfidf_data_pandas = tfidf_data.toPandas()

In [98]:
# tfidf_data_pandas

In [111]:

tfidf_vectors = tfidf_data_pandas["tfidf_features"].to_list()
cosine_sim = cosine_similarity(tfidf_vectors, tfidf_vectors)

In [112]:
data = tfidf_data_pandas

In [113]:
data.head()

Unnamed: 0,nid,title,topic_name,tag_name,field_tags_target_id,field_topic_target_id,topic_name_grouped,features,words,rawFeatures,tfidf_features
0,191,13 Python Package Populer untuk Analisis Time ...,Python,Article,11,85.0,"Python, Timeseries, Statsmodels, Pmdarima, Pro...",13 python package populer untuk analisis time ...,"[13, python, package, populer, untuk, analisis...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.23359222..."
1,63,21 Tahun Solusi247,,Article,11,,"NULL, NULL","21 tahun solusi247 null, null article","[21, tahun, solusi247, null,, null, article]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,244,30 Tools Generative AI Tahun 2023,AI,Article,11,109.0,"AI, Generative AI","30 tools generative ai tahun 2023 ai, generati...","[30, tools, generative, ai, tahun, 2023, ai,, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,190,"Analisis Time Series, Definisi, Pola dan Algor...",Timeseries,Article,11,112.0,Timeseries,"analisis time series, definisi, pola dan algor...","[analisis, time, series,, definisi,, pola, dan...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,236,Belajar Apache Spark Dengan Python : Mengenal ...,Pyspark,Tutorial,12,84.0,"Pyspark, Python",belajar apache spark dengan python : mengenal ...,"[belajar, apache, spark, dengan, python, :, me...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [114]:
def recommend_items_by_item_id(item_id, num_recommendations=7):
    item_index = data[data['nid'] == item_id].index[0]
    item_similarity = cosine_sim[item_index]
    similar_item_indices = item_similarity.argsort()[::-1][1:num_recommendations+1]
    recommended_item_ids = data.iloc[similar_item_indices]['nid'].values
    recommended_item_titles = data.iloc[similar_item_indices]['title'].values

    return recommended_item_ids, recommended_item_titles

In [115]:
item_id = 12
recommended_item_ids, recommended_item_titles = recommend_items_by_item_id(item_id)

selected_item = data[data['nid'] == item_id]['title'].values[0]

print(f"Title for ({item_id}): {selected_item}\n")

print(f"Recommended items for item ID {item_id}:")
for item_id, item_title in zip(recommended_item_ids, recommended_item_titles):
    print(f"Item ID: {item_id}, Title: {item_title}")


Title for (12): Mengenal Machine Learning

Recommended items for item ID 12:
Item ID: 69, Title: Mengenal Feature Selection dalam Machine Learning
Item ID: 202, Title: Perjalanan Artificial Neural Network, Dari Perceptron ke Deep Learning
Item ID: 233, Title: Yuk Mengenal Apache Spark
Item ID: 236, Title: Belajar Apache Spark Dengan Python : Mengenal DataFrame
Item ID: 212, Title: HGrid247 Data Engineering
Item ID: 63, Title: 21 Tahun Solusi247
Item ID: 201, Title: Deteksi Stasioneritas Dalam Time Series


In [116]:
item_id = 235
recommended_item_ids, recommended_item_titles = recommend_items_by_item_id(item_id)

selected_item = data[data['nid'] == item_id]['title'].values[0]

print(f"Title for ({item_id}): {selected_item}\n")

print(f"Recommended items for item ID {item_id}:")
for item_id, item_title in zip(recommended_item_ids, recommended_item_titles):
    print(f"Item ID: {item_id}, Title: {item_title}")


Title for (235): Belajar PySpark - Select, Filter dan Where Pada DataFrame

Recommended items for item ID 235:
Item ID: 245, Title: Belajar PySpark - SQL pada Dataframe dengan expr()
Item ID: 237, Title: Belajar PySpark - Transformasi DataFrame dengan withColumn
Item ID: 246, Title: Belajar PySpark - GroupBy dan Agregasi
Item ID: 247, Title: Belajar PySpark - Join Dataframe
Item ID: 238, Title: Belajar PySpark - Transformasi Dataframe dengan When-Otherwise
Item ID: 236, Title: Belajar Apache Spark Dengan Python : Mengenal DataFrame
Item ID: 191, Title: 13 Python Package Populer untuk Analisis Time Series


In [117]:
item_id = 191
recommended_item_ids, recommended_item_titles = recommend_items_by_item_id(item_id)

selected_item = data[data['nid'] == item_id]['title'].values[0]

print(f"Title for ({item_id}): {selected_item}\n")

print(f"Recommended items for item ID {item_id}:")
for item_id, item_title in zip(recommended_item_ids, recommended_item_titles):
    print(f"Item ID: {item_id}, Title: {item_title}")


Title for (191): 13 Python Package Populer untuk Analisis Time Series

Recommended items for item ID 191:
Item ID: 201, Title: Deteksi Stasioneritas Dalam Time Series
Item ID: 190, Title: Analisis Time Series, Definisi, Pola dan Algoritma
Item ID: 235, Title: Belajar PySpark - Select, Filter dan Where Pada DataFrame
Item ID: 236, Title: Belajar Apache Spark Dengan Python : Mengenal DataFrame
Item ID: 17, Title: local class incompatible: stream classdesc serialVersionUID = 2, local class serialVersionUID = 3
Item ID: 263, Title: Inovasi Masa Depan: Ide Bisnis AI untuk Startup yang Menjanjikan
Item ID: 238, Title: Belajar PySpark - Transformasi Dataframe dengan When-Otherwise


In [106]:
item_id = 202
recommended_item_ids, recommended_item_titles = recommend_items_by_item_id(item_id)

selected_item = data[data['nid'] == item_id]['title'].values[0]

print(f"Title for ({item_id}): {selected_item}\n")

print(f"Recommended items for item ID {item_id}:")
for item_id, item_title in zip(recommended_item_ids, recommended_item_titles):
    print(f"Item ID: {item_id}, Title: {item_title}")


Title for (202): Perjalanan Artificial Neural Network, Dari Perceptron ke Deep Learning

Recommended items for item ID 202:
Item ID: 12, Title: Mengenal Machine Learning
Item ID: 69, Title: Mengenal Feature Selection dalam Machine Learning
Item ID: 37, Title: Instalasi Apache Hive Pada Ubuntu
Item ID: 13, Title: Instalasi Hadoop 3.2.2 pada Windows 10 WSL
Item ID: 244, Title: 30 Tools Generative AI Tahun 2023
Item ID: 212, Title: HGrid247 Data Engineering
Item ID: 233, Title: Yuk Mengenal Apache Spark


In [107]:
item_id = 212
recommended_item_ids, recommended_item_titles = recommend_items_by_item_id(item_id)

selected_item = data[data['nid'] == item_id]['title'].values[0]

print(f"Title for ({item_id}): {selected_item}\n")

print(f"Recommended items for item ID {item_id}:")
for item_id, item_title in zip(recommended_item_ids, recommended_item_titles):
    print(f"Item ID: {item_id}, Title: {item_title}")


Title for (212): HGrid247 Data Engineering

Recommended items for item ID 212:
Item ID: 221, Title: Fitur Reverse Engineering pada HGrid247 DE
Item ID: 233, Title: Yuk Mengenal Apache Spark
Item ID: 63, Title: 21 Tahun Solusi247
Item ID: 201, Title: Deteksi Stasioneritas Dalam Time Series
Item ID: 12, Title: Mengenal Machine Learning
Item ID: 190, Title: Analisis Time Series, Definisi, Pola dan Algoritma
Item ID: 71, Title: Feature Selection Menggunakan Scikit-learn
