In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  libxtst6 openjdk-8-jdk-headless openjdk-8-jre-headless
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 39.7 MB of archives.
After this operation, 144 MB of additional disk space will be used.
Selecting previously unselected package libxtst6:amd64.
(Reading database ... 123632 files and directories currently installed.)
Preparing to unpack .../libxtst6_2%3a1.2.3-1build4_amd64.deb ...
Unpacking libxtst6:amd64 (2:1.2.3-1build4) ...
Selecting previously unselected package openjdk-8-jre-headless:amd64.
Preparing to unpack .../openjdk-8-jre-headless_8u432-ga~us1-0ubuntu2~22.04_amd64.deb ...
Unpacking openjdk-8-jre-headless:amd64 (8u432-ga~us1-0ub

In [2]:
# Install the konlpy package (the pip log shows it pulls in JPype1 as a dependency).
# NOTE(review): none of the cells visible in this notebook import konlpy -- confirm it is still needed.
!pip install konlpy
# Refresh the apt package index, then install a Java 11 JDK without prompting (-y).
# NOTE(review): JAVA_HOME was set to the Java 8 path in the first cell and is not changed here.
!apt-get update
!apt-get install -y openjdk-11-jdk

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://arc

In [3]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
# Step 1: sign the current Colab user in (google.colab.auth).
auth.authenticate_user()
# Step 2: create the PyDrive2 auth object and attach the application-default
# credentials obtained through oauth2client.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Step 3: build the Drive client from the authenticated object.
# NOTE(review): `drive` is not referenced by any other cell visible here -- confirm it is used.
drive = GoogleDrive(gauth)

In [31]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import col, udf, split
from pyspark.sql.functions import when, lit
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import FloatType
import numpy as np
from pyspark.ml.linalg import DenseVector

In [10]:
# 1. Create the SparkSession (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("Job Recommendation System")
    .getOrCreate()
)

# 2. Load the job-posting data; header=True takes the column names from the first CSV row
job_postings = spark.read.csv(
    "processed_data.csv",
    header=True,
)

In [19]:
# Step 3: Keep only the columns the recommender works with
job_postings = job_postings.select("comp_name", "category", "tech_stack")

# Step 4: Turn the '|'-delimited tech_stack string into an array of technologies.
# When the split result is null, an empty array is stored instead
# (only nulls are replaced; empty strings are left as split produced them).
tech_tokens = split(col("tech_stack"), r"\|")
job_postings = job_postings.withColumn(
    "tech_stack_array",
    when(tech_tokens.isNull(), lit([])).otherwise(tech_tokens),
)

# Step 5: Fit a CountVectorizer on the token arrays and add the resulting
# term-count vectors as the `features` column
vectorizer = CountVectorizer(inputCol="tech_stack_array", outputCol="features")
vectorizer_model = vectorizer.fit(job_postings)
job_postings_vectorized = vectorizer_model.transform(job_postings)


# Step 6: Create a DataFrame for user input
# NOTE: the competence text is collected here but is not used by any scoring step in this cell.
user_competency = input("Describe your competence (within 500 characters): ")
print("---------------------------------------------------------------------")
print("---------------------------------------------------------------------")
user_input = input("Enter your technology stack, separated by commas(,): ")

# Normalize every entry to "Capitalized" form and drop blank entries: "".split(",")
# returns [""], so an empty answer or a trailing comma would otherwise add an
# empty-string token to the user's stack.
# NOTE(review): tokens are compared case-sensitively with the fitted vocabulary, so
# names such as "AWS" become "Aws" here -- confirm this matches the dataset's casing.
user_tech_stack = [
    token.strip().lower().capitalize()
    for token in user_input.split(",")
    if token.strip()
]
if not user_tech_stack:
    print("No technologies were entered; every similarity score will be 0.")

# An explicit schema is required because Spark cannot infer the element type of an
# empty list, which is now a possible value of user_tech_stack.
user_df = spark.createDataFrame(
    [(0, user_tech_stack)],
    schema="id BIGINT, tech_stack_array ARRAY<STRING>",
)
user_vectorized = vectorizer_model.transform(user_df)

# Extract the user's feature vector (the DataFrame has exactly one row)
user_features = user_vectorized.select("features").first()["features"]

# Step 7: Define a UDF to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must expose ``toArray()``; the result of that call is
    converted to a NumPy array. If either vector has zero norm the
    similarity is defined as 0.0 instead of dividing by zero.
    """
    left = np.array(v1.toArray())
    right = np.array(v2.toArray())
    inner = np.dot(left, right)
    left_norm = np.linalg.norm(left)
    right_norm = np.linalg.norm(right)
    # Guard clause: a zero-length vector has no direction to compare.
    if not (left_norm and right_norm):
        return float(0.0)
    return float(inner / (left_norm * right_norm))

def _score_against_user(posting_features):
    """Cosine similarity between one posting's feature vector and the user's vector."""
    return cosine_similarity(posting_features, user_features)


# Expose the scorer to Spark as a UDF that returns a float column
cosine_similarity_udf = udf(_score_against_user, FloatType())

# Step 8: Score every job posting against the user's feature vector
similarity_score = cosine_similarity_udf(col("features"))
job_postings_with_similarity = job_postings_vectorized.withColumn(
    "similarity", similarity_score
)

# Step 9: Rank postings by descending similarity and keep the 20 best matches
top_recommendations = (
    job_postings_with_similarity
    .orderBy(col("similarity").desc())
    .limit(20)
)

# Step 10: Show company, category and score for the selected postings
top_recommendations.select("comp_name", "category", "similarity").show()

Describe your competence (within 500 characters): 안녕하십니까
---------------------------------------------------------------------
---------------------------------------------------------------------
Enter your technology stack, separated by commas(,): python, react
+----------------+----------------------------------+----------+
|       comp_name|                          category|similarity|
+----------------+----------------------------------+----------+
|            해줌|          기술지원|정보보안 담당자|0.70710677|
|          아그모|  인공지능(AI)|임베디드 소프트웨어|       0.5|
|시프트다이나믹스|   인공지능(AI)|임베디드 소프트...|       0.5|
|    부동산플래닛|                 빅데이터 엔지니어|       0.5|
|    퍼플아카데미|                 빅데이터 엔지니어|       0.5|
|        토모큐브|                VR/AR/3D|SW/솔루션|       0.5|
|디스페이스코리아|       devops/시스템 엔지니어|S...|       0.5|
|          노르마|                 인공지능/머신러닝|       0.5|
|      베스텔라랩|                        IOS 개발자| 0.4472136|
|      베스텔라랩|                 안드로이드 개발자| 0.4472136|
|        거북스쿨|크로스플랫폼 

In [3]:
# Print the installed PySpark version banner (the output also lists the Scala and JVM versions).
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.3
      /_/
                        
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 11.0.25
Branch HEAD
Compiled by user haejoon.lee on 2024-09-09T05:20:05Z
Revision 32232e9ed33bb16b93ad58cfde8b82e0f07c0970
Url https://github.com/apache/spark
Type --help for more information.


In [33]:
# Step 1: Create the SparkSession (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("Job Recommendation System")
    .getOrCreate()
)

# Step 2: Load the job-posting data; header=True takes the column names from the first CSV row
job_postings = spark.read.csv(
    "processed_data.csv",
    header=True,
)

# Step 3: Keep only the columns the recommender works with
job_postings = job_postings.select("comp_name", "category", "tech_stack")

# Step 4: Turn the '|'-delimited tech_stack string into an array of technologies.
# When the split result is null, an empty array is stored instead.
tech_tokens = split(col("tech_stack"), r"\|")
job_postings = job_postings.withColumn(
    "tech_stack_array",
    when(tech_tokens.isNull(), lit([])).otherwise(tech_tokens),
)

# Step 5: Fit a CountVectorizer on the token arrays and add the resulting
# term-count vectors as the `features` column
vectorizer = CountVectorizer(inputCol="tech_stack_array", outputCol="features")
vectorizer_model = vectorizer.fit(job_postings)
job_postings_vectorized = vectorizer_model.transform(job_postings)

# Step 6: Main loop
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must expose ``toArray()``; the result of that call is
    converted to a NumPy array. If either vector has zero norm the
    similarity is defined as 0.0 instead of dividing by zero.
    """
    left = np.array(v1.toArray())
    right = np.array(v2.toArray())
    inner = np.dot(left, right)
    left_norm = np.linalg.norm(left)
    right_norm = np.linalg.norm(right)
    # Guard clause: a zero-length vector has no direction to compare.
    if not (left_norm and right_norm):
        return float(0.0)
    return float(inner / (left_norm * right_norm))

# Number of recommendations displayed per round.
TOP_N = 10

# Explicit schema for the one-row user DataFrame; needed because Spark cannot infer
# the element type of an empty list.
USER_SCHEMA = "id BIGINT, tech_stack_array ARRAY<STRING>"


def _build_similarity_udf(user_vector):
    """Return a Spark UDF that scores a posting's feature vector against ``user_vector``.

    The vector is bound through the closure so each round gets a UDF tied to its own
    input. A single UDF reading the global ``user_features`` is serialized on first
    use and then reused, so later rounds would be scored against the first round's
    vector.
    """
    def _score(posting_features):
        return cosine_similarity(posting_features, user_vector)

    return udf(_score, FloatType())


while True:
    # Prompt for a fresh set of inputs for this round
    print("---------------------------------------------------------------------")
    # NOTE: the competence text is collected but not used by any scoring step below.
    user_competency = input("Describe your competence (within 500 characters): ")
    print("---------------------------------------------------------------------")
    print("---------------------------------------------------------------------")

    user_input = input("Enter your technology stack, separated by commas(,): ")
    # Normalize entries and drop blanks: "".split(",") returns [""], so an empty answer
    # or a trailing comma would otherwise add an empty-string token to the stack.
    user_tech_stack = [
        token.strip().lower().capitalize()
        for token in user_input.split(",")
        if token.strip()
    ]
    if not user_tech_stack:
        print("No technologies were entered; every similarity score will be 0.")

    # Step 7: Create a DataFrame for user input and vectorize it with the fitted model
    user_df = spark.createDataFrame([(0, user_tech_stack)], schema=USER_SCHEMA)
    user_vectorized = vectorizer_model.transform(user_df)

    # Extract the user's feature vector (the DataFrame has exactly one row)
    user_features = user_vectorized.select("features").first()["features"]

    # Step 8: Score every posting with a UDF built for this round's vector.
    # job_postings_vectorized is computed once before the loop and reused here.
    cosine_similarity_udf = _build_similarity_udf(user_features)
    job_postings_with_similarity = job_postings_vectorized.withColumn(
        "similarity", cosine_similarity_udf(col("features"))
    )

    # Step 9: Sort jobs by similarity and keep the TOP_N best matches
    top_recommendations = (
        job_postings_with_similarity
        .orderBy(col("similarity").desc())
        .limit(TOP_N)
    )

    # Step 10: Display the results
    top_recommendations.select("comp_name", "category", "similarity").show()

    # Ask whether the user wants to continue or quit; any other answer also ends the loop
    next_action = input("Press 'q' to quit or 'r' to recommend again: ").strip().lower()
    if next_action == 'q':
        print("Exiting the program. Goodbye!")
        break
    elif next_action == 'r':
        print("Restarting recommendation...")
        continue
    else:
        print("Invalid input. Exiting the program.")
        break

---------------------------------------------------------------------
Describe your competence (within 500 characters): dd
---------------------------------------------------------------------
---------------------------------------------------------------------
Enter your technology stack, separated by commas(,): python
+----------------+--------------------------------+----------+
|       comp_name|                        category|similarity|
+----------------+--------------------------------+----------+
|            해줌|        기술지원|정보보안 담당자|       1.0|
|    부동산플래닛|               빅데이터 엔지니어|0.70710677|
|          아그모|인공지능(AI)|임베디드 소프트웨어|0.70710677|
|시프트다이나믹스| 인공지능(AI)|임베디드 소프트...|0.70710677|
|        토모큐브|              VR/AR/3D|SW/솔루션|0.70710677|
|    퍼플아카데미|               빅데이터 엔지니어|0.70710677|
|디스페이스코리아|     devops/시스템 엔지니어|S...|0.70710677|
|          노르마|               인공지능/머신러닝|0.70710677|
|      라온데이터|               머신러닝 엔지니어|0.57735026|
|           코드v|                     서버/백엔드

KeyboardInterrupt: Interrupted by user