**DATA SAMPLING**

In [None]:
from pyspark.sql import SparkSession

def combine_and_sample_correctly_with_multiple_columns(job_desc_file, resumes_file, output_file, sample_size=5700):
    """Cross-join job descriptions with resumes and save a random sample as CSV.

    Args:
        job_desc_file: Path of the job-description CSV (read with a header row
            and schema inference).
        resumes_file: Path of the resume CSV (read with a header row and
            multi-line quoted fields enabled).
        output_file: Destination path of the sampled CSV; existing output is
            overwritten.
        sample_size: Target number of sampled rows. ``DataFrame.sample`` keeps
            each row with probability ``sample_size / total_records``, so the
            number of rows written is close to, but not guaranteed to equal,
            ``sample_size``.

    Raises:
        ValueError: If either input dataset contains no rows.
    """
    # Initialize Spark session
    spark = SparkSession.builder \
        .appName("Combine and Sample Datasets with Multiple Columns") \
        .config("spark.executor.memory", "4g") \
        .config("spark.driver.memory", "4g") \
        .config("spark.sql.csv.multiLine", "true") \
        .config("spark.sql.csv.escape", '"') \
        .getOrCreate()

    print("Loading datasets...")

    # Load the job descriptions dataset
    job_desc_df = spark.read.csv(job_desc_file, header=True, inferSchema=True)

    # Load the resumes dataset with multiline support
    resumes_df = spark.read.option("multiLine", "true") \
                           .option("quote", '"') \
                           .option("escape", '"') \
                           .option("header", "true") \
                           .csv(resumes_file)

    # Count each input exactly once: every count() is a separate Spark job
    # that re-reads the file.
    job_desc_count = job_desc_df.count()
    resume_count = resumes_df.count()

    # Verify data
    if job_desc_count == 0 or resume_count == 0:
        raise ValueError("One of the input datasets is empty. Please check the input files.")

    print(f"Job Description Rows: {job_desc_count}, Resume Rows: {resume_count}")

    # Perform Cartesian product
    print("Performing Cartesian product...")
    combined_df = job_desc_df.crossJoin(resumes_df)

    # The size of a Cartesian product is the product of the input sizes, so
    # there is no need to materialise the whole join just to count it.
    total_records = job_desc_count * resume_count
    print(f"Total records after Cartesian product: {total_records}")

    # Sample the specified number of records
    if sample_size > total_records:
        print(f"Warning: Sample size {sample_size} exceeds total records {total_records}. Taking all records.")
        sampled_df = combined_df
    else:
        # Fixed seed keeps the sample reproducible between runs
        sampled_df = combined_df.sample(withReplacement=False, fraction=sample_size / total_records, seed=1)

    print(f"Saving sampled dataset to: {output_file}")
    # Save the sampled dataset to a CSV file
    sampled_df.write.option("quote", '"') \
                    .option("escape", '"') \
                    .option("multiLine", "true") \
                    .csv(output_file, header=True, mode="overwrite")
    print(f"Sampled dataset saved successfully to: {output_file}")

# Input datasets and the destination of the sampled output
job_desc_file, resumes_file, output_file = (
    "/FileStore/tables/jd.csv",
    "/FileStore/tables/UpdatedResumeDataSet.csv",
    "/FileStore/tables/sampled_datasets.csv",
)

# Target number of rows to keep from the cross join
sample_size = 5700

combine_and_sample_correctly_with_multiple_columns(
    job_desc_file=job_desc_file,
    resumes_file=resumes_file,
    output_file=output_file,
    sample_size=sample_size,
)

Loading datasets...
Job Description Rows: 5001, Resume Rows: 962
Performing Cartesian product...
Total records after Cartesian product: 4810962
Saving sampled dataset to: /FileStore/tables/sampled_datasets.csv
Sampled dataset saved successfully to: /FileStore/tables/sampled_datasets.csv


**DATA CLEANING**

In [None]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by the cleaning stage
spark = SparkSession.builder.appName("Dataset_Inspection_and_Cleaning").getOrCreate()

# Location of the sampled dataset
file_path = "/FileStore/tables/sampled_datasets.csv"

# CSV reader settings, kept together so they are easy to review
csv_read_options = {
    "header": True,
    "inferSchema": True,
    "multiLine": True,    # cells may span several lines
    "escape": '"',        # quotes inside a value are escaped with a quote
    "quote": '"',         # values are wrapped in double quotes
    "encoding": "UTF-8",  # text encoding of the file
}
data = spark.read.csv(file_path, **csv_read_options)

# Confirm the columns were parsed as expected
data.printSchema()

# Preview the first rows without truncating long cells
data.show(n=20, truncate=False)

root
 |-- JD_Experience: string (nullable = true)
 |-- JD_Qualifications: string (nullable = true)
 |-- JD_Preference: string (nullable = true)
 |-- JD_Job Title: string (nullable = true)
 |-- JD_Role: string (nullable = true)
 |-- JD_Job Description: string (nullable = true)
 |-- JD_skills: string (nullable = true)
 |-- JD_Responsibilities: string (nullable = true)
 |-- Resume_Category: string (nullable = true)
 |-- Resume_information: string (nullable = true)

+-------------+-----------------+-------------+---------------------+-------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Rows require a Spark count(); columns come straight from the schema
num_rows, num_cols = data.count(), len(data.columns)

# Report the shape as (rows, columns)
print(f"Shape of the DataFrame: ({num_rows}, {num_cols})")

Shape of the DataFrame: (5690, 10)


In [None]:
from pyspark.sql.functions import col, regexp_extract, when
from pyspark.sql.types import IntegerType

# First run of digits in JD_Experience, cast to an integer.
# regexp_extract yields "" when nothing matches, which the cast turns into null.
first_number = regexp_extract(col("JD_Experience"), r"(\d+)", 1).cast(IntegerType())

# Missing values become 0, then the raw text column is dropped
data = (
    data
    .withColumn("JD_Minimum_Experience", when(first_number.isNull(), 0).otherwise(first_number))
    .drop("JD_Experience")
)

# Preview the derived column
data.select("JD_Minimum_Experience").show(5)

+---------------------+
|JD_Minimum_Experience|
+---------------------+
|                    4|
|                    1|
|                    0|
|                    2|
|                    5|
+---------------------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col, sum

# Build one aggregate per column: the null flag is cast to 0/1 and summed
null_count_exprs = []
for column in data.columns:
    null_flag = col(column).isNull().cast("int")
    null_count_exprs.append(sum(null_flag).alias(column))

null_counts = data.select(*null_count_exprs)

# One row holding the null count of every column
null_counts.show()

+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+
|JD_Qualifications|JD_Preference|JD_Job Title|JD_Role|JD_Job Description|JD_skills|JD_Responsibilities|Resume_Category|Resume_information|JD_Minimum_Experience|
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+
|                0|            0|           0|      0|                 0|        0|                  0|              0|                 0|                    0|
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+



In [None]:
from pyspark.sql.functions import col, sum, when

# Build one aggregate per column: 1 for an empty string, 0 otherwise, summed
empty_count_exprs = []
for column in data.columns:
    is_empty_flag = when(col(column) == "", 1).otherwise(0)
    empty_count_exprs.append(sum(is_empty_flag).alias(column))

empty_string_counts = data.select(*empty_count_exprs)

# One row holding the empty-string count of every column
empty_string_counts.show()

+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+
|JD_Qualifications|JD_Preference|JD_Job Title|JD_Role|JD_Job Description|JD_skills|JD_Responsibilities|Resume_Category|Resume_information|JD_Minimum_Experience|
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+
|                0|            0|           0|      0|                 0|        0|                  0|              0|                 0|                    0|
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+



In [None]:
from pyspark.sql.functions import col, regexp_replace, trim, lower, when

# Text columns that receive the normalisation below
columns_to_clean = [
    "JD_Qualifications", "JD_Preference", "JD_Job Title", "JD_Role",
    "JD_Job Description", "JD_skills", "JD_Responsibilities",
    "Resume_Category", "Resume_information"
]

for column in columns_to_clean:
    # Nulls become empty strings so the later steps always see text
    non_null_text = when(col(column).isNull(), "").otherwise(col(column))
    # Drop URLs
    without_urls = regexp_replace(non_null_text, r"https?://\S+|www\.\S+", "")
    # Collapse every whitespace run into a single space
    single_spaced = regexp_replace(without_urls, r"\s+", " ")
    # Trim the ends and lowercase
    data = data.withColumn(column, lower(trim(single_spaced)))

# Show the cleaned DataFrame
data.select(columns_to_clean).show(truncate=False)

+-----------------+-------------+---------------------+-------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# %pip installs into the environment of the running kernel; !pip may target a different Python
%pip install spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Using cached spacy-3.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.10-py3-none-any.whl (17 kB)
Collecting typer<1.0.0,>=0.3.0
  Using cached typer-0.13.1-py3-none-any.whl (44 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (124 kB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (208 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (157 kB)
Collecting weasel<0.5.0,>=0.1.0
  Using cached weasel-0.4.1-py3-none-any.whl (50 kB)
Collecting tqdm<5.0.0,>=4.38.0
  Using ca

In [None]:
from pyspark.sql.functions import regexp_replace, col

# Only string columns can contain the misspelling. regexp_replace always
# returns a string, so applying it to a numeric column such as
# JD_Minimum_Experience would silently change that column's type to string.
columns_to_correct = [name for name, dtype in data.dtypes if dtype == "string"]

# Correct the spelling "exprience" to "experience" in every string column
for column in columns_to_correct:
    data = data.withColumn(column, regexp_replace(col(column), r'\bexprience\b', 'experience'))

# Show the updated DataFrame
data.show(truncate=False)

+-----------------+-------------+---------------------+-------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Collect the distinct values of JD_Qualifications (empty list when the column is absent)
unique_qualifications = []
if "JD_Qualifications" in data.columns:
    distinct_rows = data.select("JD_Qualifications").distinct().collect()
    unique_qualifications = [row["JD_Qualifications"] for row in distinct_rows]

# Report the values found
print("Unique Qualifications in JD_Qualifications column:")
print(unique_qualifications)

Unique Qualifications in JD_Qualifications column:
['bca', 'phd', 'mca', 'ba', 'b.tech', 'm.tech', 'b.com', 'bba', 'm.com', 'mba']


In [None]:
import re
from pyspark.sql.functions import udf, col, regexp_replace, lower, trim
from pyspark.sql.types import StringType

# Define the set of technical terms or keywords to preserve as-is
preserve_keywords = {"angular.js", "aws", "azure", "c#", "c++", "css", "django", "docker", "flask", "html",
    "java", "javascript", "kubernetes", "ms excel", "ms office", "ms power point",
    "node.js", "oracle", "pytorch", "python", "r", "react.js", "ruby", "sql",
    "tensorflow", "ui", "ux", "ux/ui", "bca", "phd", "mca", "ba", "mcom", "bcom", "bba", "mba", "btech", "mtech"}

# Define a list of entity keywords to exclude
exclude_keywords = {"organization", "location", "date", "time", "person", "event"}

# Function to clean text while preserving and excluding certain keywords
def clean_and_preserve_keywords(text):
    """Lowercase and de-punctuate ``text`` while keeping technical keywords intact.

    The text is lowercased, runs of non-ASCII characters are replaced by a
    space, and the result is split on whitespace. For every token:

    * punctuation at the start or end of the token is ignored for the keyword
      lookups, so "node.js," and "(c++)" are recognised as "node.js" and "c++";
    * a token found in ``preserve_keywords`` is kept exactly as the keyword;
    * a token found in ``exclude_keywords`` is dropped;
    * any other token has every character except word characters, ``+`` and
      ``#`` removed, and is kept if anything is left.

    Args:
        text: Input string, or ``None``.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""

    # Lowercase the text
    text = text.lower()

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    cleaned_words = []
    for word in text.split():
        # Strip surrounding punctuation before the lookups; '+' and '#' are
        # kept because they are part of keywords such as "c++" and "c#".
        core = re.sub(r'^[^\w+#]+|[^\w+#]+$', '', word)

        if core in preserve_keywords:  # Preserve keywords exactly
            cleaned_words.append(core)
        elif core in exclude_keywords:  # Exclude unwanted keywords
            continue
        else:
            # Remove punctuation and keep alphanumeric characters
            cleaned_word = re.sub(r'[^\w\s+#]', '', word)
            if cleaned_word:  # Add the word if it is not empty
                cleaned_words.append(cleaned_word)

    # Join cleaned words back into a single string
    return " ".join(cleaned_words).strip()

# Register the cleaning function as a UDF
clean_text_udf = udf(clean_and_preserve_keywords, StringType())

# Work only on source columns. Columns already named Cleaned_* come from an
# earlier run of this cell; cleaning them again would add Cleaned_Cleaned_*
# columns every time the cell is re-executed.
source_columns = [name for name in data.columns if not name.startswith("Cleaned_")]

# Step 1: Strip HTML tags, then every character that is neither a word
# character nor whitespace, from the source columns (in place).
# NOTE(review): this removes the punctuation of terms such as "c++" or
# "node.js" before the UDF runs, so those preserve_keywords entries can no
# longer match - confirm whether that is intended.
for column in source_columns:
    without_tags = regexp_replace(col(column), r"<[^>]+>", "")
    data = data.withColumn(column, regexp_replace(without_tags, r"[^\w\s]", ""))

# Step 2: Apply the UDF-based cleaning and store the result in Cleaned_<column>
# Exclude 'JD_Experience' from additional cleaning if required
columns_to_clean = [col_name for col_name in source_columns if col_name != 'JD_Experience']

for column in columns_to_clean:
    data = data.withColumn(f"Cleaned_{column}", clean_text_udf(col(column)))

# Show the cleaned columns
data.select([col(f"Cleaned_{column}") for column in columns_to_clean]).show(truncate=False)


+-------------------------+---------------------+---------------------+-------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Standardize column names: spaces become underscores; parentheses, braces,
# '=', ';', newlines and tabs are removed.
column_name_fixes = str.maketrans({
    " ": "_",
    "(": None,
    ")": None,
    "\n": None,
    "\t": None,
    "=": None,
    "{": None,
    "}": None,
    ";": None,
})
renamed_columns = [col(column).alias(column.translate(column_name_fixes)) for column in data.columns]
data = data.select(renamed_columns)

# Verify updated column names
print("Updated column names:")
data.printSchema()


Updated column names:
root
 |-- JD_Qualifications: string (nullable = true)
 |-- JD_Preference: string (nullable = true)
 |-- JD_Job_Title: string (nullable = true)
 |-- JD_Role: string (nullable = true)
 |-- JD_Job_Description: string (nullable = true)
 |-- JD_skills: string (nullable = true)
 |-- JD_Responsibilities: string (nullable = true)
 |-- Resume_Category: string (nullable = true)
 |-- Resume_information: string (nullable = true)
 |-- JD_Minimum_Experience: string (nullable = true)
 |-- Cleaned_JD_Qualifications: string (nullable = true)
 |-- Cleaned_JD_Preference: string (nullable = true)
 |-- Cleaned_JD_Job_Title: string (nullable = true)
 |-- Cleaned_JD_Role: string (nullable = true)
 |-- Cleaned_JD_Job_Description: string (nullable = true)
 |-- Cleaned_JD_skills: string (nullable = true)
 |-- Cleaned_JD_Responsibilities: string (nullable = true)
 |-- Cleaned_Resume_Category: string (nullable = true)
 |-- Cleaned_Resume_information: string (nullable = true)
 |-- Cleaned_JD_

**Feature Engineering**

In [None]:
# Display the names of all columns currently in `data` (shown as the cell output)
data.columns

['JD_Qualifications',
 'JD_Preference',
 'JD_Job_Title',
 'JD_Role',
 'JD_Job_Description',
 'JD_skills',
 'JD_Responsibilities',
 'Resume_Category',
 'Resume_information',
 'JD_Minimum_Experience',
 'Cleaned_JD_Qualifications',
 'Cleaned_JD_Preference',
 'Cleaned_JD_Job_Title',
 'Cleaned_JD_Role',
 'Cleaned_JD_Job_Description',
 'Cleaned_JD_skills',
 'Cleaned_JD_Responsibilities',
 'Cleaned_Resume_Category',
 'Cleaned_Resume_information',
 'Cleaned_JD_Minimum_Experience',
 'Resume_Score',
 'Cleaned_Cleaned_JD_Qualifications',
 'Cleaned_Cleaned_JD_Preference',
 'Cleaned_Cleaned_JD_Job_Title',
 'Cleaned_Cleaned_JD_Role',
 'Cleaned_Cleaned_JD_Job_Description',
 'Cleaned_Cleaned_JD_skills',
 'Cleaned_Cleaned_JD_Responsibilities',
 'Cleaned_Cleaned_Resume_Category',
 'Cleaned_Cleaned_Resume_information',
 'Cleaned_Cleaned_JD_Minimum_Experience',
 'Cleaned_Resume_Score']

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

# Read the cleaned data back from its saved table
data = spark.sql("SELECT * FROM cleaned_data_table")

# Cleaned text columns that get a matching <column>_tokens column
columns_to_tokenize = [
    "Cleaned_JD_Qualifications", "Cleaned_JD_Preference",
    "Cleaned_JD_Job_Title", "Cleaned_JD_Role",
    "Cleaned_JD_Job_Description", "Cleaned_JD_skills",
    "Cleaned_JD_Responsibilities",
    "Cleaned_Resume_Category", "Cleaned_Resume_information",
]

# Add one token-array column per text column
for column in columns_to_tokenize:
    tokens_column = f"{column}_tokens"
    tokenizer = Tokenizer(inputCol=column, outputCol=tokens_column)
    data = tokenizer.transform(data)

# Show every column whose name contains "tokens"
token_column_names = [name for name in data.columns if "tokens" in name]
data.select(token_column_names).show(truncate=False)


+--------------------------------+----------------------------+---------------------------+-----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+-----------------------------------------

In [None]:
# Raw (uncleaned) columns that are removed from the DataFrame
columns_to_drop = [
    "JD_Qualifications", "JD_Preference", "JD_Job_Title", "JD_Role",
    "JD_Job_Description", "JD_skills", "JD_Responsibilities",
    "Resume_Category", "Resume_information",
    "JD_Minimum_Experience",
]

# drop() takes the names as separate arguments
data = data.drop(*columns_to_drop)

# Report what is left
remaining_columns = data.columns
print("Remaining columns:", remaining_columns)


Remaining columns: ['Cleaned_JD_Qualifications', 'Cleaned_JD_Preference', 'Cleaned_JD_Job_Title', 'Cleaned_JD_Role', 'Cleaned_JD_Job_Description', 'Cleaned_JD_skills', 'Cleaned_JD_Responsibilities', 'Cleaned_Resume_Category', 'Cleaned_Resume_information', 'Cleaned_JD_Minimum_Experience', 'Cleaned_JD_Qualifications_tokens', 'Cleaned_JD_Preference_tokens', 'Cleaned_JD_Job_Title_tokens', 'Cleaned_JD_Role_tokens', 'Cleaned_JD_Job_Description_tokens', 'Cleaned_JD_skills_tokens', 'Cleaned_JD_Responsibilities_tokens', 'Cleaned_Resume_Category_tokens', 'Cleaned_Resume_information_tokens']


In [None]:
# Explicit %pip magic: does not rely on automagic and installs into the kernel's environment
%pip install transformers



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import tensorflow_hub as hub
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType
import transformers
from transformers import AutoTokenizer, BertModel
import torch
import numpy as np


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, size, array_join

# length() is the SQL string-length function; Column objects have no
# .length() method, so calling one raises a TypeError.
from pyspark.sql.functions import length

# Step 1: Initialize SparkSession
spark = SparkSession.builder \
    .appName("Resolve Array Column Issue") \
    .getOrCreate()

# Step 2: Load DataFrame (replace with your data source)
df = spark.read.csv("/content/cleaned_dataset.csv", header=True, inferSchema=True)

# Step 3: Inspect Schema and Columns
print("Schema of the DataFrame:")
df.printSchema()

print("Columns in the DataFrame:")
print(df.columns)

# Step 4: Rename Columns with Spaces (if needed)
df = df.select([col(c).alias(c.replace(" ", "_")) for c in df.columns])
print("Updated Columns After Renaming:")
print(df.columns)

# Step 5: Verify the Target Column
target_column = "Cleaned_JD_Job_Description_tokens"
if target_column not in df.columns:
    print(f"Column '{target_column}' not found in DataFrame.")
    print("Available columns:")
    print(df.columns)
else:
    print(f"Column '{target_column}' found. Proceeding with transformations.")

    # Step 6: Handle the ARRAY<STRING> Column
    # NOTE(review): the data is loaded from CSV, which presumably yields a
    # string rather than an array column here; size()/array_join() need an
    # array - confirm the intended data source.

    # Add a column with the size of the array (number of tokens)
    df = df.withColumn("Cleaned_JD_Job_Description_tokens_length", size(col(target_column)))

    # Add a column by joining the array elements into a single string
    df = df.withColumn(
        "Cleaned_JD_Job_Description_tokens_combined",
        array_join(col(target_column), " ")
    )

    # Calculate the length of the combined string
    df = df.withColumn(
        "Cleaned_JD_Job_Description_tokens_combined_length",
        length(col("Cleaned_JD_Job_Description_tokens_combined").cast("string"))
    )

# Step 8: Further Processing (Optional)
def compute_bert_embedding_from_tokens(df, column_name):
    """Placeholder for a BERT embedding step.

    No embedding is computed yet: the added ``<column_name>_bert_embedding``
    column holds the result of ``length()`` applied to ``column_name``.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column to process.

    Returns:
        ``df`` with the extra ``<column_name>_bert_embedding`` column.
    """
    from pyspark.sql.functions import length

    output_column = f"{column_name}_bert_embedding"
    placeholder_value = length(col(column_name))
    return df.withColumn(output_column, placeholder_value)

# Column holding the space-joined tokens; processed only when it exists
tokens_column = "Cleaned_JD_Job_Description_tokens_combined"
if tokens_column not in df.columns:
    print(f"Tokens column '{tokens_column}' not found in DataFrame.")
else:
    df = compute_bert_embedding_from_tokens(df, tokens_column)
    print(f"Computed BERT embeddings for '{tokens_column}' successfully.")


Schema of the DataFrame:
root
 |-- JD_Qualifications: string (nullable = true)
 |-- JD_Preference: string (nullable = true)
 |-- JD_Job Title: string (nullable = true)
 |-- JD_Role: string (nullable = true)
 |-- JD_Job Description: string (nullable = true)
 |-- JD_skills: string (nullable = true)
 |-- JD_Responsibilities: string (nullable = true)
 |-- Resume_Category: string (nullable = true)
 |-- Resume_information: string (nullable = true)
 |-- JD_Minimum_Experience: integer (nullable = true)
 |-- Cleaned_JD_Qualifications: string (nullable = true)
 |-- Cleaned_JD_Preference: string (nullable = true)
 |-- Cleaned_JD_Job Title: string (nullable = true)
 |-- Cleaned_JD_Role: string (nullable = true)
 |-- Cleaned_JD_Job Description: string (nullable = true)
 |-- Cleaned_JD_skills: string (nullable = true)
 |-- Cleaned_JD_Responsibilities: string (nullable = true)
 |-- Cleaned_Resume_Category: string (nullable = true)
 |-- Cleaned_Resume_information: string (nullable = true)
 |-- Cleaned

In [None]:
# Install required libraries (%pip installs into the environment of the running kernel)
%pip install tensorflow pyspark mlflow

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.keras

In [None]:
# Inspect Data and Define Features and Target
row_count = df.count()
column_names = df.columns

print("Dataset Row Count:", row_count)
print("Dataset Column Count:", len(column_names))
print("Columns in the DataFrame:", column_names)

# Preview five rows without truncating long cells
df.show(n=5, truncate=False)

# Column names and types
df.printSchema()


Dataset Row Count: 5690
Dataset Column Count: 21
Columns in the DataFrame: ['JD_Qualifications', 'JD_Preference', 'JD_Job_Title', 'JD_Role', 'JD_Job_Description', 'JD_skills', 'JD_Responsibilities', 'Resume_Category', 'Resume_information', 'JD_Minimum_Experience', 'Cleaned_JD_Qualifications', 'Cleaned_JD_Preference', 'Cleaned_JD_Job_Title', 'Cleaned_JD_Role', 'Cleaned_JD_Job_Description', 'Cleaned_JD_skills', 'Cleaned_JD_Responsibilities', 'Cleaned_Resume_Category', 'Cleaned_Resume_information', 'Cleaned_JD_Minimum_Experience', 'Resume_Score']
+-----------------+-------------+---------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.sql.functions import col, when, count

# A value counts as missing when it is null or an empty string.
# count() ignores nulls, and when() yields null for rows that do not match,
# so each aggregate counts only the matching rows.
missing_value_exprs = []
for c in df.columns:
    is_missing = col(c).isNull() | (col(c) == "")
    missing_value_exprs.append(count(when(is_missing, c)).alias(c))

missing_values = df.select(missing_value_exprs)

# One row holding the missing-value count of every column
print("Missing Values in Each Column:")
missing_values.show()


Missing Values in Each Column:
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+---------------------+-------------------------+---------------------+--------------------+---------------+--------------------------+-----------------+---------------------------+-----------------------+--------------------------+-----------------------------+------------+
|JD_Qualifications|JD_Preference|JD_Job_Title|JD_Role|JD_Job_Description|JD_skills|JD_Responsibilities|Resume_Category|Resume_information|JD_Minimum_Experience|Cleaned_JD_Qualifications|Cleaned_JD_Preference|Cleaned_JD_Job_Title|Cleaned_JD_Role|Cleaned_JD_Job_Description|Cleaned_JD_skills|Cleaned_JD_Responsibilities|Cleaned_Resume_Category|Cleaned_Resume_information|Cleaned_JD_Minimum_Experience|Resume_Score|
+-----------------+-------------+------------+-------+------------------+---------+-------------------+---------------+------------------+-----

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.sql.functions import col
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import mlflow
import mlflow.keras

# Step 1: Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("Resume Score Prediction with All Features")
    .getOrCreate()
)

# Step 2: Load Data
file_path = "/content/cleaned_dataset.csv"  # Update this path to your file location
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Step 3: Rename Columns to Remove Spaces
renamed_columns = [col(name).alias(name.replace(" ", "_")) for name in df.columns]
df = df.select(renamed_columns)

# Step 4: Drop rows with null values
df = df.na.drop()

# Text columns for TF-IDF processing
text_columns = [
    "Cleaned_JD_Job_Description",
    "Cleaned_Resume_information",
    "Cleaned_JD_skills",
    "Cleaned_JD_Responsibilities"
]

# Each text column goes through: tokens -> hashed term frequencies -> TF-IDF
for col_name in text_columns:
    tokens_col = f"{col_name}_tokens"
    tf_col = f"{col_name}_tf"
    tfidf_col = f"{col_name}_tfidf"

    tokenizer = Tokenizer(inputCol=col_name, outputCol=tokens_col)
    hashingTF = HashingTF(inputCol=tokens_col, outputCol=tf_col, numFeatures=1000)
    idf = IDF(inputCol=tf_col, outputCol=tfidf_col)

    df = hashingTF.transform(tokenizer.transform(df))
    df = idf.fit(df).transform(df)

# Each categorical column goes through: string index -> one-hot vector
categorical_columns = ["Cleaned_JD_Qualifications", "Cleaned_JD_Preference", "Cleaned_JD_Job_Title", "Cleaned_JD_Role"]
for col_name in categorical_columns:
    index_col = f"{col_name}_index"
    ohe_col = f"{col_name}_ohe"

    indexer = StringIndexer(inputCol=col_name, outputCol=index_col)
    df = indexer.fit(df).transform(df)

    encoder = OneHotEncoder(inputCol=index_col, outputCol=ohe_col)
    df = encoder.fit(df).transform(df)

# Numerical columns (already ready for use)
numerical_columns = ["JD_Minimum_Experience"]

# Feature order: TF-IDF vectors, one-hot vectors, then the numeric columns
feature_columns = []
feature_columns.extend(f"{name}_tfidf" for name in text_columns)
feature_columns.extend(f"{name}_ohe" for name in categorical_columns)
feature_columns.extend(numerical_columns)

# Assemble everything into one vector column and keep only features + target
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df).select("features", "Resume_Score")

# Step 5: Convert DataFrame to Pandas for TensorFlow processing
data = df.toPandas()
# toArray() converts each Spark ML vector (sparse or dense) into a dense
# 1-D NumPy array; stacking them gives the (rows, features) matrix.
X = np.stack([vector.toArray() for vector in data["features"]])
y = np.array(data["Resume_Score"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out part of the training data for validation. Early stopping (with
# restore_best_weights) selects the model on the validation set, so using the
# test set there would make the reported test metrics optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Step 6: Build Neural Network Model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1)  # single linear output for the regression target
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Step 7: Train Model with MLflow
mlflow.keras.autolog()
# Check if there's an active MLflow run and end it
if mlflow.active_run() is not None:
    mlflow.end_run()

# Stop when the validation loss has not improved for 10 epochs and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

with mlflow.start_run(run_name="Resume Score Prediction with All Features"):
    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop]
    )

# Step 8: Evaluate Model
loss, mae = model.evaluate(X_test, y_test, verbose=0)
y_pred = model.predict(X_test)

# R^2 = 1 - SS_res / SS_tot. The result is stored in `test_r2` rather than
# `r2_score` so that it does not overwrite the sklearn.metrics.r2_score
# function imported elsewhere in this notebook.
residual_sum_of_squares = np.sum((y_test - y_pred.flatten())**2)
total_sum_of_squares = np.sum((y_test - np.mean(y_test))**2)
test_r2 = 1 - (residual_sum_of_squares / total_sum_of_squares)

print(f"Test Loss: {loss}")
print(f"Test MAE: {mae}")
print(f"Test R²: {test_r2}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 40ms/step - loss: 1548.8268 - mae: 30.6295 - val_loss: 385.9706 - val_mae: 16.3619
Epoch 2/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - loss: 387.1842 - mae: 16.2139 - val_loss: 408.5176 - val_mae: 16.6386
Epoch 3/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - loss: 353.0737 - mae: 15.5371 - val_loss: 470.8750 - val_mae: 17.8026
Epoch 4/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 333.2518 - mae: 14.8694 - val_loss: 381.6942 - val_mae: 16.2817
Epoch 5/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - loss: 317.1545 - mae: 14.6758 - val_loss: 391.5811 - val_mae: 16.4216
Epoch 6/100
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 44ms/step - loss: 293.8922 - mae: 13.9673 - val_loss: 396.3701 - val_mae: 16.5104
Epoch 7/100
[1m138/138[0m [32m━━━━━━