# Reviews Company

In [None]:
# !conda install -c conda-forge sentence-transformers
# !pip install scikit-learn
# !pip install numpy pandas
# !pip show sentence-transformers

In [None]:
# import required packages
import pandas as pd
import numpy as np
import spacy
import re
from spacy.matcher import PhraseMatcher
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Show the full contents of every cell: a max_colwidth of None disables truncation of long strings.
pd.set_option("display.max_colwidth",None)
# pd.reset_option("display.max_row",None)

In [None]:
# Load the reviews CSV (path is relative to the working directory) and display it.
data = pd.read_csv("Data_Merge_Reviews.csv")
data

In [None]:
# Use `data2` as the working frame from here on.
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
data2 = data

In [None]:
def split_info(df: pd.Series)-> pd.DataFrame:
    employment_type = []
    role_type = []
    duration = []
    for raw in df:
        data = raw.split()
        emp = rol = dur = "default"

        if len(data) >=2:
            emp = data[0]
            rol = data[1].replace(",", "")

        if "less than 1 year" in raw:
            dur = "<1 year"
        elif "more than 1 year" in raw:
            dur = "1 - 3 year"
        elif "more than 3 years" in raw:
            dur = ">3 year"
        employment_type.append(emp)
        role_type.append(rol)
        duration.append(dur)
         
    return pd.DataFrame({
       "Employment_Type":employment_type,
        "Role_Type":role_type,
        "Duration":duration
    })
# Derive Employment_Type / Role_Type / Duration from the raw "Job Status" text.
new_cols = split_info(data2["Job Status"])

In [None]:
# Attach the derived columns side by side, then keep only the columns of
# interest, placing the derived ones right after "Job Status".
column_order = [
    "Job Title",
    "Job Rating",
    "Time",
    "Job Status",
    "Employment_Type",
    "Role_Type",
    "Duration",
    "Pros",
    "Cons",
    "Company_Name",
]
data2 = pd.concat([data2, new_cols], axis=1)[column_order]

In [None]:
# data2.drop(["Employment_Type", "Role_Type", "Duration"], axis=1, inplace=True)

In [None]:
data2

In [None]:
data2["Employment_Type"].unique().tolist()

In [None]:
data2[data2["Employment_Type"] == "KEY"]

In [None]:
# Remove the row labelled 467 in place.
# NOTE(review): 467 is hard-coded — presumably the row surfaced by the "KEY" filter above; confirm.
# Re-running this cell raises KeyError once the label is gone.
data2.drop(467, axis=0, inplace=True)

In [None]:
# Make "Job Title" the index (in place); it stops being a regular column.
data2.set_index("Job Title", inplace=True)

In [None]:
# Move the current index back out into a column and renumber the rows 0..n-1.
data2= data2.reset_index()

In [None]:
data2["Role_Type"].unique().tolist()

In [None]:
# jobrole = {"intern":"Internship"}

In [None]:
# Rename the "intern" role to "internship" (exact, case-sensitive match).
# The result is assigned back rather than using inplace=True on the selected
# column: that chained in-place form is deprecated and does not modify data2
# when pandas Copy-on-Write is enabled.
data2["Role_Type"] = data2["Role_Type"].replace("intern", "internship")

In [None]:
# Normalize the "Pros" text by lowercasing it (no symbols are removed in this step).
data2["Pros"] = data2["Pros"].str.lower()

In [None]:
# Lowercase the job titles.
data2["Job Title"] = data2["Job Title"].str.lower()

In [None]:
# Expand contractions in "Cons" (e.g. "it's" -> "it is"); non-string values are left unchanged.
import contractions
data2["Cons"] = data2["Cons"].apply(lambda x: contractions.fix(x) if isinstance(x, str) else x)

In [None]:
data2[["Cons","Company_Name"]].head(50)

In [None]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_sentence(text):
    """Return ``text`` with words unknown to the spell checker replaced.

    The text is split on whitespace. A word whose lowercase form is known to
    the module-level ``spell`` checker is kept as written; any other word is
    swapped for ``spell.correction(word)``, falling back to the original word
    when no correction is offered. Non-string input is treated as an empty
    string, so the result is ``""``.
    """
    source = text if isinstance(text, str) else ""

    def _fix(token):
        # Known words pass through untouched, keeping their original casing.
        if token.lower() in spell:
            return token
        suggestion = spell.correction(token)
        return suggestion if suggestion else token

    return " ".join(_fix(token) for token in source.split())

In [None]:
# data2["Corrected_Cons"] = data2["Cons"].fillna("").apply(correct_sentence)
data2

In [None]:
# Load the spaCy pipeline named "en_core_web_lg" (the model package must be installed).
nlp = spacy.load("en_core_web_lg")

# patterns = [
#            "ict technician", "ict officer", "ict manager", "ict consultant", "ict manager", "ict project manager",
#            "ict security analyst", "ict service desk analyst", "ict - software engineer", "ict applications support"]

# Build a phrase matcher that compares tokens on their lowercase form and
# register every distinct, non-null job title from data2 as a "JOB_ROLE" pattern.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
unique_titles = data2["Job Title"].dropna().unique().tolist()
# make_doc only tokenizes, so the rest of the pipeline is not run on the patterns.
pattern_doc = [nlp.make_doc(title.lower()) for title in unique_titles]
matcher.add("JOB_ROLE", pattern_doc)

# Exact-match lookup for employment-status strings; keys are lowercase.
status_map = {
    "ft": "Full-time",
    "full-time": "Full-time",
    "full time": "Full-time",
    "intern": "Internship",
    "internship": "Internship",
    "intern (paid)": "Internship",
    "contract": "Contract",
    "contractor": "Contract"
}
# Spelled-out ratings mapped to their numeric value; read by clean_rating.
rating_map = {"one": 1.0, "two": 2.0,"three": 3.0, "four": 4.0,"five":5.0}
# The table was originally named `mapping`; keep that name usable.
mapping = rating_map

# First plain decimal number in a free-text rating, e.g. "4.5" in "4.5 stars".
_RATING_NUMBER = re.compile(r"\d+(?:\.\d+)?|\.\d+")


def clean_status(status):
    """Normalize an employment-status value to a canonical label.

    Matching is case-insensitive and ignores surrounding whitespace. Any value
    containing "intern", "contract" or "full" (checked in that order) maps to
    "Internship", "Contract" or "Full-time". Otherwise the value is looked up
    in ``status_map``, and unknown values are returned title-cased.

    Returns ``None`` for missing values (``None`` / NaN).
    """
    if pd.isna(status):
        return None
    # Normalize before any comparison so "Intern" and " FULL TIME " match too.
    status = str(status).lower().strip()
    if "intern" in status:
        return "Internship"
    if "contract" in status:
        return "Contract"
    if "full" in status:
        return "Full-time"
    return status_map.get(status, status.title())


# normalize the job rating
def clean_rating(rating):
    """Convert a rating to ``float``.

    Strings are lowercased and stripped, then resolved either through
    ``rating_map`` (number words such as "three") or by taking the first
    number found in the text. Non-string values are passed to ``float``.

    Returns ``None`` for missing values and for strings with no number in them.
    """
    if pd.isna(rating):
        return None
    if isinstance(rating, str):
        rating = rating.lower().strip()
        if rating in rating_map:
            return rating_map[rating]
        match = _RATING_NUMBER.search(rating)
        return float(match.group()) if match else None
    return float(rating)

# text cleaning with spacy
def spacy_clean(text):
    """Clean ``text`` with spaCy and collect the entities found in it.

    The text is lowercased and run through the module-level ``nlp`` pipeline.

    Returns:
        A ``(cleaned_text, info)`` tuple. ``cleaned_text`` is the
        space-joined lemmas of the alphabetic, non-stop-word tokens.
        ``info`` is a dict with two keys: ``"NER"``, a list of
        ``(entity text, entity label)`` pairs, and ``"ROLES"``, the text of
        every span found by the module-level ``matcher``.

        Non-string, empty or whitespace-only input yields ``""`` and a dict
        with two empty lists, so ``info`` has the same shape on every path.
    """
    if not isinstance(text, str) or not text.strip():
        return "", {"NER": [], "ROLES": []}

    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    # Each match is (match_id, start, end); only the matched span text is kept.
    job_roles = [doc[start:end].text for _, start, end in matcher(doc)]
    return " ".join(tokens), {"NER": entities, "ROLES": job_roles}
        
def process_row(row):
    """Build the cleaned features for one review row.

    "Job Title", "Pros" and "Cons" each go through ``spacy_clean`` and
    contribute a "<column> Clean" and a "<column> Entities" entry.
    "Job Status" and "Job Rating" are normalized with ``clean_status`` and
    ``clean_rating``. Columns missing from ``row`` are read as ``""``.

    Returns:
        A ``pd.Series`` holding the eight derived values.
    """
    features = {}
    for column in ("Job Title", "Pros", "Cons"):
        cleaned_text, found = spacy_clean(row.get(column, ""))
        features[f"{column} Clean"] = cleaned_text
        features[f"{column} Entities"] = found

    features["Cleaned Status"] = clean_status(row.get("Job Status", ""))
    features["Cleaned Rating"] = clean_rating(row.get("Job Rating", ""))
    return pd.Series(features)

# Run process_row on every row of data2; the result is a new frame holding only
# the derived columns (data2 itself is not modified). Then print the first rows.
data2_cleaned = data2.apply(process_row, axis=1)
print(data2_cleaned.head())

In [None]:
nlp("data scientist").similarity(nlp("machine learning engineer"))

In [None]:
nlp("internship").similarity(nlp("full-time"))

In [None]:
nlp("project manager").similarity(nlp("team lead"))

In [None]:
data2.columns

In [None]:
# Apply clean_status to every Employment_Type value and store the result back.
# NOTE(review): Employment_Type holds only the first word of "Job Status" (see split_info) —
# confirm this is the intended column rather than "Job Status" or Role_Type.
data2["Employment_Type"] = data2["Employment_Type"].apply(clean_status)

In [None]:
# Apply clean_rating to every "Job Rating" value and store the result back.
data2["Job Rating"] = data2["Job Rating"].apply(clean_rating)

In [None]:
# Lowercase the company names, drop every character that is neither a letter
# nor whitespace, and trim the ends.
# The pattern used to be 'r[^a-zA-Z]' with the raw-string prefix inside the
# quotes, which deleted an "r" followed by a non-letter ("tiger analytics" ->
# "tigeanalytics"). Whitespace is kept so multi-word names stay separated.
data2["Company_Name"] = (
    data2["Company_Name"]
    .str.lower()
    .str.replace(r"[^a-zA-Z\s]", "", regex=True)
    .str.strip()
)

In [None]:
# Preview the extracted entities next to the title they came from.
# "Job Title Entities" is a column of data2_cleaned, not data2, so selecting it
# from data2 raises KeyError; the two frames are joined on their shared index.
data2_cleaned[["Job Title Entities"]].join(data2["Job Title"]).head(50)

In [None]:
data2_cleaned

# Transformers implementation

In [None]:
data2_cleaned