In [1]:
import pandas as pd
import re
import joblib
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
import re

# Common English function words that carry no skill information; tokens in
# this set are removed by clean_skills().
manual_stopwords = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be',
    'but', 'by', 'for', 'if', 'in', 'into', 'is',
    'it', 'no', 'not', 'of', 'on', 'or', 'such',
    'that', 'the', 'their', 'then', 'there', 'these', 'they',
    'this', 'to', 'was', 'will', 'with', 'you', 'your',
}


def clean_skills(text):
    """Normalise a free-text skills string into space-separated tokens.

    Steps, in order: lowercase the text, replace every character that is
    neither a regex word character (``\\w``) nor whitespace with a space,
    collapse whitespace runs, then drop tokens found in ``manual_stopwords``.

    Parameters
    ----------
    text : object
        The raw skills value. Anything that is not a non-blank ``str``
        (``None``, NaN, numbers, whitespace-only strings) is treated as empty.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``""`` when the
        input is empty/invalid or every token is a stopword.
    """
    if isinstance(text, str) and text.strip():
        depunctuated = re.sub(r"[^\w\s]", " ", text.lower())
        squeezed = re.sub(r"\s+", " ", depunctuated).strip()
        kept = filter(lambda token: token not in manual_stopwords, squeezed.split())
        return " ".join(kept)

    # Non-string or blank input.
    return ""


In [3]:

# Load the job postings from the "Fact_Wuzzuf" sheet of the workbook.
df = pd.read_excel("Cleaned_Dataset_for_taison.xlsx", sheet_name="Fact_Wuzzuf")

# Drop repeated postings (first occurrence is kept), then renumber the rows
# 0..n-1. Without the reset, the index keeps gaps where duplicates were
# removed, so a row's label would no longer equal its position -- and any
# matrix built row-by-row from `df` is addressed by position.
df = df.drop_duplicates(
    subset=["Job Title", "Job Position", "Company", "Link"]
).reset_index(drop=True)

# Normalised skills text, computed only for the rows that survive dedup.
df["Cleaned_Skills"] = df["Skills"].apply(clean_skills)



In [5]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fit a TF-IDF vocabulary on the cleaned skills text; row i of `tfidf_matrix`
# corresponds to the i-th row (by position) of `df`.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["Cleaned_Skills"])

# Summary of the fitted matrix plus a sample slice of the learned vocabulary.
print(
    "✅ TF-IDF Matrix created.",
    f"➡️ Number of jobs: {tfidf_matrix.shape[0]}",
    f"➡️ Number of unique skill features: {tfidf_matrix.shape[1]}",
    "\nSome of the features (skills):",
    tfidf.get_feature_names_out()[30:60],
    sep="\n",
)

✅ TF-IDF Matrix created.
➡️ Number of jobs: 2638
➡️ Number of unique skill features: 1323

Some of the features (skills):
['administrative' 'administrator' 'adminstration' 'adobe' 'adope' 'ads'
 'advanced' 'advertising' 'advising' 'advisor' 'adwords' 'affairs' 'after'
 'aftereffect' 'agencies' 'agent' 'agentic' 'agile' 'agriculture' 'ai'
 'aicerts' 'aid' 'air' 'airline' 'alarm' 'alexandria' 'align' 'aligning'
 'allocation' 'alsadat']


In [6]:
# === Step 4: Try user input ===
# Interactive smoke test: read one free-text skills string, normalise it with
# the same clean_skills() used on the dataset, and show the result.
user_input = input("\n💬 Enter your skills: ")
user_cleaned = clean_skills(user_input)
print(f"🔍 Cleaned: {user_cleaned}")

# Project the cleaned text onto the already-fitted vocabulary (transform only,
# no refit). The input is wrapped in a list, so the result has a single row.
user_vec = tfidf.transform([user_cleaned])

🔍 Cleaned: machine learning


In [7]:
# List the vocabulary terms that received a non-zero weight in `user_vec`,
# i.e. the parts of the user's input that the fitted vocabulary recognised.
print("\n✅ Non-zero features in your input:")
feature_names = tfidf.get_feature_names_out()
# nonzero() returns (row_indices, column_indices); the column indices are the
# feature positions, which index into `feature_names`.
nonzero_indices = user_vec.nonzero()[1]
for idx in nonzero_indices:
    print(f" - {feature_names[idx]}")


✅ Non-zero features in your input:
 - machine
 - learning


In [8]:
# Recap of the TF-IDF matrix dimensions: rows are jobs, columns are tokens.
print(
    "✅ TF-IDF built.",
    f"📄 Total jobs: {tfidf_matrix.shape[0]}",
    f"🧠 Unique skill tokens: {tfidf_matrix.shape[1]}",
    sep="\n",
)


✅ TF-IDF built.
📄 Total jobs: 2638
🧠 Unique skill tokens: 1323


In [9]:
# Collect the optional filters and the skills text. Every filter may be left
# blank, in which case the corresponding filter is skipped later on.
print("\n📝 Enter job filters (or leave blank to skip):")
city = input("City: ").strip()
job_type = input("Job Type (Full Time / Part Time): ").strip()
work_mode = input("Work Mode (Remote / On-site / Hybrid): ").strip()
min_exp_str = input("Minimum years of experience: ").strip()
user_skills = input("\n💬 Enter your skills (comma-separated): ")

# Parse the years of experience; blank or non-numeric input means "no filter".
# isdecimal() is used rather than isdigit(): isdigit() also accepts characters
# such as superscript digits ("²") that int() cannot parse, which would raise
# ValueError here. Every string accepted by isdecimal() is parseable by int().
min_exp = int(min_exp_str) if min_exp_str.isdecimal() else None


📝 Enter job filters (or leave blank to skip):


In [10]:
def _meets_experience(val, max_years):
    """Return True when a posting's experience requirement is at most ``max_years``.

    The first integer found in the cell is taken as the required number of
    years. Missing cells, "not mentioned", and cells without any number are
    kept (True) because no requirement can be read from them.
    """
    if pd.isna(val):
        return True
    # str() guards against numeric cells (Excel may yield int/float values),
    # which have no .lower()/.strip().
    text = str(val).strip().lower()
    if text == "not mentioned":
        return True
    numbers = re.findall(r"\d+", text)
    return int(numbers[0]) <= max_years if numbers else True


# Start from the full dataset; the original row labels are kept so the
# surviving rows can still be traced back to `df`.
filtered_df = df.copy()

# Case-insensitive exact-match filters; a blank answer skips the filter.
for column, wanted in (("City", city), ("Job Type", job_type), ("Work Mode", work_mode)):
    if wanted:
        filtered_df = filtered_df[filtered_df[column].str.lower() == wanted.lower()]

# `is not None` (rather than truthiness) so that an answer of 0 years still
# applies the filter instead of being treated as "no filter".
if min_exp is not None:
    # astype(bool) keeps the mask a boolean indexer even when no rows are left.
    keep = filtered_df["Experience"].apply(_meets_experience, max_years=min_exp).astype(bool)
    filtered_df = filtered_df[keep]

if filtered_df.empty:
    print("\n❌ No jobs match your filters.")
    

In [11]:
# Vectorise the user's skills with the vocabulary fitted on the dataset.
user_cleaned = clean_skills(user_skills)
user_vec = tfidf.transform([user_cleaned])

# Row labels of the jobs that survived the filters.
filtered_indices = filtered_df.index.tolist()

# `tfidf_matrix` was built from `df` row by row, so it must be sliced with the
# POSITIONS of the surviving rows inside `df`. Resetting the index and slicing
# with 0..k-1 would pick the first k jobs of the dataset instead of the
# filtered ones. get_indexer() maps each label to its position in `df.index`
# and returns -1 for labels it cannot find.
row_positions = df.index.get_indexer(filtered_df.index)
if (row_positions < 0).any():
    raise ValueError(
        "filtered_df contains rows that are not in df; re-run the filter cell."
    )

if row_positions.size:
    filtered_tfidf = tfidf_matrix[row_positions]
    similarities = cosine_similarity(user_vec, filtered_tfidf).flatten()
else:
    # Nothing survived the filters: skip the similarity computation, which
    # does not accept an empty matrix, and produce an empty score array.
    filtered_tfidf = tfidf_matrix[:0]
    similarities = pd.Series(dtype="float64").to_numpy()

# Attach the score as a percentage. The original row labels are kept, which
# also makes this cell safe to re-run.
filtered_df = filtered_df.copy()
filtered_df["Match Score"] = (similarities * 100).round(2)

# Seven best-scoring postings.
top_matches = filtered_df.sort_values(by="Match Score", ascending=False).head(7)

In [12]:
# Print one card per recommended job, followed by a 60-dash separator line.
for i, row in top_matches.iterrows():
    print(
        f"{row['Job Title']} – {row['Job Position']}",
        f"🏢 Company Location: {row['CompanyLocation']}",
        f"🧠 Required Skills: {row['Skills']}",
        f"📌 Match Score: {row['Match Score']}%",
        f"🔗 Link: {row['Link']}",
        "-" * 60,
        sep="\n",
    )


ai engineer – Product Manager – Data & AI Products
🏢 Company Location: Cairo
🧠 Required Skills: IT/Software Development · Communication · Computer Science · data-driven · Engineering · Information Technology (IT) · Management · Product Management · SQL · Product Strategy & Roadmapping
📌 Match Score: 56.51%
🔗 Link: https://wuzzuf.net/jobs/p/7qu8aw9vozor-product-manager-data-ai-products-hiremoters-cairo-egypt
------------------------------------------------------------
ai engineer – Business Analyst – AI & Web3
🏢 Company Location: Cairo
🧠 Required Skills: IT/Software Development · Project/Program Management · Business Analysis · Information Technology (IT) · Project Management · Software Testing · Communication skills · analytical · Agile
📌 Match Score: 56.07%
🔗 Link: https://wuzzuf.net/jobs/p/3z7mhjvlnvii-business-analyst-–-ai-web3-kaiizn-llc-cairo-egypt
------------------------------------------------------------
sales representative – Inbound Telesales Agent
🏢 Company Location: Cairo


In [25]:
# Spot check: show the cleaned skills of the first posting titled "data analyst".
w = df.loc[df['Job Title'] == 'data analyst', 'Cleaned_Skills']
if not w.empty:
    # .iloc[0] is the first match by position. .loc[0] would look up the row
    # LABELLED 0 and raise KeyError whenever that row is not a data analyst.
    print(w.iloc[0])
else:
    print("No matching job found.")


analyst research power bi data analysis ssis sql python
