In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import scipy

# Load the data
df = pd.read_csv("quera-karyabi.csv")

In [2]:
# Show the dimensions of the dataset (rows, columns)
print("Shape of dataset:", df.shape)

Shape of dataset: (200000, 15)


In [3]:
# Number of null values in each column
print("Number of nulls per column:")
print(df.isnull().sum())

Number of nulls per column:
Applicant ID                                             4200
Source                                                      0
Order                                                       0
Developer State                                             0
Seen                                                        0
Job ID                                                      0
Job Title                                                   0
Careers Job - Job → Level                               13833
Careers Job - Job → Offers Remote                           0
Careers Job - Job → Minimum Salary                     150958
Careers Job - Job → Maximum Salary                     151618
Careers Job - Job → Hit Count                               0
Accounts City - City → Name                             87806
Careers Disqualifyreason - Disqualify Reason → Name    137137
Job_ID_Title                                                0
dtype: int64


In [None]:
# Drop the rows whose Applicant ID is null.
# .copy() makes the result an independent frame: later cells add columns to
# `df`, and assigning into a boolean-filtered slice is the pattern pandas
# flags with SettingWithCopyWarning.
df = df[df["Applicant ID"].notnull()].copy()

In [None]:
# Substring replacements applied to lower-cased job titles by normalize_title.
#
# Order matters when the map is applied in insertion order:
#   * character normalization comes first, so titles typed with Arabic code
#     points can still match the Persian-spelled keys below;
#   * multi-word phrases come before the shorter keys they contain, otherwise
#     the shorter key is replaced first and the phrase can never match
#     (e.g. "تحلیلگر داده" must be handled before "تحلیلگر" and "داده").
# Every key appears exactly once; a repeated key in a dict literal silently
# overrides the earlier entry.
normalization_map = {
    # --- Character normalization ---
    "ك": "ک",  # Arabic kaf -> Persian kaf
    "ي": "ی",  # Arabic yeh -> Persian yeh
    # --- Multi-word phrases (must precede their sub-phrases) ---
    "توسعه دهنده نرم افزار": "software developer",
    "توسعه‌دهنده نرم‌افزار": "software developer",
    "مهندس نرم افزار": "software engineer",
    "مهندس نرم‌افزار": "software engineer",
    "کارشناس نرم افزار": "software specialist",
    "کارشناس نرم‌افزار": "software specialist",
    "تحلیلگر داده": "data analyst",
    "علم داده": "data science",
    "پایگاه داده": "database",
    # --- Roles ---
    "برنامه نویسی": "developer",
    "برنامه‌نویسی": "developer",
    "برنامه نویس": "developer",
    "برنامه‌نویس": "developer",
    "توسعه دهنده": "developer",
    "توسعه‌دهنده": "developer",
    "پشتیبان": "support",
    "تحلیلگر": "analyst",
    "تحلیل گر": "analyst",
    "طراح": "designer",
    "مدیر": "manager",
    "متخصص": "specialist",
    "کارشناس": "specialist",
    # --- Levels ---
    "ارشد": "senior",
    "سنیور": "senior",
    "جونیور": "junior",
    "کارآموز": "intern",
    "کاراموز": "intern",
    "تازه کار": "junior",
    "تازهکار": "junior",
    "میدلول": "mid-level",
    "mid level": "mid-level",
    # --- Technologies & fields ---
    # The leading space is kept from the original entry; normalize_title
    # collapses repeated whitespace afterwards.
    "تحلیل": " analysis",
    "ساینس": "science",
    "ساینتیست": "scientist",
    "وب": "web",
    "لاراول": "laravel",
    "نرم‌افزار": "software",
    "جنگو": "django",
    "پایتون": "python",
    "جاوااسکریپت": "javascript",
    "جاوا اسکریپت": "javascript",
    "پی اچ پی": "php",
    "پی‌اچ‌پی": "php",
    "سی شارپ": "c#",
    "سی‌شارپ": "c#",
    "دات نت": ".net",
    "دات‌نت": ".net",
    "فرانت اند": "frontend",
    "فرانت‌اند": "frontend",
    "بک اند": "backend",
    "بک‌اند": "backend",
    "فول استک": "fullstack",
    "دواپس": "devops",
    "دوآپس": "devops",
    "هوش مصنوعی": "artificial intelligence",
    "یادگیری ماشین": "machine learning",
    "دیتا": "data",
    "داده": "data",
    "دانشمند": "scientist",
    "اندروید": "android",
    "آی او اس": "ios",
    "آی‌او‌اس": "ios",
    "موبایل": "mobile",
    "امنیت": "security",
    "شبکه": "network",
    "اس کیو ال": "sql",
    # "دورکاری" means remote work; it was previously mapped to "intern".
    "دورکاری": "remote",
    # Hyphens are stripped last, so "front-end" becomes "frontend" and the
    # "mid-level" produced above becomes "midlevel".
    "-": "",
}

import re


def normalize_title(title):
    """Normalize a raw job title using the module-level ``normalization_map``.

    Steps:
      1. Non-string input (e.g. NaN) yields an empty string.
      2. The title is lower-cased and Arabic kaf/yeh are converted to their
         Persian forms *before* any keyword replacement, so the
         Persian-spelled keys of the map can match titles typed with Arabic
         code points.
      3. Every key of ``normalization_map`` is replaced by its value, longest
         key first, so a multi-word phrase is handled before any shorter key
         it contains. Keys of equal length keep the map's own order because
         ``sorted`` is stable.
      4. Runs of whitespace are collapsed and the result is stripped.

    Parameters
    ----------
    title : Any
        Raw job title.

    Returns
    -------
    str
        The normalized title, or ``""`` when ``title`` is not a string.
    """
    if not isinstance(title, str):
        return ""  # default for non-text values

    # Character-level normalization has to precede the phrase replacements.
    title = title.lower().replace("ك", "ک").replace("ي", "ی")

    # Longest-first prevents a short key from consuming part of a longer
    # phrase (e.g. "داده" inside "تحلیلگر داده").
    for key in sorted(normalization_map, key=len, reverse=True):
        title = title.replace(key, normalization_map[key])

    return re.sub(r"\s+", " ", title).strip()


# Names of the raw and the normalized title columns
original_title_column = "Job Title"
normalized_title_column = "normal_title"

# Run normalize_title over every raw title and store the result as a new column
raw_titles = df[original_title_column]
df[normalized_title_column] = raw_titles.map(normalize_title)


def get_job_domain(title):
    """
    Classify a job title into one coarse domain label.

    The title is converted to ``str`` and lower-cased, then checked against
    the keyword lists below in a fixed priority order: AI_Data, Android,
    Backend, Frontend, UIUX. The first domain with a matching keyword wins.

    Matching is plain substring search, with two guarded exceptions that
    previously caused misclassification:

    * ``"go"``, ``"ui"`` and ``"ux"`` must not be surrounded by Latin letters,
      so they no longer match inside words such as "category" or "linux".
    * ``"java"`` does not match when it is the start of "javascript";
      Backend is checked before Frontend, so the unguarded match labelled
      every JavaScript title as Backend.

    Parameters
    ----------
    title : Any
        Job title (normalized or raw).

    Returns
    -------
    str
        One of "AI_Data", "Android", "Backend", "Frontend", "UIUX", "Other".
    """
    title = str(title).lower()

    # Keywords that need more than a bare substring test.
    guarded_patterns = {
        "go": r"(?<![a-z])go(?![a-z])",
        "ui": r"(?<![a-z])ui(?![a-z])",
        "ux": r"(?<![a-z])ux(?![a-z])",
        "java": r"java(?!script)",
    }

    def mentions(keyword):
        """Return True when ``keyword`` occurs in the title."""
        pattern = guarded_patterns.get(keyword)
        if pattern is None:
            return keyword in title
        return re.search(pattern, title) is not None

    ai_data_keywords = [
        "data scientist",
        "machine learning",
        "یادگیری ماشین",
        "هوش مصنوعی",
        "artificial intelligence",
        "تحلیلگر داده",
        "تحلیل",
        "data analyst",
        "data engineer",
        "مهندس داده",
        "data science",
        "علم داده",
        "داده کاوی",
        "big data",
        "بیگ دیتا",
    ]
    backend_keywords = [
        "backend",
        "back-end",
        "بک اند",
        "django",
        "flask",
        # Presumably ".Net" as it ends up when typed inside right-to-left
        # text (the data contains titles ending in "Net.") - confirm.
        "net.",
        "fastapi",
        "node.js",
        "php",
        "laravel",
        "c#",
        "back end",
        ".net",
        "java",
        "spring",
        "go",
        "python",
        "ruby",
        "rails",
        "nodejs",
    ]
    frontend_keywords = [
        "frontend",
        "front-end",
        "فرانت اند",
        "react",
        "vue",
        "angular",
        "javascript",
        "web",
        "front end",
    ]
    android_keywords = [
        "android",
        "اندروید",
        "kotlin",
        "کاتلین",
        "jetpack",
        "compose",
        "موبایل",
        "flutter",
        "ios",
    ]
    uiux_keywords = ["ui", "ux", "uiux", "designer", "دیزاینر", "figma"]

    # Priority order is significant: the first matching domain is returned.
    domains_by_priority = (
        ("AI_Data", ai_data_keywords),
        ("Android", android_keywords),
        ("Backend", backend_keywords),
        ("Frontend", frontend_keywords),
        ("UIUX", uiux_keywords),
    )
    for domain, keywords in domains_by_priority:
        if any(mentions(keyword) for keyword in keywords):
            return domain
    return "Other"


# Derive the coarse job domain from each normalized title
normalized_titles = df["normal_title"]
df["Job Domain"] = normalized_titles.map(get_job_domain)

In [None]:
# Constant weight; it is the cell value used when the user-item matrix is pivoted
df["interaction"] = 1

# Composite job label "<Job ID> | <Job Title>", used as the item identifier
job_id_text = df["Job ID"].astype(str)
job_title_text = df["Job Title"].astype(str)
df["Job_ID_Title"] = job_id_text + " | " + job_title_text

In [None]:
# Build the user-item matrix: rows = Applicant ID, columns = Job_ID_Title,
# cell value = interaction (0 where the applicant has no row for that job)
user_item_matrix = df.pivot_table(
    index="Applicant ID",
    columns="Job_ID_Title",
    values="interaction",
    fill_value=0,
)

In [8]:
# Inspect the size of the unfiltered user-item matrix and preview its first rows
print(user_item_matrix.shape)
user_item_matrix.head(15)

(29570, 13799)


Job_ID_Title,100008 | متخصص هوش مصنوعی (NLP),100032 | کارشناس کنترل پروژه,100056 | توسعه‌دهنده ارشد Front-end,100104 | توسعه‌دهنده‌ی Front-end,100128 | Front-end Developer (ReactJS),100152 | توسعه‌دهنده‌ی Front-end,100224 | کارشناس اسکرام مستر,100272 | طراح وب,10032 | برنامه نویس (آقا),100368 | توسعه‌دهنده Front-end,...,99792 | کارشناس DevOps,99816 | Web Designer,9984 | برنامه نویس Android,99840 | طراح رابط کاربری UI/UX,99864 | کارشناس تولید محتوا,99888 | مدیر محصول,99912 | توسعه‌دهنده Flutter,99936 | Senior Back-End Developer (پایتون),99960 | (Back-End Developer (ASP.NET,99984 | (Front-end Developer (ReactJS
Applicant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1344.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1488.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2112.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4992.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8256.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Minimum column sum (applications) a job posting needs to be kept
min_apps_per_job = 15
# Minimum row sum (applications to kept postings) an applicant needs to be kept
min_jobs_per_user = 10

In [10]:
# Total interactions (applications) received by each job posting
job_application_counts = user_item_matrix.sum(axis=0)

# Keep only the columns (postings) with at least min_apps_per_job applications
has_enough_applications = job_application_counts >= min_apps_per_job
filtered_jobs = job_application_counts.loc[has_enough_applications].index

# Restrict the matrix to those columns
user_item_matrix = user_item_matrix.loc[:, filtered_jobs]

In [None]:
# Total interactions (applications) made by each applicant
user_application_counts = user_item_matrix.sum(axis=1)

# Keep only the applicants who applied to at least min_jobs_per_user postings
is_active_user = user_application_counts >= min_jobs_per_user
filtered_users = user_application_counts.loc[is_active_user].index

# Restrict the matrix to those rows
user_item_matrix = user_item_matrix.loc[filtered_users]

In [None]:
# Report the matrix size after the job and user filters
print("Final user-item matrix shape:", user_item_matrix.shape)

Final user-item matrix shape: (3746, 3267)


In [None]:
# The 15 applicants with the largest row sums in the filtered matrix
top_users = user_item_matrix.sum(axis=1).sort_values(ascending=False).head(15)
print(top_users)

Applicant ID
10894224.0    198.0
3902592.0     184.0
3237456.0     183.0
5813664.0     147.0
5105952.0     141.0
5401248.0     140.0
2275008.0     136.0
8049936.0     115.0
5202816.0     112.0
3435024.0     110.0
2674848.0     104.0
7769856.0     103.0
9985248.0     101.0
5500704.0     101.0
1395696.0      97.0
dtype: float64


In [None]:
# Item-user matrix (transpose of user_item_matrix): one row per job posting
item_user_matrix = user_item_matrix.T
item_names = item_user_matrix.index
n_items = len(item_names)

# Vectorized Jaccard and Dice similarity between every pair of items.
# This replaces a pure-Python double loop over all item pairs (with a
# DataFrame row lookup per pair) by one matrix product; the formulas are the
# same ones the loop used.
item_values = item_user_matrix.values.astype(float)

# Any non-zero cell counts as "applied", as np.logical_and / np.logical_or did.
applied = (item_values != 0).astype(float)

# intersection[i, j] = number of users who applied to both item i and item j
intersection = applied @ applied.T

# |A ∪ B| = |A| + |B| - |A ∩ B|
applicant_counts = applied.sum(axis=1)
union = applicant_counts[:, None] + applicant_counts[None, :] - intersection

# The Dice denominator uses the raw row sums, exactly like the loop version
# (identical to applicant_counts when the matrix holds only 0/1 values).
item_totals = item_values.sum(axis=1)
dice_denominator = item_totals[:, None] + item_totals[None, :]

# Entries with a zero denominator stay 0, matching the loop's fallback.
jaccard_similarity_array = np.divide(
    intersection,
    union,
    out=np.zeros((n_items, n_items)),
    where=union != 0,
)
dice_similarity_array = np.divide(
    2 * intersection,
    dice_denominator,
    out=np.zeros((n_items, n_items)),
    where=dice_denominator != 0,
)

In [None]:
# Label both similarity arrays with the item names on rows and columns
similarity_axes = {"index": item_names, "columns": item_names}
jaccard_similarity_array = pd.DataFrame(jaccard_similarity_array, **similarity_axes)
dice_similarity_array = pd.DataFrame(dice_similarity_array, **similarity_axes)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_cosine_similarity(ui_matrix):
    """Compute item-item cosine similarity from a user-item matrix.

    The matrix is transposed so that each item becomes one row, then passed to
    ``cosine_similarity``. The result is returned as a DataFrame whose rows
    and columns are both labelled with ``ui_matrix.columns``. A success
    message is printed before returning.
    """
    item_labels = ui_matrix.columns
    similarity_df = pd.DataFrame(
        cosine_similarity(ui_matrix.T),
        index=item_labels,
        columns=item_labels,
    )

    print("✅ محاسبه ماتریس شباهت COSINE با موفقیت انجام شد.")
    return similarity_df


# Item-item cosine similarity for the filtered user-item matrix
item_similarity_cosine = calculate_cosine_similarity(user_item_matrix)

✅ محاسبه ماتریس شباهت COSINE با موفقیت انجام شد.


In [None]:
from scipy.stats import pearsonr


def calculate_overall_correlation(df1, df2):
    """Pearson correlation between two identically-labelled square matrices.

    Only the entries strictly above the main diagonal (``k=1``) of each frame
    are compared.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` as produced by ``scipy.stats.pearsonr``.

    Raises
    ------
    ValueError
        If the frames differ in shape, index or columns.
    """
    same_layout = (
        df1.shape == df2.shape
        and all(df1.index == df2.index)
        and all(df1.columns == df2.columns)
    )
    if not same_layout:
        raise ValueError("دیتافریم‌ها باید ابعاد و اندیس‌های یکسان داشته باشند.")

    # Shapes are equal at this point, so one set of indices serves both frames.
    upper_triangle = np.triu_indices_from(df1.values, k=1)
    first_values = df1.values[upper_triangle]
    second_values = df2.values[upper_triangle]

    correlation, p_value = pearsonr(first_values, second_values)
    return correlation, p_value


# Compute the correlations between the similarity matrices
corr_jaccard_dice, _ = calculate_overall_correlation(
    jaccard_similarity_array, dice_similarity_array
)
print(f"✅ همبستگی پیرسون بین Jaccard و Dice: {corr_jaccard_dice:.4f}")

corr_jaccard_cosine, _ = calculate_overall_correlation(
    jaccard_similarity_array, item_similarity_cosine
)
print(f"✅ همبستگی پیرسون بین Jaccard و Cosine: {corr_jaccard_cosine:.4f}")

✅ همبستگی پیرسون بین Jaccard و Dice: 0.9979
✅ همبستگی پیرسون بین Jaccard و Cosine: 0.9901


In [None]:
def recommender_system(user_id, metric_name):
    """Recommend up to 10 job postings for a user via item-item similarity.

    Each candidate posting is scored by summing its similarity to every
    posting the user has already interacted with; postings the user already
    interacted with are excluded from the candidates.

    Reads the notebook-level objects ``jaccard_similarity_array`` and
    ``item_similarity_cosine`` (item-item similarity DataFrames),
    ``user_item_matrix`` and ``df``. The function previously referred to
    ``jaccard_df`` / ``cosine_df``, which are never defined in the notebook.

    Parameters
    ----------
    user_id
        Row label of the user in ``user_item_matrix``.
    metric_name : str
        "Jaccard" or "Cosine" (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        Columns "Job_ID_Title" and "Job Domain", ordered from the highest to
        the lowest score. An empty DataFrame when the user is unknown.

    Raises
    ------
    ValueError
        If ``metric_name`` is neither "jaccard" nor "cosine".
    """
    # Choose the similarity matrix
    metric = metric_name.lower()
    if metric == "jaccard":
        sim_df = jaccard_similarity_array
    elif metric == "cosine":
        sim_df = item_similarity_cosine
    else:
        raise ValueError(
            "❌ معیار شباهت نامعتبر است. فقط از 'Jaccard' یا 'Cosine' استفاده کنید."
        )

    if user_id not in user_item_matrix.index:
        print("❌ کاربر مورد نظر در داده‌ها وجود ندارد.")
        return pd.DataFrame()

    # Postings the user has interacted with (sent a resume to)
    user_jobs = user_item_matrix.loc[user_id]
    interacted_jobs = user_jobs[user_jobs > 0].index

    # Score every posting by its summed similarity to the user's postings
    similar_scores = sim_df[interacted_jobs].sum(axis=1)

    # Remove the postings the user already interacted with
    similar_scores = similar_scores.drop(interacted_jobs, errors="ignore")

    # Top 10 postings, best first
    top_10_jobs = similar_scores.sort_values(ascending=False).head(10).index

    # Attach the job domain to each recommended posting
    result_df = df[["Job_ID_Title", "Job Domain"]].drop_duplicates()
    result_df = result_df[result_df["Job_ID_Title"].isin(top_10_jobs)]

    # The isin filter returns rows in df order; restore the score ranking.
    rank_of_job = {job: position for position, job in enumerate(top_10_jobs)}
    result_df = (
        result_df.assign(_rank=result_df["Job_ID_Title"].map(rank_of_job))
        .sort_values("_rank")
        .drop(columns="_rank")
    )

    return result_df.reset_index(drop=True)

In [None]:
def get_user_applied_jobs(
    user_id, num_jobs_to_return=15, user_item_matrix_df=user_item_matrix
):
    """List the postings a user has applied to.

    Looks up the row of ``user_id`` in ``user_item_matrix_df`` and returns the
    column labels whose value is at least 1.0, truncated to the first
    ``num_jobs_to_return`` labels. If the user is not in the matrix index, an
    error message is printed and an empty list is returned.

    Note: the default for ``user_item_matrix_df`` is bound when this function
    is defined, so it refers to ``user_item_matrix`` as it was at that moment.
    """
    if user_id in user_item_matrix_df.index:
        # Row of the target user: one value per posting
        interactions = user_item_matrix_df.loc[user_id]
        # Labels of the postings with an interaction value of 1.0 or more
        applied_labels = interactions[interactions >= 1.0].index.tolist()
        return applied_labels[:num_jobs_to_return]

    print(f"خطا: کاربر با شناسه '{user_id}' در ماتریس کاربر-آیتم یافت نشد.")
    return []


# Example: up to 15 postings (the default limit) that applicant 3902592.0 applied to
get_user_applied_jobs(user_id=3902592.0)

['100272 | طراح وب',
 '102792 | کارآموز برنامه\u200cنویسی Net.',
 '106104 | توسعه\u200cدهنده Front-end',
 '108240 | کارآموز React Native',
 '108264 | کارآموز Front-end',
 '109632 | توسعه\u200cدهنده Front-end',
 '111048 | کارآموز برنامه\u200cنویس Front End',
 '112944 | توسعه\u200cدهنده Back-end',
 '115800 | توسعه\u200cدهنده React) Front-end)',
 '118056 | کارآموز React',
 '121056 | کارآموز Front-End',
 '123336 | کارآموز توسعه\u200cدهنده وب',
 '123936 | کاز آموز توسعه\u200cدهنده فرانت اند',
 '134064 | کارآموز برنامه\u200cنویسی وب',
 '150072 | React js Front-end Developer']