In [1]:
# pip install mlxtend

In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import warnings

In [14]:
# Suppress specific warnings from mlxtend (optional, but can clean up output)
warnings.filterwarnings('ignore', category=FutureWarning)

In [15]:
import os

In [16]:
print(os.getcwd())

/Users/amarachiordor/Downloads/data_analysis_linkedin_jobs/notebooks


In [17]:
import pandas as pd

# Step 1: Define the file path (adjust as needed)
file_path_bridge = '../model/bridge_job_skill.xlsx'
file_path_skill = '../model/dim_skill.xlsx'

# Step 2: Load the Excel files
df_bridge_job_skill = pd.read_excel(file_path_bridge)
df_dim_skill = pd.read_excel(file_path_skill)


In [18]:
df_bridge_job_skill.head()

Unnamed: 0,job_id,skill_id,category_id
0,4013873022,21,3
1,4013873022,10,1
2,4013873022,7,1
3,4013873022,20,3
4,4013873022,19,3


In [19]:
df_dim_skill.head()

Unnamed: 0,skill_id,skill_name,category_id
0,1,communication,1
1,2,attention to detail,1
2,3,problem-solving,1
3,4,presentation,1
4,5,storytelling,1


In [20]:
df_bridge_job_skill.drop('category_id', axis=1, inplace=True)

In [21]:
df_bridge_job_skill.head()

Unnamed: 0,job_id,skill_id
0,4013873022,21
1,4013873022,10
2,4013873022,7
3,4013873022,20
4,4013873022,19


In [22]:
df_bridge_job_skill = df_bridge_job_skill[['job_id', 'skill_id']]
df_dim_skill = df_dim_skill[['skill_id', 'skill_name']]

In [23]:
# --- 2. Join Tables to Get Skill Names for Each Job ---
# We need to join bridge_job_skill with dim_skill to get the actual skill_name for each job_id.
# This results in a DataFrame with job_id and skill_name, which is the format needed for TransactionEncoder.
df_job_skills = df_bridge_job_skill.merge(
    df_dim_skill[['skill_id', 'skill_name']], # Select only necessary columns from dim_skill
    on='skill_id',
    how='inner' # Use inner join to ensure we only get valid skill_ids
)

print("df_job_skills (after join) head:\n", df_job_skills.head())
print("\n" + "="*50 + "\n")

df_job_skills (after join) head:
        job_id  skill_id skill_name
0  4013873022        21          r
1  4013727701        21          r
2  3958657766        21          r
3  3978416260        21          r
4  4034819398        21          r




In [25]:
# --- 3. Prepare Data for Apriori ---
# Create a list of lists, where each inner list is the skills for one job.
# This groups all skills associated with a single job_id into a list.
transactions = df_job_skills.groupby('job_id')['skill_name'].apply(list).tolist()

# Use TransactionEncoder to convert the list of lists into a one-hot encoded DataFrame.
# This creates a binary matrix (jobs x skills) where 1 means the skill is present for that job.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

print("df_transactions (one-hot encoded skills) head:\n", df_transactions.head())
print("\n" + "="*50 + "\n")

df_transactions (one-hot encoded skills) head:
    airflow  apache spark  attention to detail  automation    aws  azure  \
0    False         False                False       False  False  False   
1    False         False                False       False  False  False   
2    False         False                False       False  False  False   
3    False         False                False       False  False  False   
4    False         False                False       False  False  False   

   bachelor  clustering  collaboration  communication  ...  snowflake    sql  \
0      True       False          False           True  ...      False   True   
1     False       False          False           True  ...      False  False   
2     False       False          False           True  ...      False  False   
3     False       False           True           True  ...      False  False   
4      True       False           True          False  ...      False   True   

   stakeholder manag

In [26]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from tqdm import tqdm
import time

# Simulate step tracking with progress bars
def step_progress(step_name, task_function):
    print(f"\n{step_name}")
    for _ in tqdm(range(1), desc=step_name):
        result = task_function()
    return result

# --- Step 1: Generate Frequent Itemsets ---
def generate_frequent_itemsets():
    time.sleep(1)  # Simulate time delay for progress bar visibility
    return apriori(df_transactions, min_support=0.01, use_colnames=True)

frequent_itemsets = step_progress("Generating Frequent Itemsets", generate_frequent_itemsets)
print(f"âœ” Found {len(frequent_itemsets)} frequent itemsets.")

# --- Step 2: Generate Association Rules ---
def generate_rules():
    time.sleep(1)  # Simulate time delay
    return association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

rules = step_progress("Generating Association Rules", generate_rules)
print(f"âœ” Generated {len(rules)} association rules.")

# --- Step 3: Process Rules (e.g., Add Length Column) ---
def process_rules():
    tqdm.pandas(desc="Processing Rules")
    return rules.progress_apply(
        lambda row: len(row["antecedents"]) + len(row["consequents"]),
        axis=1
    )

print("\nProcessing Rules (adding 'rule_length' column)...")
rules["rule_length"] = process_rules()

# --- Step 4: Display Sample ---
print("\nðŸ“‹ Sample Rules (first 10):\n", rules.head(10))
print("\n" + "="*50 + "\n")


Generating Frequent Itemsets


Generating Frequent Itemsets: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:49<00:00, 49.84s/it]


âœ” Found 87187 frequent itemsets.

Generating Association Rules


Generating Association Rules: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:06<00:00,  6.95s/it]


âœ” Generated 725764 association rules.

Processing Rules (adding 'rule_length' column)...


Processing Rules: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 725764/725764 [00:02<00:00, 303697.83it/s]


ðŸ“‹ Sample Rules (first 10):
              antecedents           consequents  antecedent support  \
0              (airflow)       (communication)            0.023651   
1              (airflow)  (data visualization)            0.023651   
2              (airflow)         (engineering)            0.023651   
3              (airflow)              (python)            0.023651   
4              (airflow)                 (sql)            0.023651   
5              (airflow)             (tableau)            0.023651   
6  (attention to detail)       (communication)            0.171081   
7  (attention to detail)  (data visualization)            0.171081   
8  (attention to detail)               (excel)            0.171081   
9  (attention to detail)     (problem-solving)            0.171081   

   consequent support   support  confidence      lift  leverage  conviction  \
0            0.546098  0.013737    0.580838  1.063615  0.000822    1.082880   
1            0.565359  0.015437    0.65




In [27]:
# --- 5. Format Rules for Power BI ---
# Convert frozensets (which are not directly Power BI friendly) to comma-separated strings.
# Sorting the skills within the string ensures consistency (e.g., "A, B" vs "B, A").
rules['If Skills (Antecedents)'] = rules['antecedents'].apply(lambda x: ', '.join(sorted(list(x))))
rules['Then Skills (Consequents)'] = rules['consequents'].apply(lambda x: ', '.join(sorted(list(x))))

In [28]:
# Select and reorder columns for clarity in Power BI.
rules_for_pb = rules[[
    'If Skills (Antecedents)',
    'Then Skills (Consequents)',
    'support',
    'confidence',
    'lift'
]]

In [29]:
# Rename columns to be more user-friendly in Power BI.
rules_for_pb = rules_for_pb.rename(columns={
    'support': 'Support',
    'confidence': 'Confidence',
    'lift': 'Lift'
})

In [30]:
print("Formatted Rules for Power BI (first 10):\n", rules_for_pb.head(10))
print("\n" + "="*50 + "\n")

Formatted Rules for Power BI (first 10):
   If Skills (Antecedents) Then Skills (Consequents)   Support  Confidence  \
0                 airflow             communication  0.013737    0.580838   
1                 airflow        data visualization  0.015437    0.652695   
2                 airflow               engineering  0.013454    0.568862   
3                 airflow                    python  0.018836    0.796407   
4                 airflow                       sql  0.023085    0.976048   
5                 airflow                   tableau  0.013171    0.556886   
6     attention to detail             communication  0.126753    0.740894   
7     attention to detail        data visualization  0.109616    0.640728   
8     attention to detail                     excel  0.135533    0.792219   
9     attention to detail           problem-solving  0.087381    0.510762   

       Lift  
0  1.063615  
1  1.154478  
2  1.671551  
3  1.678636  
4  1.373978  
5  1.612869  
6  1.356705 

In [31]:
# --- 6. Save the Rules to a CSV File ---
# This CSV file will be imported directly into Power BI.
output_file_path = '../model/association_rules.csv'
rules_for_pb.to_csv(output_file_path, index=False)

print(f"Association rules generated and saved to {output_file_path}")

Association rules generated and saved to ../model/association_rules.csv
