In [13]:
import pandas as pd
import os

# ------------------------------------------------------------------------------------
# Build the student-affairs knowledge base from the scraped CSV files.
#
# The reshaping and cleaning logic lives in two small helpers so it can be reused
# and checked without touching the file system; the script steps below call them.
# ------------------------------------------------------------------------------------
KB_COLUMNS = ["question", "answer", "source_url", "source_type"]


def resources_to_kb(resources: pd.DataFrame) -> pd.DataFrame:
    """Reshape portal resources into the shared question/answer layout.

    Reads the ``title``, ``summary`` and ``detail_url`` columns and returns a
    frame with the ``KB_COLUMNS`` layout, tagged as ``"portal_resource"``.

    Missing values are deliberately left as missing here. Casting with
    ``astype(str)`` at this stage can turn them into the literal text "nan",
    which the empty-row filter in ``clean_kb`` would then fail to drop.
    """
    return pd.DataFrame({
        "question": resources["title"],
        "answer": resources["summary"],
        "source_url": resources["detail_url"],
        "source_type": "portal_resource",
    })


def clean_kb(kb: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``kb``; the input frame is not modified.

    Steps:
      * fill missing questions/answers with "" and strip surrounding whitespace
      * drop rows whose question or answer is empty
      * drop duplicate (question, answer) pairs, keeping the first occurrence
      * renumber the index 0..n-1 so labels match row positions
    """
    cleaned = kb.copy()
    for column in ("question", "answer"):
        # fillna first, then cast: missing cells become "" (not "nan"), and
        # non-string cells become text instead of being lost by the .str accessor.
        cleaned[column] = cleaned[column].fillna("").astype(str).str.strip()

    has_text = (cleaned["question"] != "") & (cleaned["answer"] != "")
    cleaned = cleaned[has_text].drop_duplicates(subset=["question", "answer"])
    return cleaned.reset_index(drop=True)


# `__name__` is "__main__" when this cell runs in a notebook kernel or as a script,
# so the steps below still execute there and their variables stay global; the guard
# only keeps the file reads/writes from firing if the helpers above are imported.
if __name__ == "__main__":
    # --------------------------------------------------------------------------------
    # 1) Check where we are (helpful if paths break)
    # --------------------------------------------------------------------------------
    print("Current working directory:", os.getcwd())

    # --------------------------------------------------------------------------------
    # 2) Load all your scraped CSV files
    #    NOTE: paths assume this notebook is in:  src/dataCleaningProcessing/
    #          and the raw CSVs are in:          data/raw/
    #          So we go: ../../data/raw/<file>.csv
    # --------------------------------------------------------------------------------
    df_resources = pd.read_csv("../../data/raw/success_portal_resources.csv")
    df_career    = pd.read_csv("../../data/raw/career_centre_faq.csv")
    df_orient    = pd.read_csv("../../data/raw/orientation_faq.csv")
    df_rights    = pd.read_csv("../../data/raw/student_rights_faq.csv")

    print("Shapes:")
    print("  resources:", df_resources.shape)
    print("  career   :", df_career.shape)
    print("  orient   :", df_orient.shape)
    print("  rights   :", df_rights.shape)

    # --------------------------------------------------------------------------------
    # 3) Convert success_portal_resources into Question/Answer format
    # --------------------------------------------------------------------------------
    df_resources_kb = resources_to_kb(df_resources)

    # --------------------------------------------------------------------------------
    # 4) Add source_type to FAQ datasets
    # --------------------------------------------------------------------------------
    df_career["source_type"] = "career_centre_faq"
    df_orient["source_type"] = "orientation_faq"
    df_rights["source_type"] = "student_rights_faq"

    # --------------------------------------------------------------------------------
    # 5) Combine everything into a single knowledge base dataframe
    # --------------------------------------------------------------------------------
    kb_df = pd.concat(
        [
            frame[KB_COLUMNS]
            for frame in (df_career, df_orient, df_rights, df_resources_kb)
        ],
        ignore_index=True,
    )

    print("Combined raw KB shape:", kb_df.shape)

    # --------------------------------------------------------------------------------
    # 6) Basic cleaning and deduplication
    #    (drops rows with an empty question or answer, then duplicate Q&A pairs)
    # --------------------------------------------------------------------------------
    kb_df = clean_kb(kb_df)

    print("After cleaning & deduplication:", kb_df.shape)

    # --------------------------------------------------------------------------------
    # 7) Save final knowledge base CSV into data folder
    # --------------------------------------------------------------------------------
    output_path = "../../data/student_affairs_knowledge_base.csv"
    kb_df.to_csv(output_path, index=False, encoding="utf-8-sig")

    print("\nSaved final knowledge base to:", output_path)
    print("Total Q/A entries:", kb_df.shape[0])
    display(kb_df.head())


Current working directory: c:\Users\aishw\OneDrive\Desktop\AIML\FoundationOfML\FinalProject\CSCN8010_FinalProject\src\dataCleaningProcessing
Shapes:
  resources: (100, 4)
  career   : (14, 3)
  orient   : (17, 3)
  rights   : (13, 3)
Combined raw KB shape: (144, 4)
After cleaning & deduplication: (144, 4)

Saved final knowledge base to: ../../data/student_affairs_knowledge_base.csv
Total Q/A entries: 144


Unnamed: 0,question,answer,source_url,source_type
0,How can I access a Career Centre?,To access services at Community Career Centres...,https://www.conestogac.on.ca/career-centre/faq,career_centre_faq
1,I am a Conestoga student and I can't access th...,Current Conestoga students: contact talenthub@...,https://www.conestogac.on.ca/career-centre/faq,career_centre_faq
2,I am a Conestoga graduate and I can't access t...,Canadian graduates: contact waterloocareercent...,https://www.conestogac.on.ca/career-centre/faq,career_centre_faq
3,How much does a session cost at the Community ...,Career Centre services are free.,https://www.conestogac.on.ca/career-centre/faq,career_centre_faq
4,How can I access funded training?,This is one of the many options to discuss wit...,https://www.conestogac.on.ca/career-centre/faq,career_centre_faq


import