In [7]:
import pandas as pd
import random
import string

# Load the MOOC course dataset
df = pd.read_csv("MOOC.csv")  # adjust path if needed

# Filter relevant courses (CSE/IT/ML/Python)
keywords = ["computer", "cse", "it", "machine learning", "ml", "python", "programming", "data", "ai", "neural", "deep"]

# Short acronyms ("it", "ml", "ai", "cse") must match as whole words only;
# as bare substrings they hit unrelated titles ("with", "html", "training"),
# which lets most of the catalogue through the filter. Longer keywords keep
# substring matching so plurals and compounds ("computers", "database") still
# match. The keywords contain no regex metacharacters, so no escaping is needed.
# NOTE: the English pronoun "it" as a standalone word still matches.
short_keywords = [kw for kw in keywords if len(kw) <= 3]
long_keywords = [kw for kw in keywords if len(kw) > 3]
# Non-capturing group: pandas warns when a str.contains pattern has capture groups.
keyword_pattern = r"\b(?:" + "|".join(short_keywords) + r")\b|" + "|".join(long_keywords)
df_filtered = df[df["Course Name"].str.lower().str.contains(keyword_pattern, na=False, regex=True)]

# Prepare lists: distinct course names and distinct skill strings of the filtered courses
course_titles = df_filtered["Course Name"].dropna().unique().tolist()
all_skills = df_filtered["all_skill"].dropna().unique().tolist()

# Static data: value pools for the randomly assigned user attributes
occupations = ["Student", "Software Engineer", "Data Scientist", "IT Professional", "Researcher"]
countries = ["India", "USA", "Canada", "Germany", "UK", "Australia"]
difficulty_levels = ["Beginner", "Intermediate", "Advanced"]
engagement_levels = ["High", "Medium", "Low"]

# Helper: generate random name
def generate_name():
    """Return a random "First Last" pseudo-name made of ASCII letters.

    Each part is one uppercase letter followed by random lowercase letters:
    3-6 of them for the first part, 4-7 for the last part.
    """
    def _capitalised_word(min_tail, max_tail):
        # Draw order (initial, tail length, tail letters) is fixed so that a
        # seeded RNG keeps producing the same sequence of names.
        initial = random.choices(string.ascii_uppercase, k=1)[0]
        tail_length = random.randint(min_tail, max_tail)
        tail = random.choices(string.ascii_lowercase, k=tail_length)
        return initial + ''.join(tail)

    first = _capitalised_word(3, 6)
    last = _capitalised_word(4, 7)
    return first + " " + last

# Helper: generate email
def generate_email(name):
    """Build a pseudo e-mail address from *name*.

    The local part is the lowercased name with every space turned into a dot,
    followed by a random three-digit number (100-999); the domain is picked at
    random from a fixed list of three providers.
    """
    providers = ["gmail.com", "yahoo.com", "outlook.com"]
    local_part = ".".join(name.lower().split(" "))
    # Domain is drawn before the numeric suffix; keep this order so seeded
    # runs stay reproducible.
    provider = random.choice(providers)
    suffix = random.randint(100, 999)
    return local_part + str(suffix) + "@" + provider

# Build the synthetic user table, one dict per user.
# NOTE(review): the loop below emits 100,000 rows, but the output file name
# ends in "_10000" and the 5-digit zero padding of User_ID overflows on the
# last row ("U100000"). Confirm which count is intended.
users = []

# Loop-invariant facts about the course pool.
goal_subjects = ['AI', 'Python', 'Data Science', 'Programming', 'Software Development']
has_three_titles = len(course_titles) >= 3
has_ten_titles = len(course_titles) >= 10

for seq in range(1, 100001):
    # The order of the random draws below is deliberate: it keeps the output
    # reproducible under a fixed seed.
    full_name = generate_name()
    address = generate_email(full_name)
    job = random.choice(occupations)
    level = random.choice(difficulty_levels)
    home_country = random.choice(countries)
    courses_per_month = random.randint(1, 10)
    activity = random.choice(engagement_levels)

    # With too few titles to sample from, fall back to the whole title list.
    if has_three_titles:
        preferred = random.sample(course_titles, k=3)
    else:
        preferred = course_titles

    # Skills: the first five whitespace-separated tokens of one random skill string.
    if all_skills:
        acquired = random.choice(all_skills).split()[:5]
    else:
        acquired = ["Python"]

    if has_ten_titles:
        history = random.sample(course_titles, k=10)
    else:
        history = course_titles

    if has_three_titles:
        clicked = random.sample(course_titles, k=3)
    else:
        clicked = course_titles

    goal_subject = random.choice(goal_subjects)

    # Key order defines the column order of the resulting CSV.
    record = {
        "User_ID": f"U{seq:05}",
        "Name": full_name,
        "Email_ID": address,
        "Occupation": job,
        "Preferred Difficulty Level": level,
        "Preferred Topics": ", ".join(preferred),
        "Learning Goals": f"To improve knowledge in {goal_subject}",
        "Previous Learning History (Courses Completed)": ", ".join(history),
        "Skills Acquired": ", ".join(acquired),
        "Click History (Top 3 Courses Clicked)": ", ".join(clicked),
        "Engagement Level": activity,
        "Learning Frequency (Courses per Month)": courses_per_month,
        "Country": home_country,
    }
    users.append(record)

# Write the table to disk without the pandas index column
user_df = pd.DataFrame(users)
user_df.to_csv("synthetic_mooc_user_dataset_10000.csv", index=False)

print("✅ Dataset created without faker.")


✅ Dataset created without faker.
