In [1]:
pip install faker bcrypt pandas

Collecting faker
  Downloading Faker-30.1.0-py3-none-any.whl.metadata (15 kB)
Collecting bcrypt
  Downloading bcrypt-4.2.0-cp39-abi3-win_amd64.whl.metadata (9.9 kB)
Downloading Faker-30.1.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB 991.0 kB/s eta 0:00:02
   ---------------- ----------------------- 0.8/1.8 MB 9.7 MB/s eta 0:00:01
   ---------------------------------------  1.8/1.8 MB 16.7 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 13.0 MB/s eta 0:00:00
Downloading bcrypt-4.2.0-cp39-abi3-win_amd64.whl (151 kB)
   ---------------------------------------- 0.0/151.7 kB ? eta -:--:--
   ---------------------------------------- 151.7/151.7 kB 8.8 MB/s eta 0:00:00
Installing collected packages: bcrypt, faker
Successfully installed bcrypt-4.2.0 faker-30.1.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import bcrypt
import random
from faker import Faker

fake = Faker()

# Constants
NUM_USERS = 100
NUM_COURSES = 150
NUM_SKILLS = 200
DESIGNATIONS = [
    "Software Developer",
    "Data Engineer",
    "HR",
    "Sales Executive",
    "IT Specialist",
    "Customer Support",
    "Project Manager",
    "Financial Analyst",
]
NUM_DESIGNATIONS = len(DESIGNATIONS)
PROGRESS_ENTRIES_PER_USER = 5
PASSWORD = "1245"
DOMAIN = "@shcrew.com"


In [6]:
# Generate Designations
designation_df = pd.DataFrame({
    'id': range(1, NUM_DESIGNATIONS + 1),
    'name': DESIGNATIONS
})
designation_df.to_csv('designations.csv', index=False)
designation_df.head(8)  # Display sample


Unnamed: 0,id,name
0,1,Software Developer
1,2,Data Engineer
2,3,HR
3,4,Sales Executive
4,5,IT Specialist
5,6,Customer Support
6,7,Project Manager
7,8,Financial Analyst


In [7]:
# Generate Courses
courses = []
for _ in range(NUM_COURSES):
    courses.append({
        'id': len(courses) + 1,
        'title': fake.catch_phrase(),
        'url': fake.url(),
        'shortIntro': fake.text(max_nb_chars=200),
        'difficulty': random.choice(['Beginner', 'Intermediate', 'Advanced']),
        'language': random.choice(['English', 'Spanish', 'French', 'German', 'Hindi', 'Tamil', 'Japanese']),
        'totalTime': random.randint(30, 500),  # in minutes
        'totalModules': random.randint(5, 50),
        'createdAt': fake.date_time(),
        'updatedAt': fake.date_time()
    })
course_df = pd.DataFrame(courses)
course_df.to_csv('courses.csv', index=False)
course_df.head()  # Display sample


Unnamed: 0,id,title,url,shortIntro,difficulty,language,totalTime,totalModules,createdAt,updatedAt
0,1,Ergonomic system-worthy product,https://www.holt.biz/,Risk mention son. Officer safe remain common w...,Advanced,English,260,28,1971-07-02 08:32:07,1996-06-06 22:55:57
1,2,Switchable regional architecture,http://www.sweeney.com/,Single operation several couple certain either...,Intermediate,Tamil,495,14,1980-05-11 10:49:39,2023-04-23 02:45:38
2,3,Multi-layered real-time challenge,https://www.klein-poole.com/,Fall knowledge under white high. Movement memb...,Intermediate,German,467,32,2014-03-06 08:00:15,1974-02-01 11:02:31
3,4,Devolved static moderator,https://www.ross.com/,Least opportunity board general practice diffe...,Advanced,Spanish,129,38,1974-04-12 02:56:51,2012-05-15 23:16:23
4,5,Adaptive multi-tasking instruction set,http://www.white-allen.info/,Identify be fast pattern especially smile. Fin...,Intermediate,Japanese,474,13,2024-03-10 23:20:40,1992-10-15 21:22:37


In [8]:
# Generate Skills
skills = [fake.word() for _ in range(NUM_SKILLS)]
skill_df = pd.DataFrame({
    'id': range(1, NUM_SKILLS + 1),
    'name': skills
})
skill_df.to_csv('skills.csv', index=False)
skill_df.head()  # Display sample


Unnamed: 0,id,name
0,1,wait
1,2,short
2,3,theory
3,4,officer
4,5,common


In [9]:
# Generate Users
users = []
for i in range(NUM_USERS):
    username = fake.user_name()
    email = f"{username}{DOMAIN}"
    salt = bcrypt.gensalt(rounds=10)
    hashed_password = bcrypt.hashpw(PASSWORD.encode('utf-8'), salt).decode('utf-8')
    users.append({
        'id': i + 1,
        'username': username,
        'passwordHash': hashed_password,
        'email': email,
        'gender': random.choice(['Male', 'Female', 'Others']),
        'role': random.choice(['admin', 'employee']),
        'designationId': random.randint(1, NUM_DESIGNATIONS),  # Use designations from the list
        'phoneNumber': fake.phone_number(),
        'createdAt': fake.date_time(),
        'updatedAt': fake.date_time()
    })
user_df = pd.DataFrame(users)
user_df.to_csv('users.csv', index=False)
user_df.head()  # Display sample


Unnamed: 0,id,username,passwordHash,email,gender,role,designationId,phoneNumber,createdAt,updatedAt
0,1,anthony88,$2b$10$5CSIsi9s3ldiXwl6U0gH0uHM0uqQrdne0XW1Czf...,anthony88@shcrew.com,Male,admin,5,+1-622-936-3966,1990-08-22 05:33:14,1971-09-17 15:29:41
1,2,raymondchandler,$2b$10$xeg1Pg..3Hjzppa3xKuLwufEjZWqPrcB0fa.y00...,raymondchandler@shcrew.com,Female,employee,8,001-295-395-2724x00667,1986-07-22 00:39:27,2008-09-24 04:21:12
2,3,egray,$2b$10$50NOu8yI195ygwmg6lF51OrLhbtCk/K3cwSGfxg...,egray@shcrew.com,Female,admin,2,001-574-980-5368x249,2022-06-18 08:19:42,1973-05-18 09:18:14
3,4,herreradonald,$2b$10$yJX4gJuoddv/zMJu.V2XOupKFkCATz6SqZX77q6...,herreradonald@shcrew.com,Female,employee,2,677.910.1024,1992-02-13 03:07:35,1988-12-24 06:45:09
4,5,staylor,$2b$10$7tefnX1FJ4KACZPfPbmv7.mVfvEvZ5txPFCct2j...,staylor@shcrew.com,Female,employee,1,(703)567-4848x7829,1971-03-04 09:15:54,2009-05-17 04:52:53


In [10]:
# Generate DesignationSkills
designation_skills = []
for designation_id in range(1, NUM_DESIGNATIONS + 1):
    num_skills = random.randint(1, 5)  # Each designation has between 1 and 5 skills
    designation_skill_ids = random.sample(range(1, NUM_SKILLS + 1), num_skills)
    for skill_id in designation_skill_ids:
        designation_skills.append({
            'id': len(designation_skills) + 1,
            'designationId': designation_id,
            'skillId': skill_id
        })
designation_skill_df = pd.DataFrame(designation_skills)
designation_skill_df.to_csv('designation_skills.csv', index=False)
designation_skill_df.head()  # Display sample


Unnamed: 0,id,designationId,skillId
0,1,1,162
1,2,1,54
2,3,1,90
3,4,1,80
4,5,2,129


In [11]:
# Generate CourseSkills
course_skills = []
for course_id in range(1, NUM_COURSES + 1):
    num_skills = random.randint(1, 5)  # Each course has between 1 and 5 skills
    course_skill_ids = random.sample(range(1, NUM_SKILLS + 1), num_skills)
    for skill_id in course_skill_ids:
        course_skills.append({
            'id': len(course_skills) + 1,
            'courseId': course_id,
            'skillId': skill_id,
            'level': random.choice(['Beginner', 'Intermediate', 'Advanced'])
        })
course_skill_df = pd.DataFrame(course_skills)
course_skill_df.to_csv('course_skills.csv', index=False)
course_skill_df.head()  # Display sample


Unnamed: 0,id,courseId,skillId,level
0,1,1,125,Beginner
1,2,1,145,Beginner
2,3,1,14,Intermediate
3,4,2,63,Advanced
4,5,2,130,Advanced


In [12]:
# Generate EmployeeProgress
employee_progress = []
user_course_mapping = {}  # To track which courses each user is enrolled in

for user in users:
    # Assign a random number of courses to the user
    num_courses = random.randint(1, 3)  # Each user takes between 1 to 3 courses
    assigned_courses = random.sample(range(1, NUM_COURSES + 1), num_courses)
    user_course_mapping[user['id']] = assigned_courses
    
    for course_id in assigned_courses:
        total_modules = courses[course_id - 1]['totalModules']  # Match the course
        # Initial progress
        employee_progress.append({
            'id': len(employee_progress) + 1,
            'userId': user['id'],
            'courseId': course_id,
            'progressStatus': 'not_started',
            'lastUpdated': fake.date_time(),
            'modulesCompleted': 0,
            'certificateProof': fake.url()
        })
        
        # Generate additional progress entries for the course
        for j in range(1, PROGRESS_ENTRIES_PER_USER):
            modules_completed = random.randint(1, total_modules) if j < PROGRESS_ENTRIES_PER_USER - 1 else total_modules
            progress_status = 'completed' if modules_completed == total_modules else 'in_progress'
            employee_progress.append({
                'id': len(employee_progress) + 1,
                'userId': user['id'],
                'courseId': course_id,
                'progressStatus': progress_status,
                'lastUpdated': fake.date_time(),
                'modulesCompleted': modules_completed,
                'certificateProof': fake.url()
            })

employee_progress_df = pd.DataFrame(employee_progress)
employee_progress_df.to_csv('employee_progress.csv', index=False)
employee_progress_df.head()  # Display sample


Unnamed: 0,id,userId,courseId,progressStatus,lastUpdated,modulesCompleted,certificateProof
0,1,1,39,not_started,2016-10-05 12:24:07,0,https://richardson-mayer.com/
1,2,1,39,in_progress,1980-06-15 03:19:50,23,https://wright-garcia.com/
2,3,1,39,in_progress,1995-12-02 04:24:40,9,https://cabrera.com/
3,4,1,39,in_progress,2001-12-25 07:33:53,15,https://www.chan.com/
4,5,1,39,completed,2018-08-28 08:19:44,28,https://www.williams.com/


In [13]:
# Generate UserSkills
user_skills = []
for user in users:
    num_skills = random.randint(1, 5)  # Random skills for the user
    user_skill_ids = random.sample(range(1, NUM_SKILLS + 1), num_skills)
    
    for skill_id in user_skill_ids:
        user_skills.append({
            'id': len(user_skills) + 1,
            'userId': user['id'],
            'skillId': skill_id,
            'level': random.choice(['Beginner', 'Intermediate', 'Advanced'])
        })
    
    # Add skills from completed courses
    completed_courses = [progress for progress in employee_progress if progress['userId'] == user['id'] and progress['progressStatus'] == 'completed']
    for progress in completed_courses:
        course_skills = [skill['skillId'] for skill in course_skill_df.to_dict('records') if skill['courseId'] == progress['courseId']]
        for skill_id in course_skills:
            # Check if the skill is already assigned to the user
            existing_skill = next((skill for skill in user_skills if skill['skillId'] == skill_id and skill['userId'] == user['id']), None)
            new_skill_level = random.choice(['Beginner', 'Intermediate', 'Advanced'])

            if existing_skill:
                # Compare levels and keep the highest one
                if new_skill_level == 'Advanced':
                    existing_skill['level'] = 'Advanced'
                elif new_skill_level == 'Intermediate' and existing_skill['level'] != 'Advanced':
                    existing_skill['level'] = 'Intermediate'
                elif new_skill_level == 'Beginner' and existing_skill['level'] not in ['Intermediate', 'Advanced']:
                    existing_skill['level'] = 'Beginner'
            else:
                # If the skill is not already with the user, add it
                user_skills.append({
                    'id': len(user_skills) + 1,
                    'userId': user['id'],
                    'skillId': skill_id,
                    'level': new_skill_level
                })

user_skill_df = pd.DataFrame(user_skills)
user_skill_df.to_csv('user_skills.csv', index=False)
user_skill_df.head()  # Display sample


Unnamed: 0,id,userId,skillId,level
0,1,1,2,Intermediate
1,2,1,139,Intermediate
2,3,1,146,Beginner
3,4,1,182,Advanced
4,5,1,53,Intermediate
