In [16]:
# pip install faker bcrypt pandas

In [1]:
import pandas as pd
import bcrypt
import random
from faker import Faker

fake = Faker()

NUM_USERS = 500
NUM_COURSES = 400
DESIGNATIONS = [
    "Software Developer",
    "Data Engineer",
    "HR",
    "Sales Executive",
    "IT Specialist",
    "Customer Support",
    "Project Manager",
    "Financial Analyst",
]
NUM_DESIGNATIONS = len(DESIGNATIONS)
PROGRESS_ENTRIES_PER_USER = 5
PASSWORD = "1245"
DOMAIN = "@mgcrew.com"


In [2]:
# Generate Designations
designation_df = pd.DataFrame({
    'id': range(1, NUM_DESIGNATIONS + 1),
    'name': DESIGNATIONS
})
designation_df.to_csv('designations.csv', index=False)
designation_df.head(8)  


Unnamed: 0,id,name
0,1,Software Developer
1,2,Data Engineer
2,3,HR
3,4,Sales Executive
4,5,IT Specialist
5,6,Customer Support
6,7,Project Manager
7,8,Financial Analyst


# Generate Courses

In [3]:
courses = []
for _ in range(NUM_COURSES):
    # A random createdAt date within the last 2 years
    createdAt = fake.date_time_between(start_date='-2y', end_date='now')
    
    # An updatedAt date to be later than createdAt
    updatedAt = fake.date_time_between(start_date=createdAt, end_date='now')
    
    courses.append({
        'id': len(courses) + 1,
        'title': fake.catch_phrase(),
        'url': fake.url(),
        'shortIntro': fake.text(max_nb_chars=200),
        'difficulty': random.choice(['Beginner', 'Intermediate', 'Advanced']),
        'language': random.choice(['English', 'Spanish', 'French', 'German', 'Hindi', 'Tamil', 'Japanese']),
        'totalTime': random.randint(30, 500), 
        'totalModules': random.randint(5, 30),
        'createdAt': createdAt,
        'updatedAt': updatedAt
    })

course_df = pd.DataFrame(courses)
course_df.to_csv('courses.csv', index=False)
course_df.head()

Unnamed: 0,id,title,url,shortIntro,difficulty,language,totalTime,totalModules,createdAt,updatedAt
0,1,Total asymmetric knowledgebase,https://murray-foley.com/,Family member letter. Guy side over reflect. S...,Advanced,French,65,9,2023-08-19 03:05:31,2023-10-04 19:01:56
1,2,Organized intangible flexibility,http://www.walker.info/,Number sometimes take would page. Most Democra...,Beginner,Spanish,462,5,2023-10-10 11:16:17,2024-01-19 19:52:28
2,3,Total interactive archive,http://www.booth-garcia.net/,Serious research along last. Forward hundred p...,Intermediate,Spanish,390,28,2023-09-29 16:06:59,2024-03-22 02:03:10
3,4,Face-to-face 5thgeneration concept,http://mullins-beard.com/,Similar something plan which decade success. S...,Advanced,English,409,18,2024-03-27 09:35:17,2024-06-21 14:59:04
4,5,Organized hybrid customer loyalty,http://www.barnes.com/,Dog apply name list. Civil indicate assume ski...,Advanced,Spanish,177,27,2022-12-22 12:49:43,2023-12-12 15:49:53


# Generate Users

In [4]:
from datetime import datetime, timedelta

users = []

# Get the current date and calculate the range for 2.5 years
end_date = datetime.now()
start_date = end_date - timedelta(days=2.5 * 365)

# First user with specific details
first_user_password_hash = bcrypt.hashpw(PASSWORD.encode('utf-8'), bcrypt.gensalt(rounds=10)).decode('utf-8')
first_user_created_at = fake.date_time_between(start_date=start_date, end_date=end_date)
first_user_updated_at = fake.date_time_between(start_date=first_user_created_at, end_date=end_date)

first_user = {
    'id': 1,
    'username': "Lara",
    'passwordHash': first_user_password_hash,
    'email': "lara@g.co",
    'gender': "Female",
    'role': "admin",
    'designationId': None,
    'phoneNumber': "9874563210",
    'createdAt': first_user_created_at,
    'updatedAt': first_user_updated_at
}
users.append(first_user)

# Generate remaining users
for i in range(1, NUM_USERS):
    username = fake.name()
    email = f"{fake.user_name()}{i}{DOMAIN}"
    salt = bcrypt.gensalt(rounds=10)
    hashed_password = bcrypt.hashpw(PASSWORD.encode('utf-8'), salt).decode('utf-8')

    # Create createdAt and updatedAt timestamps
    created_at = fake.date_time_between(start_date=start_date, end_date=end_date)
    updated_at = fake.date_time_between(start_date=created_at, end_date=end_date)

    users.append({
        'id': i + 1,
        'username': username,
        'passwordHash': hashed_password,
        'email': email,
        'gender': random.choice(['Male', 'Female', 'Others']),
        'role': 'employee',
        'designationId': random.randint(1, NUM_DESIGNATIONS),
        'phoneNumber': fake.phone_number()[:20],
        'createdAt': created_at,
        'updatedAt': updated_at
    })

# Convert to DataFrame and save to CSV
user_df = pd.DataFrame(users)
user_df.to_csv('users.csv', index=False)
print(user_df.head())


   id          username                                       passwordHash  \
0   1              Lara  $2b$10$EsLebuLbxPNaLanlKFXZreKG7d8BpnbesZEOoz2...   
1   2  Julie Maynard MD  $2b$10$JmF8H/9uFYtaFug2Uf1i/ubdnuWGKUZTMMg6w3n...   
2   3   Jessica Kennedy  $2b$10$DCvXIKWDcRgQo1ZZYX/3EOeE4n9xiJ8NXoZG/tU...   
3   4         Seth Vega  $2b$10$eZwPcrCzA1ZTg9eODq8YnuihajCmwT1/BEKFdGu...   
4   5        Mary Clark  $2b$10$rbpZ0yhkB.4jHPmRY7IvY.6kHaVYyCnNrTsz.7K...   

                       email  gender      role  designationId  \
0                  lara@g.co  Female     admin            NaN   
1       andrew831@mgcrew.com  Female  employee            6.0   
2  christopher862@mgcrew.com  Others  employee            1.0   
3   justinharris3@mgcrew.com  Female  employee            5.0   
4         jmarks4@mgcrew.com  Female  employee            3.0   

            phoneNumber           createdAt           updatedAt  
0            9874563210 2022-05-23 03:07:56 2023-05-17 15:38:22  
1   +1-7

# Generate DesignationSkills

In [11]:

designation_skills = []
for designation_id in range(1, NUM_DESIGNATIONS + 1):
    num_skills = random.randint(1, 10)  
    designation_skill_ids = random.sample(range(1, 200 + 1), num_skills)
    for skill_id in designation_skill_ids:
        designation_skills.append({
            'id': len(designation_skills) + 1,
            'designationId': designation_id,
            'skillId': skill_id
        })
designation_skill_df = pd.DataFrame(designation_skills)
designation_skill_df.to_csv('designation_skills.csv', index=False)
designation_skill_df.head()  # Display sample

Unnamed: 0,id,designationId,skillId
0,1,1,63
1,2,1,184
2,3,1,70
3,4,1,158
4,5,1,179


# Generate CourseSkills

In [12]:

course_skills = []
for course_id in range(1, NUM_COURSES + 1):
    num_skills = random.randint(1, 6)
    course_skill_ids = random.sample(range(1, 200 + 1), num_skills)
    for skill_id in course_skill_ids:
        course_skills.append({
            'id': len(course_skills) + 1,
            'courseId': course_id,
            'skillId': skill_id,
            'level': random.choice(['Beginner', 'Intermediate', 'Advanced'])
        })
course_skill_df = pd.DataFrame(course_skills)
course_skill_df.to_csv('course_skills.csv', index=False)
course_skill_df.head()  


Unnamed: 0,id,courseId,skillId,level
0,1,1,166,Advanced
1,2,2,45,Beginner
2,3,2,75,Intermediate
3,4,2,122,Intermediate
4,5,2,118,Advanced


# Generate Employee Progress

In [13]:

employee_progress = []
user_course_mapping = {}

for user in users:
    num_courses = random.randint(0, 3)
    assigned_courses = random.sample(range(1, NUM_COURSES + 1), num_courses)
    user_course_mapping[user['id']] = assigned_courses
    
    for course_id in assigned_courses:
        total_modules = courses[course_id - 1]['totalModules']
        course_created_at = courses[course_id - 1]['createdAt']
        user_created_at = user['createdAt']
        
        # Set the date range for progress entries
        first_entry_time = fake.date_time_between(start_date=max(course_created_at, user_created_at), end_date=end_date)

        # First progress entry
        modules_completed = random.randint(0, total_modules)
        progress_status = 'completed' if modules_completed == total_modules else 'in_progress'

        employee_progress.append({
            'id': len(employee_progress) + 1,
            'userId': user['id'],
            'courseId': course_id,
            'progressStatus': progress_status,
            'lastUpdated': first_entry_time,
            'modulesCompleted': modules_completed,
            'certificateProof': fake.url() if progress_status == 'completed' else None
        })

        # Subsequent progress entries
        for j in range(1, PROGRESS_ENTRIES_PER_USER):
            previous_entry = employee_progress[-1]
            previous_modules = previous_entry['modulesCompleted']

            # Check if previous_modules is already equal to total_modules
            if previous_modules >= total_modules:
                break  # No further progress can be made

            # Ensure the next modules completed is greater than previous
            next_modules_completed = random.randint(previous_modules + 1, total_modules)

            # Increment time for next entry
            next_entry_time = fake.date_time_between(start_date=previous_entry['lastUpdated'], end_date=end_date)

            # Ensure the next entry time is after course and user creation dates
            next_entry_time = max(next_entry_time, previous_entry['lastUpdated'])
            next_entry_time = max(next_entry_time, course_created_at, user_created_at)

            # Determine the progress status
            progress_status = 'completed' if next_modules_completed == total_modules else 'in_progress'

            employee_progress.append({
                'id': len(employee_progress) + 1,
                'userId': user['id'],
                'courseId': course_id,
                'progressStatus': progress_status,
                'lastUpdated': next_entry_time,
                'modulesCompleted': next_modules_completed,
                'certificateProof': fake.url() if progress_status == 'completed' else None
            })

# Convert to DataFrame and save to CSV
employee_progress_df = pd.DataFrame(employee_progress)
employee_progress_df.to_csv('employee_progress.csv', index=False)
print(employee_progress_df.head())


   id  userId  courseId progressStatus         lastUpdated  modulesCompleted  \
0   1       1       376    in_progress 2024-01-16 00:31:03                16   
1   2       1       376    in_progress 2024-03-26 06:35:43                19   
2   3       1       376      completed 2024-09-20 04:34:24                20   
3   4       2       331    in_progress 2024-01-18 07:29:25                 6   
4   5       2       331      completed 2024-07-30 12:21:29                10   

       certificateProof  
0                  None  
1                  None  
2  http://roberts.info/  
3                  None  
4  https://sellers.net/  


# Generate UserSkills

In [14]:

user_skills = []
for user in users:
    num_skills = random.randint(1, 5)  
    user_skill_ids = random.sample(range(1, 200 + 1), num_skills)
    
    for skill_id in user_skill_ids:
        user_skills.append({
            'id': len(user_skills) + 1,
            'userId': user['id'],
            'skillId': skill_id,
            'level': random.choice(['Beginner', 'Intermediate', 'Advanced'])
        })
    
    # Add skills from completed courses
    completed_courses = [progress for progress in employee_progress if progress['userId'] == user['id'] and progress['progressStatus'] == 'completed']
    for progress in completed_courses:
        course_skills = [skill['skillId'] for skill in course_skill_df.to_dict('records') if skill['courseId'] == progress['courseId']]
        for skill_id in course_skills:
            # Check if the skill is already assigned to the user
            existing_skill = next((skill for skill in user_skills if skill['skillId'] == skill_id and skill['userId'] == user['id']), None)
            new_skill_level = random.choice(['Beginner', 'Intermediate', 'Advanced'])

            if existing_skill:
                # Compare levels and keep the highest one
                if new_skill_level == 'Advanced':
                    existing_skill['level'] = 'Advanced'
                elif new_skill_level == 'Intermediate' and existing_skill['level'] != 'Advanced':
                    existing_skill['level'] = 'Intermediate'
                elif new_skill_level == 'Beginner' and existing_skill['level'] not in ['Intermediate', 'Advanced']:
                    existing_skill['level'] = 'Beginner'
            else:
                # If the skill is not already with the user, add it
                user_skills.append({
                    'id': len(user_skills) + 1,
                    'userId': user['id'],
                    'skillId': skill_id,
                    'level': new_skill_level
                })

user_skill_df = pd.DataFrame(user_skills)
user_skill_df.to_csv('user_skills.csv', index=False)
user_skill_df.head()  


Unnamed: 0,id,userId,skillId,level
0,1,1,79,Beginner
1,2,1,3,Intermediate
2,3,1,151,Advanced
3,4,1,171,Advanced
4,5,1,107,Intermediate
