| Column Name          | Data Type   | Description                                   |
|----------------------|-------------|-----------------------------------------------|
| userId               | Integer     | Unique identifier for each user               |
| courseId             | Integer     | Unique identifier for each course             |
| progressStatus       | String      | Current status of the user in the course (e.g., Completed, In-progress, Not Started) |
| totalCourses         | Integer     | Total courses completed by the user           |
| averageProgress      | Float       | Average progress of the user across courses   |
| skillCount           | Integer     | Count of unique skills the user has           |
| popularityScore      | Integer     | Popularity score of the course                |
| difficulty           | String      | Difficulty level of the course                |
| language             | String      | Language of the course                        |
| totalTime            | Integer     | Total time to complete the course (in hours) |
| totalModules         | Integer     | Total modules in the course                   |
| skillRelevanceScore  | Float       | Score indicating how relevant the user's skills are to the course (could be a calculated value based on matching skills) |
| has_certificate      | Boolean     | Whether the user has a certificate for the course (used as the label for classification) |


In [6]:
import pandas as pd

# Toy input tables: user -> skill (with level), designation -> skill,
# course -> skill (with level), and the skill id -> name lookup.
UserSkill = pd.DataFrame({
    'userId': [1, 2, 3],
    'skillId': [101, 102, 103],
    'level': ['Beginner', 'Intermediate', 'Advanced'],
})

DesignationSkill = pd.DataFrame({
    'designationId': [1, 2],
    'skillId': [101, 104],
})

CourseSkill = pd.DataFrame({
    'courseId': [201, 202, 300, 203],
    'skillId': [102, 105, 105, 105],
    'level': ['Beginner', 'Intermediate', 'Intermediate', 'Intermediate'],
})

Skill = pd.DataFrame({
    'id': [101, 102, 103, 104, 105],
    'name': ['Python', 'Java', 'C++', 'SQL', 'JavaScript'],
})

# Every source table is left-joined to Skill to pick up the skill name and is
# then reshaped to one shared layout: userId, skillId, skillName, userSkillLevel.
# NOTE(review): designation ids and course ids are both stored under the
# 'userId' column below, so the combined frame mixes three kinds of id in one
# column -- confirm this is intended before using it downstream.
user_skills_merged = (
    UserSkill.merge(Skill, left_on='skillId', right_on='id', how='left')
    .loc[:, ['userId', 'id', 'name', 'level']]
    .rename(columns={'id': 'skillId', 'name': 'skillName', 'level': 'userSkillLevel'})
)

# Designation skills carry no level, so the level column is filled with None.
designation_skills_merged = (
    DesignationSkill.merge(Skill, left_on='skillId', right_on='id', how='left')
    .loc[:, ['designationId', 'id', 'name']]
    .rename(columns={'designationId': 'userId', 'id': 'skillId', 'name': 'skillName'})
    .assign(userSkillLevel=None)
)

# Course skills keep their own level column and also mirror it into
# userSkillLevel so the frame lines up with the other two.
course_skills_merged = (
    CourseSkill.merge(Skill, left_on='skillId', right_on='id', how='left')
    .loc[:, ['courseId', 'skillId', 'name', 'level']]
    .rename(columns={'courseId': 'userId', 'name': 'skillName', 'level': 'courseSkillLevel'})
    .assign(userSkillLevel=lambda frame: frame['courseSkillLevel'])
)

# Stack the three frames with a fresh 0..n-1 index, then discard the
# course-only helper column.
final_result = pd.concat(
    [user_skills_merged, designation_skills_merged, course_skills_merged],
    ignore_index=True,
).drop(columns=['courseSkillLevel'], errors='ignore')

print(final_result)


   userId  skillId   skillName userSkillLevel
0       1      101      Python       Beginner
1       2      102        Java   Intermediate
2       3      103         C++       Advanced
3       1      101      Python           None
4       2      104         SQL           None
5     201      102        Java       Beginner
6     202      105  JavaScript   Intermediate
7     300      105  JavaScript   Intermediate
8     203      105  JavaScript   Intermediate


Setting Up Data Directories

In [3]:
import pandas as pd

# Sample data creation (replace this with your actual data loading)
skills = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Python', 'Java', 'SQL'],
})

user_skills = pd.DataFrame({
    'userId': [1, 1, 2, 3, 4],
    'skillId': [1, 2, 1, 3, 1],
    'level': ['Beginner', 'Intermediate', 'Beginner', 'Expert', 'Beginner'],
})

designation_skills = pd.DataFrame({
    'designationId': [1, 1, 2],
    'skillId': [1, 2, 3],
})

course_skills = pd.DataFrame({
    'courseId': [101, 102, 103],
    'skillId': [1, 2, 3],
    'level': ['Beginner', 'Intermediate', 'Advanced'],
})

# Sample User Data (replace this with your actual data loading)
users = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'username': ['user1', 'user2', 'user3', 'user4'],
    'designationId': [1, 1, 2, 2],
})

# Step 1: Merge DataFrames
merged = user_skills.merge(skills, left_on='skillId', right_on='id', suffixes=('', '_skill'))
merged = merged.merge(designation_skills, left_on='skillId', right_on='skillId', suffixes=('', '_designation'))
merged = merged.merge(course_skills, left_on='skillId', right_on='skillId', suffixes=('', '_course'))
merged = merged.merge(users, left_on='userId', right_on='id', suffixes=('', '_user'))

# Step 2: Create a fact table
fact_table = merged[['userId', 'skillId', 'level', 'designationId', 'courseId', 'name']]  # Include necessary fields

# Rename columns for clarity
fact_table.rename(columns={'name': 'skill_name', 'level': 'user_level'}, inplace=True)

# Debug: Check the fact table
print("Skill Fact Table:")
print(fact_table)

# Step 3: Export to CSV
fact_table.to_csv('skill_fact_table.csv', index=False)


Skill Fact Table:
   userId  skillId    user_level  designationId  courseId skill_name
0       1        1      Beginner              1       101     Python
1       1        2  Intermediate              1       102       Java
2       2        1      Beginner              1       101     Python
3       3        3        Expert              2       103        SQL
4       4        1      Beginner              1       101     Python


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_table.rename(columns={'name': 'skill_name', 'level': 'user_level'}, inplace=True)


Check the number of nulls and duplicates

In [39]:
def count_nulls(file_path):
    """Return the number of missing values in each column of a CSV file.

    Args:
        file_path: Path of the CSV file to load with ``pd.read_csv``.

    Returns:
        A pandas Series indexed by column name holding the per-column count
        of null entries.
    """
    frame = pd.read_csv(file_path)
    missing_mask = frame.isna()
    return missing_mask.sum()

def count_duplicates(file_path):
    """Return how many rows of a CSV file repeat an earlier row.

    Args:
        file_path: Path of the CSV file to load with ``pd.read_csv``.

    Returns:
        The number of rows flagged by ``DataFrame.duplicated()``, i.e. rows
        that match an earlier row on every column.
    """
    frame = pd.read_csv(file_path)
    repeated_rows = frame.duplicated()
    return repeated_rows.sum()

In [41]:
# Source tables whose raw CSV extracts are profiled in this cell.
tables = [
    "User",
    "Designation",
    "Course",
    "EmployeeProgress",
    "Skill",
    "UserSkill",
    "DesignationSkill",
    "CourseSkill",
]

# table name -> per-column null counts / number of duplicated rows
null_counts_dict = {}
duplicate_counts_dict = {}

# Each extract is looked up as raw_<table>.csv inside RAW_DATA_DIR. The two
# helpers each load the file themselves, so every CSV is read twice.
for table in tables:
    file_path = os.path.join(RAW_DATA_DIR, f'raw_{table}.csv')
    null_counts_dict[table] = count_nulls(file_path)
    duplicate_counts_dict[table] = count_duplicates(file_path)

# Report: null counts for every table first, then the duplicate counts.
for table, counts in null_counts_dict.items():
    print(f"Null counts for {table}:\n{counts}\n")

for table, count in duplicate_counts_dict.items():
    print(f"Duplicate counts for {table}: {count}\n")

Null counts for User:
id               0
username         0
passwordHash     0
email            0
gender           0
role             0
designationId    1
phoneNumber      0
createdAt        0
updatedAt        0
dtype: int64

Null counts for Designation:
id      0
name    0
dtype: int64

Null counts for Course:
course_id       0
title           0
url             0
shortIntro      0
difficulty      0
language        0
totalTime       0
totalModules    0
created_at      0
updated_at      0
dtype: int64

Null counts for EmployeeProgress:
progress_id            0
userId                 0
courseId               0
progressStatus         0
last_updated           0
modulesCompleted       0
certificateProof    1739
dtype: int64

Null counts for Skill:
id      0
name    0
dtype: int64

Null counts for UserSkill:
id         0
userId     0
skillId    0
level      0
dtype: int64

Null counts for DesignationSkill:
id               0
designationId    0
skillId          0
dtype: int64

Null counts for

Define required columns for each table

In [42]:
# Per-table configuration used by the preprocessing step:
#   "file"    - CSV file name for the table (always "<table name>.csv")
#   "columns" - the columns to keep from that table
# Entries are listed, and therefore iterated, in this exact order.
tables_reqd_columns = {
    table_name: {"file": f"{table_name}.csv", "columns": required_columns}
    for table_name, required_columns in (
        ("User", ["id", "username", "designationId", "createdAt"]),
        ("Designation", ["id", "name"]),
        (
            "Course",
            ["course_id", "title", "difficulty", "language", "totalTime", "totalModules", "created_at"],
        ),
        (
            "EmployeeProgress",
            ["userId", "courseId", "progressStatus", "last_updated", "modulesCompleted", "certificateProof"],
        ),
        ("Skill", ["id", "name"]),
        ("UserSkill", ["userId", "skillId", "level"]),
        ("DesignationSkill", ["designationId", "skillId"]),
        ("CourseSkill", ["courseId", "skillId", "level"]),
    )
}


Data Preprocessing: Removing Nulls, Selecting Required Columns, and Type Conversion

In [49]:
cleaned_dfs = {}

# Raw dtype (as reported by pandas) -> dtype it is converted to.
# Columns of any other dtype are left untouched.
dtype_targets = {'object': 'category', 'int64': 'Int64'}

# Tables are processed in the order they appear in 'tables_reqd_columns'.
# 'valid_user_ids' is only assigned while handling "User", so "User" has to
# come before "EmployeeProgress" and "UserSkill" in that mapping.
for model, info in tables_reqd_columns.items():
    raw_path = os.path.join(RAW_DATA_DIR, f'raw_{info["file"]}')

    # Load the raw extract and keep an independent copy of the required columns
    table = pd.read_csv(raw_path)[info["columns"]].copy()

    # Convert data types column by column
    for col in table.columns:
        target = dtype_targets.get(str(table[col].dtype))
        if target is not None:
            table[col] = table[col].astype(target)

    is_user_table = model == "User"

    # Only the User table has its rows with nulls removed
    if is_user_table:
        table = table.dropna()

    # designationId is stored as a nullable integer wherever it appears
    if 'designationId' in table.columns:
        table['designationId'] = table['designationId'].astype('Int64')

    # User ids that survived cleaning; used to filter the dependent tables
    if is_user_table:
        valid_user_ids = table['id'].unique()

    # Replace the certificate proof with a 0/1 flag telling whether it was present
    if model == "EmployeeProgress":
        certificate_flag = table['certificateProof'].notna().astype(int)
        table = table.assign(has_certificate=certificate_flag).drop(columns=['certificateProof'])

    # Keep only rows that reference a user still present in the cleaned User table
    if model in ("EmployeeProgress", "UserSkill") and 'userId' in table.columns:
        table = table[table['userId'].isin(valid_user_ids)]

    cleaned_dfs[model] = table

Exporting Cleaned Data to CSV

In [50]:
# Write every cleaned table to TRANSFORMED_DATA_DIR as prep_<table>.csv,
# without the DataFrame index.
for model in cleaned_dfs:
    output_path = os.path.join(TRANSFORMED_DATA_DIR, f"prep_{model}.csv")
    cleaned_dfs[model].to_csv(output_path, index=False)

print("All data cleaned and saved to the transformed folder!")


All data cleaned and saved to the transformed folder!


In [51]:
# List the columns kept for each cleaned table.
for model in cleaned_dfs:
    print(model, cleaned_dfs[model].columns)

User Index(['id', 'username', 'designationId', 'createdAt'], dtype='object')
Designation Index(['id', 'name'], dtype='object')
Course Index(['course_id', 'title', 'difficulty', 'language', 'totalTime',
       'totalModules', 'created_at'],
      dtype='object')
EmployeeProgress Index(['userId', 'courseId', 'progressStatus', 'last_updated',
       'modulesCompleted', 'has_certificate'],
      dtype='object')
Skill Index(['id', 'name'], dtype='object')
UserSkill Index(['userId', 'skillId', 'level'], dtype='object')
DesignationSkill Index(['designationId', 'skillId'], dtype='object')
CourseSkill Index(['courseId', 'skillId', 'level'], dtype='object')


In [23]:
# import pandas as pd

# # Sample data creation (replace this with your actual data loading)
# users = pd.DataFrame({
#     'id': [1, 2, 3, 4],
#     'username': ['user1', 'user2', 'user3', 'user4'],
#     'email': ['email1', 'email2', 'email3', 'email4'],
#     'role': ['admin', 'user', 'user', 'user'],
#     'designationId': [1, 1, 2, 2],
#     'createdAt': pd.to_datetime(['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01']),
#     'updatedAt': pd.to_datetime(['2022-05-01', '2022-05-02', '2022-05-03', '2022-05-04']),
# })

# designations = pd.DataFrame({
#     'id': [1, 2],
#     'name': ['Manager', 'Developer'],
# })

# skills = pd.DataFrame({
#     'id': [1, 2, 3],
#     'name': ['Python', 'Java', 'SQL'],
# })

# user_skills = pd.DataFrame({
#     'userId': [1, 1, 2, 3, 4],
#     'skillId': [1, 2, 1, 3, 1],
#     'level': ['Beginner', 'Intermediate', 'Beginner', 'Expert', 'Beginner'],
# })

# # Step 1: Merge DataFrames
# merged = users.merge(designations, left_on='designationId', right_on='id', suffixes=('', '_designation'))
# merged = merged.merge(user_skills, left_on='id', right_on='userId', suffixes=('', '_user'))
# merged = merged.merge(skills, left_on='skillId', right_on='id', suffixes=('', '_skill'))

# # Step 2: Create a fact table
# fact_table = merged[['name', 'name_skill', 'userId']]  # Keep designation name, skill name, and userId

# # Debug: Check the fact table
# print("Fact Table:")
# print(fact_table)

# # Step 3: Export to CSV
# fact_table.to_csv('designation_skill_fact_table.csv', index=False)


Fact Table:
        name name_skill  userId
0    Manager     Python       1
1    Manager       Java       1
2    Manager     Python       2
3  Developer        SQL       3
4  Developer     Python       4


In [52]:
# Rebuild the skill fact table.
# This cell reads `user_skills`, `skills`, `designation_skills`,
# `course_skills` and `users`; they must already be defined in the session,
# otherwise the merges below fail with a NameError.

# Step 1: Merge DataFrames (inner joins). The suffixes keep the left-hand
# column names unchanged, so 'level', 'designationId' and 'name' still refer
# to the columns that entered the merge chain first.
merged = user_skills.merge(skills, left_on='skillId', right_on='id', suffixes=('', '_skill'))
merged = merged.merge(designation_skills, on='skillId', suffixes=('', '_designation'))
merged = merged.merge(course_skills, on='skillId', suffixes=('', '_course'))
merged = merged.merge(users, left_on='userId', right_on='id', suffixes=('', '_user'))

# Step 2: Create a fact table with clearer column names.
# rename() returns a new DataFrame here; renaming in place on a column slice
# of `merged` triggers pandas' SettingWithCopyWarning.
fact_table = merged[
    ['userId', 'skillId', 'level', 'designationId', 'courseId', 'name']
].rename(columns={'name': 'skill_name', 'level': 'user_level'})

# Debug: Check the fact table
print("Skill Fact Table:")
print(fact_table)

NameError: name 'course_skills' is not defined