In [2]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import pickle

In [2]:
# Step 1: Load the dataset
import pandas as pd

# Load the dataset (replace file_path with the actual file path)
# file_path = 'output_truncated.csv'
file_path = './mooc_dataset/big_student_clear_third_version.csv'
df = pd.read_csv(file_path)

# Display the first few rows to understand its structure
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416921 entries, 0 to 416920
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         416921 non-null  int64  
 1   institute          416921 non-null  object 
 2   course_id          416921 non-null  object 
 3   year               416921 non-null  int64  
 4   semester           416921 non-null  object 
 5   userid_DI          416921 non-null  object 
 6   viewed             416921 non-null  int64  
 7   explored           416921 non-null  int64  
 8   certified          416921 non-null  int64  
 9   final_cc_cname_DI  416921 non-null  object 
 10  LoE_DI             416921 non-null  object 
 11  gender             393710 non-null  object 
 12  grade              416921 non-null  float64
 13  start_time_DI      416921 non-null  object 
 14  last_event_DI      416921 non-null  object 
 15  nevents            416921 non-null  int64  
 16  nd

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,institute,course_id,year,semester,userid_DI,viewed,explored,certified,final_cc_cname_DI,...,grade,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,age
0,4,HarvardX,PH207x,2012,Fall,MHxPC130313697,0,0,0,India,...,0.0,2012-07-24,2013-07-27,6,3,197757,0,0,0,23
1,6,HarvardX,PH207x,2012,Fall,MHxPC130237753,1,0,0,United States,...,0.0,2012-07-24,2012-12-24,107,8,7,2,0,0,19
2,7,HarvardX,CS50x,2012,Summer,MHxPC130202970,1,0,0,United States,...,0.0,2012-07-24,2013-03-28,8,1,197757,1,0,0,24
3,20,HarvardX,CS50x,2012,Summer,MHxPC130223941,1,0,0,Other Middle East/Central Asia,...,0.0,2012-07-24,2013-07-15,25,2,197757,4,0,0,20
4,22,HarvardX,PH207x,2012,Fall,MHxPC130317399,0,0,0,Australia,...,0.0,2012-07-24,2012-08-25,3,2,197757,0,0,0,32


In [35]:
# Step 2: Preprocess the data to enrich transactions
# Create age group, activity level, and certification status attributes
df['age_group'] = pd.cut(
    df['age'], bins=[0, 18, 25, 35, 50, 100], labels=["<18", "18-25", "26-35", "36-50", "50+"]
)
df['activity_level'] = pd.cut(
    df['ndays_act'], bins=[-1, 5, 15, 30, 100], labels=["Low", "Medium", "High", "Very High"]
)
df['certification_status'] = df['certified'].apply(lambda x: "Certified" if x == 1 else "Not Certified")

# Combine user attributes into a single profile
df['user_profile'] = (
    df['age_group'].astype(str) + ", " +
    df['final_cc_cname_DI'].astype(str) + ", " +
    df['LoE_DI'].astype(str) + ", " +
    df['activity_level'].astype(str) + ", " +
    df['certification_status']
)

# Group data by user ID to create transactions
df_grouped = df.groupby('userid_DI').agg({
    'user_profile': 'first',  # Each user has one profile
    'course_id': list  # List of courses they enrolled in
}).reset_index()

# Combine user profile with courses into enriched transactions
df_grouped['profile_and_courses'] = df_grouped.apply(
    lambda x: [x['user_profile']] + x['course_id'], axis=1
)
# Filter out rows where users study only one course
df_grouped = df_grouped[df_grouped['course_id'].apply(len) > 1]

# Combine user profile with courses into enriched transactions
df_grouped['profile_and_courses'] = df_grouped.apply(
    lambda x: [x['user_profile']] + x['course_id'], axis=1
)

# Extract the enriched transactions list
enriched_transactions_list = df_grouped['profile_and_courses'].tolist()

# Extract the enriched transactions list
enriched_transactions_list = df_grouped['profile_and_courses'].tolist()

# Display a few enriched transactions
enriched_transactions_list[:5]

df_grouped

Unnamed: 0,userid_DI,user_profile,course_id,profile_and_courses
1,MHxPC130000004,"18-25, India, Secondary, High, Certified","[CS50x, ER22x]","[18-25, India, Secondary, High, Certified, CS5..."
5,MHxPC130000011,"18-25, Egypt, Secondary, Very High, Not Certified","[CS50x, 6.00x]","[18-25, Egypt, Secondary, Very High, Not Certi..."
10,MHxPC130000024,"26-35, United States, Bachelor's, Low, Not Cer...","[CB22x, ER22x]","[26-35, United States, Bachelor's, Low, Not Ce..."
11,MHxPC130000026,"18-25, Philippines, Secondary, Low, Not Certified","[ER22x, 14.73x, 7.00x]","[18-25, Philippines, Secondary, Low, Not Certi..."
13,MHxPC130000030,"18-25, Brazil, Secondary, Low, Not Certified","[PH207x, 3.091x]","[18-25, Brazil, Secondary, Low, Not Certified,..."
...,...,...,...,...
335620,MHxPC130597628,"18-25, Other Middle East/Central Asia, Bachelo...","[6.002x, 6.002x, 6.00x]","[18-25, Other Middle East/Central Asia, Bachel..."
335627,MHxPC130597642,"26-35, United States, Secondary, Low, Not Cert...","[CB22x, ER22x, 7.00x]","[26-35, United States, Secondary, Low, Not Cer..."
335628,MHxPC130597645,"18-25, Mexico, Secondary, Low, Not Certified","[14.73x, 7.00x]","[18-25, Mexico, Secondary, Low, Not Certified,..."
335636,MHxPC130597657,"18-25, Bangladesh, Secondary, Low, Not Certified","[14.73x, 7.00x]","[18-25, Bangladesh, Secondary, Low, Not Certif..."


In [36]:
# Step 3: One-hot encode the transactions for association rule mining
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the enriched transactions
te = TransactionEncoder()
te_ary = te.fit(enriched_transactions_list).transform(enriched_transactions_list)
enriched_one_hot_df = pd.DataFrame(te_ary, columns=te.columns_)

# Display the one-hot encoded DataFrame
enriched_one_hot_df.head()



Unnamed: 0,14.73x,"18-25, Australia, Bachelor's, High, Not Certified","18-25, Australia, Bachelor's, Low, Not Certified","18-25, Australia, Bachelor's, Medium, Certified","18-25, Australia, Bachelor's, Medium, Not Certified","18-25, Australia, Bachelor's, Very High, Certified","18-25, Australia, Bachelor's, Very High, Not Certified","18-25, Australia, Master's, Low, Not Certified","18-25, Australia, Master's, Medium, Not Certified","18-25, Australia, Secondary, High, Not Certified",...,PH207x,PH278x,"nan, India, Bachelor's, Low, Not Certified","nan, India, Secondary, Low, Not Certified","nan, India, Secondary, Medium, Not Certified","nan, Other Europe, Bachelor's, Low, Not Certified","nan, United States, Bachelor's, Low, Not Certified","nan, United States, Bachelor's, Medium, Not Certified","nan, United States, Secondary, High, Not Certified","nan, United States, Secondary, Low, Not Certified"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [50]:
# Step 4: Apply Apriori for association rule mining
from mlxtend.frequent_patterns import apriori, association_rules

# Apply Apriori algorithm
frequent_itemsets = apriori(enriched_one_hot_df, min_support=0.001, use_colnames=True)
# frequent_itemsets.to_csv('frequent_itemsets.csv')
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.136615,(14.73x)
1,0.001905,"(18-25, Australia, Bachelor's, Low, Not Certif..."
2,0.001752,"(18-25, Australia, Secondary, Low, Not Certified)"
3,0.001174,"(18-25, Bangladesh, Bachelor's, Low, Not Certi..."
4,0.001922,"(18-25, Bangladesh, Secondary, Low, Not Certif..."
...,...,...
1736,0.001004,"(14.73x, PH278x, 8.02x, ER22x, CB22x, 7.00x, 3..."
1737,0.001055,"(14.73x, PH278x, 6.002x, 6.00x, ER22x, CB22x, ..."
1738,0.001038,"(14.73x, PH278x, 6.002x, 8.02x, ER22x, CB22x, ..."
1739,0.001276,"(14.73x, PH278x, 6.00x, 8.02x, ER22x, CB22x, 7..."


In [51]:
# frequent_itemsets = pd.read_csv('frequent_itemsets.csv')
# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets=len(frequent_itemsets))
rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(14.73x),"(18-25, India, Master's, Low, Not Certified)",0.136615,0.006174,0.001276,0.009338,1.512365,1.0,0.000432,1.003193,0.392390,0.009014,0.003183,0.107975
1,"(18-25, India, Master's, Low, Not Certified)",(14.73x),0.006174,0.136615,0.001276,0.206612,1.512365,1.0,0.000432,1.088225,0.340889,0.009014,0.081072,0.107975
2,(14.73x),"(18-25, Other East Asia, Bachelor's, Low, Not ...",0.136615,0.004065,0.001157,0.008466,2.082634,1.0,0.000601,1.004439,0.602094,0.008290,0.004419,0.146492
3,"(18-25, Other East Asia, Bachelor's, Low, Not ...",(14.73x),0.004065,0.136615,0.001157,0.284519,2.082634,1.0,0.000601,1.206720,0.521961,0.008290,0.171307,0.146492
4,(14.73x),"(18-25, Other Europe, Bachelor's, Low, Not Cer...",0.136615,0.008504,0.001378,0.010085,1.185815,1.0,0.000216,1.001596,0.181493,0.009585,0.001594,0.086042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19103,(6.00x),"(14.73x, PH278x, ER22x, CS50x, CB22x, 7.00x)",0.438028,0.001208,0.001072,0.002446,2.025723,1.0,0.000543,1.001242,0.901022,0.002446,0.001240,0.444885
19104,(CS50x),"(14.73x, PH278x, 6.00x, ER22x, CB22x, 7.00x)",0.247240,0.001667,0.001072,0.004334,2.600131,1.0,0.000659,1.002679,0.817530,0.004324,0.002672,0.323596
19105,(ER22x),"(14.73x, PH278x, 6.00x, CS50x, CB22x, 7.00x)",0.226183,0.001157,0.001072,0.004738,4.096104,1.0,0.000810,1.003598,0.976802,0.004736,0.003585,0.465604
19106,(CB22x),"(14.73x, PH278x, 6.00x, ER22x, CS50x, 7.00x)",0.175463,0.001225,0.001072,0.006107,4.986804,1.0,0.000857,1.004912,0.969600,0.006102,0.004888,0.440554


In [52]:

# Step 3: Filter rules to include user attributes in antecedents and courses in consequents
def is_valid_rule(antecedents, consequents):
    # Check if antecedents include user attributes and consequents are courses
    antecedent_is_profile = any(
        item for item in antecedents
        if "age_group" in item or "Certified" in item or "activity" in item
    )
    consequent_is_course = all(
        item in enriched_one_hot_df.columns and item not in antecedents
        for item in consequents
    )
    return antecedent_is_profile and consequent_is_course

# Apply the filter
filtered_rules = rules[
    rules.apply(
        lambda row: is_valid_rule(row['antecedents'], row['consequents']), axis=1
    )
]

# Sort rules by confidence
rules_with_profile_sorted = filtered_rules.sort_values(by='confidence', ascending=False)




# Filter rules to include only those with user profile attributes in the antecedents
rules_with_profile = rules[rules['antecedents'].apply(
    lambda x: any(item for item in x if "age_group" in item or "Certified" in item)
)]

# Sort rules by confidence
rules_with_profile_sorted = rules_with_profile.sort_values(by='confidence', ascending=False)

# Display the top rules
rules_with_profile_sorted[['antecedents', 'consequents', 'confidence', 'lift', 'support']].head()


Unnamed: 0,antecedents,consequents,confidence,lift,support
710,"(14.73x, 18-25, Unknown/Other, Bachelor's, Low...",(CB22x),0.971831,5.538664,0.001174
772,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.953846,5.436165,0.001055
528,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.835897,4.763951,0.002772
166,"(18-25, Nigeria, Secondary, Low, Not Certified)",(6.00x),0.819277,1.870375,0.001157
1752,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.813187,4.634518,0.001259


In [53]:
rules_with_profile_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 887 entries, 710 to 666
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         887 non-null    object 
 1   consequents         887 non-null    object 
 2   antecedent support  887 non-null    float64
 3   consequent support  887 non-null    float64
 4   support             887 non-null    float64
 5   confidence          887 non-null    float64
 6   lift                887 non-null    float64
 7   representativity    887 non-null    float64
 8   leverage            887 non-null    float64
 9   conviction          887 non-null    float64
 10  zhangs_metric       887 non-null    float64
 11  jaccard             887 non-null    float64
 12  certainty           887 non-null    float64
 13  kulczynski          887 non-null    float64
dtypes: float64(12), object(2)
memory usage: 103.9+ KB


In [54]:
rules_with_profile_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
710,"(14.73x, 18-25, Unknown/Other, Bachelor's, Low...",(CB22x),0.001208,0.175463,0.001174,0.971831,5.538664,1.0,0.000962,29.271061,0.820442,0.006687,0.965837,0.489260
772,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.001106,0.175463,0.001055,0.953846,5.436165,1.0,0.000861,17.864967,0.816950,0.006008,0.944025,0.479928
528,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.003317,0.175463,0.002772,0.835897,4.763951,1.0,0.002190,5.024522,0.792719,0.015752,0.800976,0.425849
166,"(18-25, Nigeria, Secondary, Low, Not Certified)",(6.00x),0.001412,0.438028,0.001157,0.819277,1.870375,1.0,0.000538,3.109577,0.466006,0.002639,0.678413,0.410959
1752,"(26-35, Unknown/Other, Bachelor's, Low, Not Ce...",(CB22x),0.001548,0.175463,0.001259,0.813187,4.634518,1.0,0.000987,4.413698,0.785444,0.007162,0.773433,0.410180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1071,"(18-25, India, Secondary, Low, Not Certified)","(CB22x, 8.02x)",0.046281,0.019628,0.001038,0.022418,1.142145,1.0,0.000129,1.002854,0.130494,0.015994,0.002846,0.037639
1615,"(26-35, United States, Bachelor's, Low, Not Ce...","(PH278x, 6.00x)",0.048373,0.016260,0.001055,0.021800,1.340694,1.0,0.000268,1.005663,0.267035,0.016586,0.005631,0.043327
674,"(18-25, India, Secondary, Low, Not Certified)","(14.73x, 8.02x)",0.046281,0.014815,0.001004,0.021683,1.463629,1.0,0.000318,1.007021,0.332138,0.016700,0.006972,0.044711
1669,"(26-35, United States, Bachelor's, Low, Not Ce...","(ER22x, PH207x)",0.048373,0.013488,0.001038,0.021449,1.590203,1.0,0.000385,1.008135,0.390016,0.017058,0.008070,0.049186


In [55]:
rules_with_profile_sorted.to_csv('rules_with_profile_sorted.csv')

In [61]:
# Step 5: Define a function to recommend courses based on user profile and enrolled courses
def recommend_courses(user_age_group, user_nationality, user_education, user_certification, user_activity, enrolled_courses, rules):
    # Create the user's profile
    user_profile = f"{user_age_group}, {user_nationality}, {user_education}, {user_activity}, {user_certification}"

    # Combine user profile with enrolled courses to form antecedents
    user_antecedents = set([user_profile] + enrolled_courses)

    # Filter rules where antecedents match the user's profile and courses
    matching_rules = rules[rules['antecedents'].apply(
    lambda x: any(item in user_antecedents for item in x)
    )]


    # Sort matching rules by confidence
    matching_rules_sorted = matching_rules.sort_values(by='confidence', ascending=False)

    # Extract recommended courses from consequents
    recommendations = set()
    for consequents in matching_rules_sorted['consequents']:
        recommendations.update(consequents)

    # Remove already enrolled courses from recommendations
    recommendations -= set(enrolled_courses)

    return list(recommendations)


In [62]:
# Example Profiles
examples = [
    {
        "user_age_group": "18-25",
        "user_nationality": "India",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "India",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "36-50",
        "user_nationality": "United States",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "Other Africa",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "18-25",
        "user_nationality": "United States",
        "user_education": "Secondary",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "United States",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "36-50",
        "user_nationality": "India",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "Unknown/Other",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CB22x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "India",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "18-25",
        "user_nationality": "United States",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    }
]

# Loop through the examples and call the recommendation function
for i, example in enumerate(examples, start=1):
    recommended_courses = recommend_courses(
        example["user_age_group"],
        example["user_nationality"],
        example["user_education"],
        example["user_certification"],
        example["user_activity"],
        example["enrolled_courses"],
        rules_with_profile_sorted
    )
    print(f"Example {i}  - Recommended Courses: {recommended_courses}")


Example 1  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '8.MReV', '6.00x', '8.02x', '2.01x', 'ER22x', 'CB22x', '7.00x', '3.091x']
Example 2  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '6.00x', '8.02x', 'ER22x', 'CS50x', '3.091x', '7.00x']
Example 3  - Recommended Courses: ['14.73x', 'PH278x', '6.00x', 'ER22x', 'CS50x', 'CB22x', '3.091x', '7.00x']
Example 4  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '6.00x', 'CS50x', '3.091x']
Example 5  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '8.MReV', '6.00x', '8.02x', 'CB22x', '2.01x', 'ER22x', '3.091x', '7.00x', 'PH207x']
Example 6  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '6.00x', '8.02x', 'ER22x', 'CS50x', 'CB22x', '3.091x', '7.00x']
Example 7  - Recommended Courses: ['6.00x', '3.091x']
Example 8  - Recommended Courses: ['14.73x', 'PH278x', '6.00x', 'ER22x', 'CS50x', '7.00x']
Example 9  - Recommended Courses: ['14.73x', 'PH278x', '6.002x', '6.00x', '8.02x', 'CB22x', 'CS50x', 'ER22x', '3.09