In [2]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import pandas as pd
import pickle

In [2]:
# Step 1: Load the dataset
import pandas as pd

# Load the dataset (replace file_path with the actual file path)
# file_path = 'output_truncated.csv'
file_path = './mooc_dataset/big_student_clear_third_version.csv'
df = pd.read_csv(file_path)

# Display the first few rows to understand its structure
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416921 entries, 0 to 416920
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         416921 non-null  int64  
 1   institute          416921 non-null  object 
 2   course_id          416921 non-null  object 
 3   year               416921 non-null  int64  
 4   semester           416921 non-null  object 
 5   userid_DI          416921 non-null  object 
 6   viewed             416921 non-null  int64  
 7   explored           416921 non-null  int64  
 8   certified          416921 non-null  int64  
 9   final_cc_cname_DI  416921 non-null  object 
 10  LoE_DI             416921 non-null  object 
 11  gender             393710 non-null  object 
 12  grade              416921 non-null  float64
 13  start_time_DI      416921 non-null  object 
 14  last_event_DI      416921 non-null  object 
 15  nevents            416921 non-null  int64  
 16  nd

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,institute,course_id,year,semester,userid_DI,viewed,explored,certified,final_cc_cname_DI,...,grade,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,age
0,4,HarvardX,PH207x,2012,Fall,MHxPC130313697,0,0,0,India,...,0.0,2012-07-24,2013-07-27,6,3,197757,0,0,0,23
1,6,HarvardX,PH207x,2012,Fall,MHxPC130237753,1,0,0,United States,...,0.0,2012-07-24,2012-12-24,107,8,7,2,0,0,19
2,7,HarvardX,CS50x,2012,Summer,MHxPC130202970,1,0,0,United States,...,0.0,2012-07-24,2013-03-28,8,1,197757,1,0,0,24
3,20,HarvardX,CS50x,2012,Summer,MHxPC130223941,1,0,0,Other Middle East/Central Asia,...,0.0,2012-07-24,2013-07-15,25,2,197757,4,0,0,20
4,22,HarvardX,PH207x,2012,Fall,MHxPC130317399,0,0,0,Australia,...,0.0,2012-07-24,2012-08-25,3,2,197757,0,0,0,32


In [35]:
# Step 2: Preprocess the data to enrich transactions
# Create age group, activity level, and certification status attributes
df['age_group'] = pd.cut(
    df['age'], bins=[0, 18, 25, 35, 50, 100], labels=["<18", "18-25", "26-35", "36-50", "50+"]
)
df['activity_level'] = pd.cut(
    df['ndays_act'], bins=[-1, 5, 15, 30, 100], labels=["Low", "Medium", "High", "Very High"]
)
df['certification_status'] = df['certified'].apply(lambda x: "Certified" if x == 1 else "Not Certified")

# Combine user attributes into a single profile
df['user_profile'] = (
    df['age_group'].astype(str) + ", " +
    df['final_cc_cname_DI'].astype(str) + ", " +
    df['LoE_DI'].astype(str) + ", " +
    df['activity_level'].astype(str) + ", " +
    df['certification_status']
)

# Group data by user ID to create transactions
df_grouped = df.groupby('userid_DI').agg({
    'user_profile': 'first',  # Each user has one profile
    'course_id': list  # List of courses they enrolled in
}).reset_index()

# Combine user profile with courses into enriched transactions
df_grouped['profile_and_courses'] = df_grouped.apply(
    lambda x: [x['user_profile']] + x['course_id'], axis=1
)
# Filter out rows where users study only one course
df_grouped = df_grouped[df_grouped['course_id'].apply(len) > 1]

# Combine user profile with courses into enriched transactions
df_grouped['profile_and_courses'] = df_grouped.apply(
    lambda x: [x['user_profile']] + x['course_id'], axis=1
)

# Extract the enriched transactions list
enriched_transactions_list = df_grouped['profile_and_courses'].tolist()

# Extract the enriched transactions list
enriched_transactions_list = df_grouped['profile_and_courses'].tolist()

# Display a few enriched transactions
enriched_transactions_list[:5]

df_grouped

Unnamed: 0,userid_DI,user_profile,course_id,profile_and_courses
1,MHxPC130000004,"18-25, India, Secondary, High, Certified","[CS50x, ER22x]","[18-25, India, Secondary, High, Certified, CS5..."
5,MHxPC130000011,"18-25, Egypt, Secondary, Very High, Not Certified","[CS50x, 6.00x]","[18-25, Egypt, Secondary, Very High, Not Certi..."
10,MHxPC130000024,"26-35, United States, Bachelor's, Low, Not Cer...","[CB22x, ER22x]","[26-35, United States, Bachelor's, Low, Not Ce..."
11,MHxPC130000026,"18-25, Philippines, Secondary, Low, Not Certified","[ER22x, 14.73x, 7.00x]","[18-25, Philippines, Secondary, Low, Not Certi..."
13,MHxPC130000030,"18-25, Brazil, Secondary, Low, Not Certified","[PH207x, 3.091x]","[18-25, Brazil, Secondary, Low, Not Certified,..."
...,...,...,...,...
335620,MHxPC130597628,"18-25, Other Middle East/Central Asia, Bachelo...","[6.002x, 6.002x, 6.00x]","[18-25, Other Middle East/Central Asia, Bachel..."
335627,MHxPC130597642,"26-35, United States, Secondary, Low, Not Cert...","[CB22x, ER22x, 7.00x]","[26-35, United States, Secondary, Low, Not Cer..."
335628,MHxPC130597645,"18-25, Mexico, Secondary, Low, Not Certified","[14.73x, 7.00x]","[18-25, Mexico, Secondary, Low, Not Certified,..."
335636,MHxPC130597657,"18-25, Bangladesh, Secondary, Low, Not Certified","[14.73x, 7.00x]","[18-25, Bangladesh, Secondary, Low, Not Certif..."


In [36]:
# Step 3: One-hot encode the transactions for association rule mining
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the enriched transactions
te = TransactionEncoder()
te_ary = te.fit(enriched_transactions_list).transform(enriched_transactions_list)
enriched_one_hot_df = pd.DataFrame(te_ary, columns=te.columns_)

# Display the one-hot encoded DataFrame
enriched_one_hot_df.head()



Unnamed: 0,14.73x,"18-25, Australia, Bachelor's, High, Not Certified","18-25, Australia, Bachelor's, Low, Not Certified","18-25, Australia, Bachelor's, Medium, Certified","18-25, Australia, Bachelor's, Medium, Not Certified","18-25, Australia, Bachelor's, Very High, Certified","18-25, Australia, Bachelor's, Very High, Not Certified","18-25, Australia, Master's, Low, Not Certified","18-25, Australia, Master's, Medium, Not Certified","18-25, Australia, Secondary, High, Not Certified",...,PH207x,PH278x,"nan, India, Bachelor's, Low, Not Certified","nan, India, Secondary, Low, Not Certified","nan, India, Secondary, Medium, Not Certified","nan, Other Europe, Bachelor's, Low, Not Certified","nan, United States, Bachelor's, Low, Not Certified","nan, United States, Bachelor's, Medium, Not Certified","nan, United States, Secondary, High, Not Certified","nan, United States, Secondary, Low, Not Certified"
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [37]:
# Step 4: Apply Apriori for association rule mining
from mlxtend.frequent_patterns import apriori, association_rules

# Apply Apriori algorithm
frequent_itemsets = apriori(enriched_one_hot_df, min_support=0.005, use_colnames=True)
# frequent_itemsets.to_csv('frequent_itemsets.csv')
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.136615,(14.73x)
1,0.006327,"(18-25, Brazil, Secondary, Low, Not Certified)"
2,0.007246,"(18-25, Egypt, Bachelor's, Low, Not Certified)"
3,0.047012,"(18-25, India, Bachelor's, Low, Not Certified)"
4,0.007144,"(18-25, India, Bachelor's, Medium, Not Certified)"
...,...,...
244,0.016345,"(ER22x, PH278x, CB22x)"
245,0.006701,"(ER22x, CS50x, PH278x)"
246,0.005103,"(3.091x, PH207x, 6.00x, 6.002x)"
247,0.005137,"(7.00x, 8.02x, 6.00x, 6.002x)"


In [38]:
# frequent_itemsets = pd.read_csv('frequent_itemsets.csv')
# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1, num_itemsets=len(frequent_itemsets))
rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(14.73x),"(18-25, United States, Bachelor's, Low, Not Ce...",0.136615,0.045431,0.006327,0.046315,1.019462,1.0,0.000121,1.000927,0.022111,0.036008,0.000926,0.092794
1,"(18-25, United States, Bachelor's, Low, Not Ce...",(14.73x),0.045431,0.136615,0.006327,0.139274,1.019462,1.0,0.000121,1.003089,0.019999,0.036008,0.003079,0.092794
2,"(26-35, United States, Bachelor's, Low, Not Ce...",(14.73x),0.048373,0.136615,0.008334,0.172293,1.261155,1.0,0.001726,1.043104,0.217602,0.047179,0.041323,0.116649
3,(14.73x),"(26-35, United States, Bachelor's, Low, Not Ce...",0.136615,0.048373,0.008334,0.061006,1.261155,1.0,0.001726,1.013454,0.239842,0.047179,0.013275,0.116649
4,(14.73x),"(26-35, United States, Master's, Low, Not Cert...",0.136615,0.022877,0.005290,0.038720,1.692544,1.0,0.002164,1.016481,0.473918,0.034304,0.016214,0.134973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,"(6.00x, 6.002x)","(CS50x, 8.02x)",0.158046,0.035106,0.008675,0.054886,1.563424,1.0,0.003126,1.020928,0.428026,0.047022,0.020499,0.150989
438,(CS50x),"(6.002x, 6.00x, 8.02x)",0.247240,0.027877,0.008675,0.035085,1.258554,1.0,0.001782,1.007470,0.272912,0.032557,0.007415,0.173125
439,(8.02x),"(CS50x, 6.00x, 6.002x)",0.189972,0.022282,0.008675,0.045662,2.049322,1.0,0.004442,1.024499,0.632118,0.042610,0.023913,0.217488
440,(6.00x),"(6.002x, CS50x, 8.02x)",0.438028,0.014645,0.008675,0.019804,1.352274,1.0,0.002260,1.005263,0.463556,0.019537,0.005236,0.306069


In [39]:

# Filter rules to include only those with user profile attributes in the antecedents
rules_with_profile = rules[rules['antecedents'].apply(
    lambda x: any(item for item in x if "age_group" in item or "Certified" in item)
)]

# Sort rules by confidence
rules_with_profile_sorted = rules_with_profile.sort_values(by='confidence', ascending=False)

# Display the top rules
rules_with_profile_sorted[['antecedents', 'consequents', 'confidence', 'lift', 'support']].head()


Unnamed: 0,antecedents,consequents,confidence,lift,support
209,"(CS50x, 18-25, India, Secondary, Low, Not Cert...",(6.00x),0.677898,1.547611,0.008555
184,"(18-25, India, Bachelor's, Low, Not Certified,...",(6.002x),0.676316,2.312184,0.008743
190,"(18-25, India, Bachelor's, Low, Not Certified,...",(6.00x),0.672439,1.535149,0.007926
202,"(18-25, India, Secondary, Low, Not Certified, ...",(6.002x),0.654994,2.239288,0.008589
218,"(CS50x, 18-25, United States, Bachelor's, Low,...",(6.00x),0.623501,1.423427,0.008845


In [40]:
rules_with_profile_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 64 entries, 209 to 39
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         64 non-null     object 
 1   consequents         64 non-null     object 
 2   antecedent support  64 non-null     float64
 3   consequent support  64 non-null     float64
 4   support             64 non-null     float64
 5   confidence          64 non-null     float64
 6   lift                64 non-null     float64
 7   representativity    64 non-null     float64
 8   leverage            64 non-null     float64
 9   conviction          64 non-null     float64
 10  zhangs_metric       64 non-null     float64
 11  jaccard             64 non-null     float64
 12  certainty           64 non-null     float64
 13  kulczynski          64 non-null     float64
dtypes: float64(12), object(2)
memory usage: 7.5+ KB


In [41]:
rules_with_profile_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
209,"(CS50x, 18-25, India, Secondary, Low, Not Cert...",(6.00x),0.012621,0.438028,0.008555,0.677898,1.547611,1.0,0.003027,1.744699,0.358366,0.019352,0.426835,0.348715
184,"(18-25, India, Bachelor's, Low, Not Certified,...",(6.002x),0.012927,0.292501,0.008743,0.676316,2.312184,1.0,0.004961,2.185770,0.574941,0.029467,0.542495,0.353102
190,"(18-25, India, Bachelor's, Low, Not Certified,...",(6.00x),0.011787,0.438028,0.007926,0.672439,1.535149,1.0,0.002763,1.715623,0.352755,0.017937,0.417121,0.345267
202,"(18-25, India, Secondary, Low, Not Certified, ...",(6.002x),0.013114,0.292501,0.008589,0.654994,2.239288,1.0,0.004754,2.050684,0.560783,0.028918,0.512358,0.342180
218,"(CS50x, 18-25, United States, Bachelor's, Low,...",(6.00x),0.014185,0.438028,0.008845,0.623501,1.423427,1.0,0.002631,1.492625,0.301750,0.019949,0.330040,0.321847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,"(26-35, United States, Bachelor's, Low, Not Ce...",(7.00x),0.048373,0.126223,0.007416,0.153305,1.214563,1.0,0.001310,1.031986,0.185639,0.044359,0.030995,0.106029
54,"(26-35, United States, Bachelor's, Low, Not Ce...",(3.091x),0.048373,0.111680,0.007144,0.147679,1.322344,1.0,0.001741,1.042237,0.256158,0.046719,0.040525,0.105823
238,"(26-35, United States, Bachelor's, Low, Not Ce...","(ER22x, CB22x)",0.048373,0.082051,0.006889,0.142405,1.735576,1.0,0.002920,1.070376,0.445366,0.055762,0.065749,0.113180
1,"(18-25, United States, Bachelor's, Low, Not Ce...",(14.73x),0.045431,0.136615,0.006327,0.139274,1.019462,1.0,0.000121,1.003089,0.019999,0.036008,0.003079,0.092794


In [42]:
rules_with_profile_sorted.to_csv('rules_with_profile_sorted.csv')

In [43]:
# Step 5: Define a function to recommend courses based on user profile and enrolled courses
def recommend_courses(user_age_group, user_nationality, user_education, user_certification, user_activity, enrolled_courses, rules):
    # Create the user's profile
    user_profile = f"{user_age_group}, {user_nationality}, {user_education}, {user_activity}, {user_certification}"

    # Combine user profile with enrolled courses to form antecedents
    user_antecedents = set([user_profile] + enrolled_courses)

    # Filter rules where antecedents match the user's profile and courses
    matching_rules = rules[rules['antecedents'].apply(
        lambda x: user_antecedents.issuperset(x)
    )]

    # Sort matching rules by confidence
    matching_rules_sorted = matching_rules.sort_values(by='confidence', ascending=False)

    # Extract recommended courses from consequents
    recommendations = set()
    for consequents in matching_rules_sorted['consequents']:
        recommendations.update(consequents)

    # Remove already enrolled courses from recommendations
    recommendations -= set(enrolled_courses)

    return list(recommendations)


In [47]:
# Example Profiles
examples = [
    {
        "user_age_group": "18-25",
        "user_nationality": "India",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "India",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "36-50",
        "user_nationality": "United States",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "Other Africa",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "18-25",
        "user_nationality": "United States",
        "user_education": "Secondary",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "United States",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "36-50",
        "user_nationality": "India",
        "user_education": "Master's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "Unknown/Other",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CB22x"]
    },
    {
        "user_age_group": "26-35",
        "user_nationality": "India",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["PH207x"]
    },
    {
        "user_age_group": "18-25",
        "user_nationality": "United States",
        "user_education": "Bachelor's",
        "user_certification": "Not Certified",
        "user_activity": "Low",
        "enrolled_courses": ["CS50x"]
    }
]

# Loop through the examples and call the recommendation function
for i, example in enumerate(examples, start=1):
    recommended_courses = recommend_courses(
        example["user_age_group"],
        example["user_nationality"],
        example["user_education"],
        example["user_certification"],
        example["user_activity"],
        example["enrolled_courses"],
        rules_with_profile_sorted
    )
    print(f"Example {i} - {example["user_nationality"]} - Recommended Courses: {recommended_courses}")


Example 1 - India - Recommended Courses: ['6.002x', '6.00x', '8.02x']
Example 2 - India - Recommended Courses: []
Example 3 - United States - Recommended Courses: []
Example 4 - Other Africa - Recommended Courses: []
Example 5 - United States - Recommended Courses: ['7.00x', 'ER22x', 'CB22x', '3.091x']
Example 6 - United States - Recommended Courses: ['14.73x', 'PH278x', 'ER22x', 'CB22x', '7.00x', '3.091x']
Example 7 - India - Recommended Courses: []
Example 8 - Unknown/Other - Recommended Courses: []
Example 9 - India - Recommended Courses: []
Example 10 - United States - Recommended Courses: ['14.73x', '3.091x', '7.00x', '6.00x']
