In [7]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

# Step 1: Authenticate with Kaggle API
api = KaggleApi()
api.authenticate()

# Step 2: Download the dataset directly and extract
dataset = "kanikanarang94/mooc-dataset"
destination = "mooc_dataset"
os.makedirs(destination, exist_ok=True)
api.dataset_download_files(dataset, path=destination, unzip=True)

# Step 3: Load the dataset into a DataFrame
csv_path = os.path.join(destination, "big_student_clear_third_version.csv")
df = pd.read_csv(csv_path)

# Step 4: Display the first few rows
print(df.head())


Dataset URL: https://www.kaggle.com/datasets/kanikanarang94/mooc-dataset
   Unnamed: 0 institute course_id  year semester       userid_DI  viewed  \
0           4  HarvardX    PH207x  2012     Fall  MHxPC130313697       0   
1           6  HarvardX    PH207x  2012     Fall  MHxPC130237753       1   
2           7  HarvardX     CS50x  2012   Summer  MHxPC130202970       1   
3          20  HarvardX     CS50x  2012   Summer  MHxPC130223941       1   
4          22  HarvardX    PH207x  2012     Fall  MHxPC130317399       0   

   explored  certified               final_cc_cname_DI  ... grade  \
0         0          0                           India  ...   0.0   
1         0          0                   United States  ...   0.0   
2         0          0                   United States  ...   0.0   
3         0          0  Other Middle East/Central Asia  ...   0.0   
4         0          0                       Australia  ...   0.0   

  start_time_DI  last_event_DI nevents ndays_act  nplay

In [17]:
# Load the short version of the dataset to inspect its structure
# file_path = 'mooc_dataset/big_student_clear_third_version.csv'
file_path = 'mooc_dataset/output_truncated.csv'

import pandas as pd

# Load the dataset
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head(), data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         50000 non-null  int64  
 1   institute          50000 non-null  object 
 2   course_id          50000 non-null  object 
 3   year               50000 non-null  int64  
 4   semester           50000 non-null  object 
 5   userid_DI          50000 non-null  object 
 6   viewed             50000 non-null  int64  
 7   explored           50000 non-null  int64  
 8   certified          50000 non-null  int64  
 9   final_cc_cname_DI  50000 non-null  object 
 10  LoE_DI             50000 non-null  object 
 11  gender             48513 non-null  object 
 12  grade              50000 non-null  float64
 13  start_time_DI      50000 non-null  object 
 14  last_event_DI      50000 non-null  object 
 15  nevents            50000 non-null  int64  
 16  ndays_act          500

(   Unnamed: 0 institute course_id  year semester       userid_DI  viewed  \
 0           4  HarvardX    PH207x  2012     Fall  MHxPC130313697       0   
 1           6  HarvardX    PH207x  2012     Fall  MHxPC130237753       1   
 2           7  HarvardX     CS50x  2012   Summer  MHxPC130202970       1   
 3          20  HarvardX     CS50x  2012   Summer  MHxPC130223941       1   
 4          22  HarvardX    PH207x  2012     Fall  MHxPC130317399       0   
 
    explored  certified               final_cc_cname_DI  ... grade  \
 0         0          0                           India  ...   0.0   
 1         0          0                   United States  ...   0.0   
 2         0          0                   United States  ...   0.0   
 3         0          0  Other Middle East/Central Asia  ...   0.0   
 4         0          0                       Australia  ...   0.0   
 
   start_time_DI  last_event_DI nevents ndays_act  nplay_video  nchapters  \
 0    2012-07-24     2013-07-27      

In [18]:
# Drop irrelevant columns
columns_to_drop = ['Unnamed: 0', 'userid_DI', 'start_time_DI', 'last_event_DI']
data_cleaned = data.drop(columns=columns_to_drop)

# Fill missing values in 'gender' with a placeholder
data_cleaned['gender'] = data_cleaned['gender'].fillna('Unknown')

# Discretize numerical variables into categories
data_cleaned['age'] = pd.cut(data_cleaned['age'], bins=[0, 18, 25, 35, 50, 100], labels=['<18', '18-25', '26-35', '36-50', '50+'])
data_cleaned['grade'] = pd.cut(data_cleaned['grade'], bins=[-1, 0.5, 1], labels=['Low', 'High'])

# Convert interaction metrics to boolean values
interaction_cols = ['viewed', 'explored', 'certified', 'nchapters', 'nforum_posts']
data_cleaned[interaction_cols] = data_cleaned[interaction_cols].applymap(lambda x: 1 if x > 0 else 0)

# Display the cleaned dataset
data_cleaned.head()


  data_cleaned[interaction_cols] = data_cleaned[interaction_cols].applymap(lambda x: 1 if x > 0 else 0)


Unnamed: 0,institute,course_id,year,semester,viewed,explored,certified,final_cc_cname_DI,LoE_DI,gender,grade,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,age
0,HarvardX,PH207x,2012,Fall,0,0,0,India,Bachelor's,m,Low,6,3,197757,0,0,0,18-25
1,HarvardX,PH207x,2012,Fall,1,0,0,United States,Secondary,m,Low,107,8,7,1,0,0,18-25
2,HarvardX,CS50x,2012,Summer,1,0,0,United States,Bachelor's,m,Low,8,1,197757,1,0,0,18-25
3,HarvardX,CS50x,2012,Summer,1,0,0,Other Middle East/Central Asia,Secondary,m,Low,25,2,197757,1,0,0,18-25
4,HarvardX,PH207x,2012,Fall,0,0,0,Australia,Master's,f,Low,3,2,197757,0,0,0,26-35


In [19]:
from mlxtend.frequent_patterns import apriori, association_rules

# Convert all attributes to a boolean transactional format (1/0)
# This ensures all columns represent presence or absence of attributes.
transaction_data = data_cleaned.astype(bool)

# Display the transactional dataset
transaction_data.head()


Unnamed: 0,institute,course_id,year,semester,viewed,explored,certified,final_cc_cname_DI,LoE_DI,gender,grade,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,age
0,True,True,True,True,False,False,False,True,True,True,True,True,True,True,False,False,False,True
1,True,True,True,True,True,False,False,True,True,True,True,True,True,True,True,False,False,True
2,True,True,True,True,True,False,False,True,True,True,True,True,True,True,True,False,False,True
3,True,True,True,True,True,False,False,True,True,True,True,True,True,True,True,False,False,True
4,True,True,True,True,False,False,False,True,True,True,True,True,True,True,False,False,False,True


In [20]:
# Ensure no NaN values exist in the transactional data
transaction_data = transaction_data.fillna(False)

# Verify that there are no NaN values
print(transaction_data.isnull().sum().sum())  # Should print 0

0


  transaction_data = transaction_data.fillna(False)


In [21]:
# Run Apriori to find frequent itemsets
frequent_itemsets = apriori(transaction_data, min_support=0.01, use_colnames=True)

# Sort and display the frequent itemsets
frequent_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
print(frequent_itemsets.head())


    support                        itemsets
71      1.0              (gender, semester)
40      1.0  (final_cc_cname_DI, course_id)
8       1.0                        (LoE_DI)
9       1.0                        (gender)
0       1.0                     (institute)


In [22]:
# Display frequent itemsets with highest support
print(frequent_itemsets.sort_values(by='support', ascending=False).head(10))

       support                                           itemsets
51678      1.0  (gender, institute, grade, course_id, nplay_vi...
7670       1.0  (gender, grade, nplay_video, final_cc_cname_DI...
6988       1.0  (gender, nplay_video, final_cc_cname_DI, year,...
71         1.0                                 (gender, semester)
40         1.0                     (final_cc_cname_DI, course_id)
8          1.0                                           (LoE_DI)
9          1.0                                           (gender)
0          1.0                                        (institute)
1          1.0                                        (course_id)
2          1.0                                             (year)


In [26]:
from mlxtend.frequent_patterns import apriori, association_rules

# Generate frequent itemsets
frequent_itemsets = apriori(transaction_data, min_support=0.01, use_colnames=True)

# Display to ensure correctness
print(frequent_itemsets.head())

KeyboardInterrupt: 

In [None]:
# Generate association rules with minimum confidence threshold
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6,num_itemsets=len(frequent_itemsets) )

# Sort rules by lift and display the top rules
rules = rules.sort_values(by='lift', ascending=False)

# Display the first few rules
print(rules.head())
