# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

# Load the data

In [None]:
# Read the four raw CSV exports, in this order: assignment relationships,
# unit-test scores, action logs, problem details.
df_assignment_rel, df_training, df_actionlogs, df_problem = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/assignment_relationships.csv',
        './data/training_unit_test_scores.csv',
        './data/action_logs.csv',
        './data/problem_details.csv',
    )
)

# Predict Score for in-unit problems

In [None]:
# Left-join the assignment relationships onto the unit-test scores
# (assignment_log_id -> unit_test_assignment_log_id), then keep only the
# columns the later cells read.
df_training = pd.merge(
    df_training,
    df_assignment_rel,
    how='left',
    left_on='assignment_log_id',
    right_on='unit_test_assignment_log_id',
)[['unit_test_assignment_log_id', 'problem_id', 'score', 'in_unit_assignment_log_id']]
# Same narrowing for the action logs.
df_actionlogs = df_actionlogs[
    ['assignment_log_id', 'problem_id', 'available_core_tutoring', 'action']
]

In [None]:
# Dummy-encode the `action` and `available_core_tutoring` columns ...
one_hot = pd.get_dummies(df_actionlogs[['action', 'available_core_tutoring']])
# ... and append the indicator columns to the right of the original ones.
df_actionlogs = pd.concat((df_actionlogs, one_hot), axis='columns')

In [None]:
# Rename the action-log problem id so it stays distinguishable from the
# unit-test `problem_id` column after the merge below.
df_actionlogs = df_actionlogs.rename(columns={'problem_id': 'problem_id_action'})
# Left-join the in-unit action rows onto the unit-test rows. This runs before
# the dropna below, so action rows without a problem id are still included here.
df_assosiation = df_training.merge(
    df_actionlogs,
    left_on='in_unit_assignment_log_id',
    right_on='assignment_log_id',
    how='left',
)
df_actionlogs = df_actionlogs.dropna(subset=['problem_id_action'])
# .copy() gives an independent frame, so adding `predictedscore` below neither
# triggers pandas' SettingWithCopyWarning nor depends on view/copy semantics.
df_actionlogs_sub = df_actionlogs[['assignment_log_id', 'problem_id_action', 'action']].copy()

# Actions that give an (assignment, problem) pair a predicted score of 0,
# unless an open response was also logged for that pair.
_ZERO_SCORE_ACTIONS = frozenset([
    'wrong_response',
    'hint_requested',
    'explanation_requested',
    'live_tutor_requested',
    'skill_related_video_requested',
    'answer_requested',
])


def _predict_score(actions):
    """Return 1, 0 or None for the actions logged on one (assignment, problem) pair.

    1 when an 'open_response' action is present, 0 when any action from
    _ZERO_SCORE_ACTIONS is present, None otherwise.
    """
    seen = set(actions)
    if 'open_response' in seen:
        return 1
    if seen & _ZERO_SCORE_ACTIONS:
        return 0
    return None


# Predicted score per (assignment, problem) pair, following the scoring
# definition used for end-of-unit problems; transform broadcasts the group's
# value back onto every row of that group.
df_actionlogs_sub['predictedscore'] = (
    df_actionlogs_sub
    .groupby(['assignment_log_id', 'problem_id_action'])['action']
    .transform(_predict_score)
)
# A missing value means neither an open response nor any wrong/help action was
# logged for the pair; such pairs are scored as correct (1).
df_actionlogs_sub['predictedscore'] = df_actionlogs_sub['predictedscore'].fillna(1)
# Collapse to one row per (assignment, problem, predicted score).
df_actionlogs_sub_temp = df_actionlogs_sub.drop(columns=['action']).drop_duplicates()


In [None]:
# Keep only the identifying columns, discard rows that matched no in-unit
# problem, and rename the in-unit assignment id so it can serve as a merge key.
df_assosiation = (
    df_assosiation[
        ['unit_test_assignment_log_id', 'problem_id', 'score',
         'in_unit_assignment_log_id', 'problem_id_action']
    ]
    .dropna(subset=['problem_id_action'])
    .rename(columns={'in_unit_assignment_log_id': 'assignment_log_id'})
)
# Attach the predicted in-unit score and drop exact duplicate rows.
df_assosiation_rule = pd.merge(
    df_assosiation,
    df_actionlogs_sub_temp,
    how='left',
    on=['assignment_log_id', 'problem_id_action'],
).drop_duplicates()

In [None]:
# #save predicted scores
# df_assosiation_rule.to_pickle('/home/aswani/workspace/edm-cup/files/pickle_files/assosiationRule.pkl')
# df_assosiation_rule=pd.read_pickle('/home/aswani/workspace/edm-cup/files/pickle_files/assosiationRule.pkl')

# Build transactions for association rules

In [None]:
# One group per end-of-unit problem attempt.
groups = df_assosiation_rule.groupby(['unit_test_assignment_log_id', 'problem_id'])

# For every group whose score (taken from its first row) is exactly 0 or 1,
# keep only the in-unit rows whose predicted score equals that score.
# Groups with any other score are skipped entirely.
filtered_records = [
    group_df[group_df['predictedscore'] == group_df['score'].iloc[0]]
    for _, group_df in groups
    if group_df['score'].iloc[0] in (0, 1)
]

# Stack the surviving rows of all groups into one frame and show it.
filtered_df = pd.concat(filtered_records)
print(filtered_df)


### Get skill code for problems

In [None]:
# Look up the skill code of the end-of-unit problem (problem_id) and of the
# in-unit problem (problem_id_action) in the problem details table.
skill_lookup = df_problem[['problem_id', 'problem_skill_code']]
filtered_df = pd.merge(filtered_df, skill_lookup, how='left', on='problem_id')
filtered_df = pd.merge(
    filtered_df,
    skill_lookup,
    how='left',
    left_on='problem_id_action',
    right_on='problem_id',
)
# The second merge suffixes the clashing columns with _x (left) and _y (right).
# The lookup's own id copy is dropped; the end-of-unit id stays as problem_id_x.
filtered_df = filtered_df.drop(columns=['problem_id_y']).rename(columns={
    'problem_skill_code_x': 'problem_skill_code_train_full',
    'problem_skill_code_y': 'problem_skill_code_action_full',
})
# Shorten each full code to its first two dot-separated components.
for side in ('train', 'action'):
    full_code = filtered_df[f'problem_skill_code_{side}_full']
    filtered_df[f'problem_skill_code_{side}'] = (
        full_code.str.split('.', n=3).str[0:2].str.join('.')
    )

### Get the grade details

In [None]:
# The grade is the first dot-separated component of the shortened skill code.
for side in ('train', 'action'):
    filtered_df[f'grade_{side}'] = (
        filtered_df[f'problem_skill_code_{side}'].str.split('.', n=1).str.get(0)
    )
# Later cells split the data by grade_train, so keep only rows where it is present.
filtered_df = filtered_df[filtered_df['grade_train'].notna()]


### Barplot for grade vs average score 

In [None]:
# Mean end-of-unit score per grade, rounded to two decimals for plotting.
filtered_df_fig = filtered_df.groupby('grade_train', as_index=False)['score'].mean()
filtered_df_fig['score'] = filtered_df_fig['score'].round(2)

In [None]:

# Bar chart of the average score per grade, saved as a PDF and shown inline.
plt.figure(figsize=(10, 6), dpi=300)
plt.bar(filtered_df_fig['grade_train'], filtered_df_fig['score'], width=0.5)
# Start the y axis at 0.6; the upper limit is left to matplotlib.
plt.ylim(bottom=0.6)
for set_label, label_text in ((plt.xlabel, 'Grade'), (plt.ylabel, 'Average Score')):
    set_label(label_text, fontsize=18, fontweight='bold')
for style_ticks in (plt.xticks, plt.yticks):
    style_ticks(fontsize=14, fontweight='bold')
plt.subplots_adjust(wspace=0.1)
plt.savefig('Grade_Subject.pdf', bbox_inches='tight')
plt.show()

### Separate data for score 0 & 1 and Get Transactions

In [None]:
# Split the filtered rows by end-of-unit score: correct (1) and incorrect (0).
filtered_df_1 = filtered_df.loc[filtered_df['score'].eq(1)]
filtered_df_0 = filtered_df.loc[filtered_df['score'].eq(0)]

In [None]:

# Group each score subset by grade_train ...
grouped1, grouped0 = (
    subset.groupby('grade_train') for subset in (filtered_df_1, filtered_df_0)
)
# ... and keep one DataFrame per grade, in the order groupby yields the grades.
df_grades_1 = [pair[1] for pair in grouped1]
df_grades_0 = [pair[1] for pair in grouped0]

In [None]:
def group_and_extract_skill(filtered_df):
    """Build one transaction per (unit-test assignment, end-of-unit skill) pair.

    Input: DataFrame with the columns 'unit_test_assignment_log_id',
    'problem_skill_code_train' and 'problem_skill_code_action'.

    Output: DataFrame with one row per group and the columns
    'unit_test_assignment_log_id', 'problem_skill_code_train' and
    'skill_code_list'. The list holds the distinct
    'problem_skill_code_action' values of the group (first-seen order)
    followed by the group's 'problem_skill_code_train'.
    """
    key_columns = ['unit_test_assignment_log_id', 'problem_skill_code_train']

    transactions = []
    for (log_id, train_skill), rows in filtered_df.groupby(key_columns):
        # Distinct in-unit skills seen for this group; missing values are kept
        # here and are expected to be filtered out by the caller.
        practiced_skills = rows['problem_skill_code_action'].unique().tolist()
        transactions.append({
            'unit_test_assignment_log_id': log_id,
            'problem_skill_code_train': train_skill,
            # The end-of-unit skill always goes last, even if already listed.
            'skill_code_list': practiced_skills + [train_skill],
        })

    return pd.DataFrame(transactions)

In [None]:
# Turn every per-grade frame into its transaction table (score 1).
result_dfs_skill_1 = {
    f"df_grades_1_{position}": group_and_extract_skill(grade_df)
    for position, grade_df in enumerate(df_grades_1, start=1)
}

# Same for the score-0 frames.
result_df_skill_0 = {
    f"df_grades_0_{position}": group_and_extract_skill(grade_df)
    for position, grade_df in enumerate(df_grades_0, start=1)
}

# Keys are positional labels (df_grades_1_1, df_grades_1_2, ...) following the
# order of the per-grade lists, not the grade names themselves. The two dicts
# are numbered independently, so equal positions need not mean the same grade.

In [None]:
# Number of transactions per grade table, score 0.
row_counts = {key: len(frame) for key, frame in result_df_skill_0.items()}

print(row_counts)

In [None]:
# Number of transactions per grade table, score 1.
row_counts = {key: len(frame) for key, frame in result_dfs_skill_1.items()}

print(row_counts)

# Frequent itemsets and association rules

In [None]:
def remove_nan(lst):
    """Return a copy of the transactions with every non-string item removed.

    Input: an iterable of transactions, each an iterable of items.
    Output: a list of lists, in the same order, keeping only the items that
    are `str` instances. This drops NaN/None as well as any other non-string
    value.
    """
    cleaned_lst = []
    for sublist in lst:
        cleaned_lst.append([item for item in sublist if isinstance(item, str)])
    return cleaned_lst

# Distinct end-of-unit skill codes present in the filtered data.
# No later cell visible in this notebook reads this list.
problem_skill_code_training = list(set(filtered_df['problem_skill_code_train'].unique()))

In [None]:

def generate_association_rules(df, min_support, min_threshold):
    """Mine association rules from every transaction table in ``df`` and pool them.

    Input:
        df: mapping whose values are DataFrames with a 'skill_code_list'
            column holding one list of items per row (one transaction each).
        min_support: forwarded to ``fpgrowth`` as ``min_support``.
        min_threshold: forwarded to ``association_rules`` as the minimum
            confidence.

    Output:
        DataFrame of the pooled rules that have exactly one consequent, sorted
        by descending confidence. 'antecedents' and 'consequents' are
        comma-separated strings, and 'rule' is "antecedents-->consequents".
        When no table yields any rule, an empty DataFrame with the columns
        antecedents, consequents, support, confidence, consequents_len and
        rule is returned.
    """
    all_rules = []

    for key in df:
        grade_df = df[key]
        # An empty table has no transactions (and may lack the column altogether).
        if grade_df.empty:
            continue
        transactions_cleaned = remove_nan(grade_df['skill_code_list'])
        # Transform transactions into a boolean item-indicator DataFrame.
        te = TransactionEncoder()
        te_ary = te.fit(transactions_cleaned).transform(transactions_cleaned)
        encoded_df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_itemsets = fpgrowth(encoded_df, min_support=min_support, use_colnames=True)
        if frequent_itemsets.empty:
            continue
        rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_threshold)
        # Skip empty results so pd.concat only ever sees frames with rows.
        if not rules.empty:
            all_rules.append(rules)

    # pd.concat raises ValueError on an empty list, so return an empty result
    # with the columns callers select from.
    if not all_rules:
        return pd.DataFrame(
            columns=['antecedents', 'consequents', 'support', 'confidence',
                     'consequents_len', 'rule']
        )

    concatenated_rules = pd.concat(all_rules, ignore_index=True)
    concatenated_rules["consequents_len"] = concatenated_rules["consequents"].apply(len)
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    concatenated_rules = concatenated_rules[concatenated_rules["consequents_len"] == 1].copy()
    for side in ("antecedents", "consequents"):
        # Item sets have no defined order; sorting keeps the text reproducible
        # across runs.
        concatenated_rules[side] = (
            concatenated_rules[side]
            .apply(lambda items: ', '.join(sorted(items)))
            .astype(str)
        )
    concatenated_rules = concatenated_rules.sort_values('confidence', ascending=False)
    concatenated_rules['rule'] = (
        concatenated_rules['antecedents'] + '-->' + concatenated_rules['consequents']
    )

    return concatenated_rules


## Top rules for score 0 with min support=0.05 and confidence=0.5

In [None]:
# Pooled rules over all score-0 grade tables.
df_score_0 = generate_association_rules(result_df_skill_0, min_support=0.05, min_threshold=0.5)

In [None]:
# Display only the rule text with its support and confidence.
df_score_0.loc[:, ['rule', 'support', 'confidence']]

## Top rules for score 1 with min support=0.1 and confidence=0.5

In [None]:
# Pooled rules over all score-1 grade tables.
df_score_1 = generate_association_rules(result_dfs_skill_1, min_support=0.1, min_threshold=0.5)

In [None]:
# Display only the rule text with its support and confidence.
df_score_1.loc[:, ['rule', 'support', 'confidence']]

## Rules for all grades

In [None]:
def generate_association_rules_grade(df, min_support, min_threshold):
    """Mine association rules from a single transaction table.

    Input:
        df: DataFrame with a 'skill_code_list' column holding one list of
            items per row (one transaction each).
        min_support: forwarded to ``fpgrowth`` as ``min_support``.
        min_threshold: forwarded to ``association_rules`` as the minimum
            confidence.

    Output:
        DataFrame with the columns 'rule', 'support' and 'confidence',
        restricted to rules with exactly one consequent and sorted by
        descending confidence. It is empty (with the same three columns) when
        the input has no rows or no frequent itemset is found.
    """
    output_columns = ['rule', 'support', 'confidence']

    # An empty table has no transactions (and may lack the column altogether).
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    transactions_cleaned = remove_nan(df['skill_code_list'])

    # Transform transactions into a boolean item-indicator DataFrame.
    te = TransactionEncoder()
    te_ary = te.fit(transactions_cleaned).transform(transactions_cleaned)
    encoded_df = pd.DataFrame(te_ary, columns=te.columns_)

    frequent_itemsets = fpgrowth(encoded_df, min_support=min_support, use_colnames=True)
    if frequent_itemsets.empty:
        return pd.DataFrame(columns=output_columns)

    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_threshold)

    rules["consequents_len"] = rules["consequents"].apply(len)
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    rules = rules[rules["consequents_len"] == 1].copy()
    for side in ("antecedents", "consequents"):
        # Item sets have no defined order; sorting keeps the text reproducible
        # across runs.
        rules[side] = rules[side].apply(lambda items: ', '.join(sorted(items))).astype(str)
    rules = rules.sort_values('confidence', ascending=False)
    rules['rule'] = rules['antecedents'] + '-->' + rules['consequents']

    return rules[output_columns]



### Rules for score 0 -grade wise

In [None]:
# Print the rules of every score-0 grade table that produced at least one rule.
# NOTE(review): the original author states that rules may mention skills from
# other grades but every consequent is an end-of-unit skill; nothing in this
# cell enforces that -- confirm before relying on it.
for key, grade_transactions in result_df_skill_0.items():
    rules_df = generate_association_rules_grade(grade_transactions, 0.05, 0.5)
    if rules_df.empty:
        continue
    print(f"Association rules for DataFrame '{key}':")
    print(rules_df)
    print("--------------------------------------")


#### Top rule from each grade based on support

In [None]:
# Table with the single highest-support rule of each score-0 grade table.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration.
top_rules = []
for key in result_df_skill_0:
    rules_df = generate_association_rules_grade(result_df_skill_0[key], 0.1, 0.5)
    if rules_df.empty:
        continue
    max_support_row = rules_df.nlargest(1, 'support').iloc[0]
    top_rules.append({
        'Rule': max_support_row['rule'],
        'Support': max_support_row['support'],
        'Confidence': max_support_row['confidence'],
    })

# Passing `columns` keeps the three headers even when no grade produced a rule.
rule_table = pd.DataFrame(top_rules, columns=['Rule', 'Support', 'Confidence'])

print(rule_table)

### Rules for score 1 -grade wise

In [None]:
# Print the rules of every score-1 grade table that produced at least one rule.
for key, grade_transactions in result_dfs_skill_1.items():
    rules_df = generate_association_rules_grade(grade_transactions, 0.1, 0.5)
    if rules_df.empty:
        continue
    print(f"Association rules for DataFrame '{key}':")
    print(rules_df)
    print("--------------------------------------")


#### Top rule from each grade based on support

In [None]:
# Table with the single highest-support rule of each score-1 grade table.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the whole frame on every iteration.
top_rules = []
for key in result_dfs_skill_1:
    rules_df = generate_association_rules_grade(result_dfs_skill_1[key], 0.1, 0.5)
    if rules_df.empty:
        continue
    max_support_row = rules_df.nlargest(1, 'support').iloc[0]
    top_rules.append({
        'Rule': max_support_row['rule'],
        'Support': max_support_row['support'],
        'Confidence': max_support_row['confidence'],
    })

# Passing `columns` keeps the three headers even when no grade produced a rule.
rule_table = pd.DataFrame(top_rules, columns=['Rule', 'Support', 'Confidence'])

print(rule_table)
