In [6]:
%matplotlib inline

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Function to load a CSV file
def load_csv(file_path):
    """
    Read a CSV file from disk and hand it back as a DataFrame.

    Failures are not raised: any exception during the read is printed and
    None is returned instead, so callers have to check for None.

    Args:
        file_path (str): Location of the CSV file to read.
    Returns:
        pd.DataFrame | None: The parsed table, or None when reading failed.
    """
    loaded = None
    try:
        frame = pd.read_csv(file_path)
        print(f"Successfully loaded: {file_path}")
        # Only publish the frame once the whole happy path has completed.
        loaded = frame
    except Exception as err:
        print(f"Error loading file {file_path}: {err}")
    return loaded

# Function to clean data
def clean_data(df, file_name):
    """
    Drop incomplete rows and, for the activity log, System/Folder rows.

    The rules are selected by ``file_name``:
      * "USER_LOG": rows missing "Date" or "Time" are dropped.
      * "ACTIVITY_LOG": rows missing "Component" or "Action" are dropped,
        then rows whose Component is "System" or "Folder" are removed.
      * "COMPONENT_CODES": rows missing "Component" or "Code" are dropped.
      * any other name: the frame is returned as-is.

    Errors are printed rather than raised.

    Args:
        df (pd.DataFrame): Input DataFrame (None is tolerated).
        file_name (str): Name of the dataset, used to pick the rules above.
    Returns:
        pd.DataFrame | None: The cleaned frame, or None when ``df`` is None
        or an exception occurred while cleaning.
    """
    if df is None:
        print(f"No data to clean in {file_name}.")
        return None

    try:
        # Step 1: decide which rules apply to this dataset.
        required_columns = None
        excluded_components = None
        if file_name == "USER_LOG":
            required_columns = ["Date", "Time"]
        elif file_name == "ACTIVITY_LOG":
            required_columns = ["Component", "Action"]
            excluded_components = ["System", "Folder"]
        elif file_name == "COMPONENT_CODES":
            required_columns = ["Component", "Code"]

        # Step 2: apply them.
        cleaned = df
        if required_columns is not None:
            cleaned = cleaned.dropna(subset=required_columns)
        if excluded_components is not None:
            keep_mask = ~cleaned["Component"].isin(excluded_components)
            cleaned = cleaned[keep_mask]

        print(f"Data cleaning successful for {file_name}.")
        return cleaned
    except Exception as err:
        print(f"Error during cleaning for {file_name}: {err}")
        return None

# Function to rename columns
def rename_columns(df, file_name):
    """
    Rename the anonymised user-name column to "User_ID".

    If the column "User Full Name *Anonymized" is present it is renamed to
    "User_ID"; otherwise the frame is returned unchanged. The input frame is
    never modified: a renamed copy is returned, so callers must use the
    return value.

    Args:
        df (pd.DataFrame | None): Input DataFrame, or None when an earlier
            load/clean step failed.
        file_name (str): Name of the file being processed (used in messages).
    Returns:
        pd.DataFrame | None: DataFrame with the column renamed, or None when
        ``df`` is None.
    """
    # load_csv and clean_data signal failure by returning None; pass that
    # along instead of crashing on ``None.columns``.
    if df is None:
        print(f"No data to rename in {file_name}.")
        return None

    if "User Full Name *Anonymized" in df.columns:
        # Non-inplace rename: the caller's frame keeps its original columns.
        df = df.rename(columns={"User Full Name *Anonymized": "User_ID"})
        print(f"Columns renamed for {file_name}.")
    return df

# Function to merge data
def merge_data(activity_log, user_log, validate=None):
    """
    Inner-join the activity log and the user log on "User_ID".

    The join key is the user only, so every activity row of a user is paired
    with every user-log row of the same user. When both frames hold several
    rows for a user (a many-to-many join) the result has, per user,
    (activity rows x user-log rows) rows. That can be far larger than either
    input and inflates any counts computed from the merged frame. A warning
    with the projected row count is emitted in that case before the merge
    runs; the merge itself is still performed.

    Args:
        activity_log (pd.DataFrame): Cleaned ACTIVITY_LOG DataFrame.
        user_log (pd.DataFrame): Cleaned USER_LOG DataFrame.
        validate (str, optional): Forwarded to ``pd.merge``. For example
            "many_to_one" makes pandas raise ``MergeError`` when "User_ID"
            is not unique in ``user_log``. Defaults to None (no check).
    Returns:
        pd.DataFrame: Merged DataFrame.
    Raises:
        KeyError: If either frame lacks a "User_ID" column.
        pandas.errors.MergeError: If ``validate`` is given and violated.
    """
    import warnings

    # Rows per user on each side, restricted to users present in both frames
    # (only those contribute rows to an inner join).
    activity_rows, log_rows = activity_log["User_ID"].value_counts().align(
        user_log["User_ID"].value_counts(), join="inner"
    )
    if ((activity_rows > 1) & (log_rows > 1)).any():
        projected_rows = int((activity_rows * log_rows).sum())
        warnings.warn(
            "merge_data: many-to-many join on 'User_ID' will produce about "
            f"{projected_rows:,} rows from {len(activity_log):,} activity rows "
            f"and {len(user_log):,} user-log rows; counts derived from the "
            "merged frame will be inflated.",
            stacklevel=2,
        )

    merged_df = pd.merge(
        activity_log, user_log, on="User_ID", how="inner", validate=validate
    )
    print("Data merged successfully.")
    return merged_df

# Function to reshape data
def reshape_data(df):
    """
    Pivot the merged log into a user x component table of action counts.

    Side effect: a "Month" column (monthly period parsed from "Date") is
    written onto ``df`` itself. The pivot below does not use that column.

    Args:
        df (pd.DataFrame): Merged DataFrame with "Date", "User_ID",
            "Component" and "Action" columns.
    Returns:
        pd.DataFrame: One row per User_ID, one column per Component, each
        cell holding the count of "Action" values (0 where there are none).
    """
    parsed_dates = pd.to_datetime(df["Date"])
    df["Month"] = parsed_dates.dt.to_period("M")

    pivot_spec = {
        "index": "User_ID",
        "columns": "Component",
        "values": "Action",
        "aggfunc": "count",
        "fill_value": 0,
    }
    pivot_df = df.pivot_table(**pivot_spec)

    print("Data reshaped successfully.")
    return pivot_df

# Function to calculate statistics
def calculate_statistics(df, components):
    """
    Compute mean, mode and median for selected columns.

    Components that are not columns of ``df`` are left out of the result and
    listed in a printed message. When a column has no mode (it is empty or
    holds only missing values) the mode is reported as NaN instead of
    raising IndexError.

    Args:
        df (pd.DataFrame): Table with one numeric column per component, e.g.
            the pivot returned by ``reshape_data``.
        components (list): Column names to summarise.
    Returns:
        dict: ``{component: {"mean": ..., "mode": ..., "median": ...}}`` for
        every requested component that exists in ``df``.
    """
    stats = {}
    skipped = []
    for component in components:
        if component not in df.columns:
            skipped.append(component)
            continue

        column = df[component]
        # mode() can return several values (ties) or none at all (empty or
        # all-NaN column); take the first one when there is one.
        modes = column.mode()
        stats[component] = {
            "mean": column.mean(),
            "mode": modes.iloc[0] if not modes.empty else float("nan"),
            "median": column.median(),
        }

    if skipped:
        print(f"Components not found and skipped: {skipped}")
    print("Statistics calculated.")
    return stats

# Function to plot correlation
def plot_correlation(df, components):
    """
    Plot a correlation heatmap for the requested component columns.

    Components that are not columns of ``df`` are skipped and listed in a
    printed message instead of raising KeyError, in line with
    ``calculate_statistics``, which also skips missing components. If none
    of the requested components exist, nothing is drawn.

    Args:
        df (pd.DataFrame): Table with one numeric column per component, e.g.
            the pivot returned by ``reshape_data``.
        components (list): Column names to correlate.
    """
    available = [name for name in components if name in df.columns]
    missing = [name for name in components if name not in df.columns]
    if missing:
        print(f"Components not found and skipped: {missing}")
    if not available:
        # An empty correlation matrix cannot be drawn as a heatmap.
        print("No requested components found; correlation plot skipped.")
        return

    correlation_matrix = df[available].corr()

    # Explicit figure/axes so the heatmap and its title target the same axes.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    ax.set_title("Correlation between Components")
    plt.show()

In [7]:
# Example usage: run the load -> clean -> rename steps for each CSV.
# The paths are bare file names, so they are resolved relative to the
# kernel's current working directory.
file_paths = {
    "USER_LOG": "USER_LOG.csv",
    "ACTIVITY_LOG": "ACTIVITY_LOG.csv",
    "COMPONENT_CODES": "COMPONENT_CODES.csv",
}

# Load and clean individual files.
# load_csv and clean_data print the error and return None on failure, so a
# missing or malformed file shows up as a printed message at this point.
user_log_df = load_csv(file_paths["USER_LOG"])
user_log_df = clean_data(user_log_df, "USER_LOG")
user_log_df = rename_columns(user_log_df, "USER_LOG")

activity_log_df = load_csv(file_paths["ACTIVITY_LOG"])
activity_log_df = clean_data(activity_log_df, "ACTIVITY_LOG")
activity_log_df = rename_columns(activity_log_df, "ACTIVITY_LOG")

# The component codes table is loaded and cleaned only; rename_columns is
# not applied to it.
component_codes_df = load_csv(file_paths["COMPONENT_CODES"])
component_codes_df = clean_data(component_codes_df, "COMPONENT_CODES")

Successfully loaded: USER_LOG.csv
Data cleaning successful for USER_LOG.
Columns renamed for USER_LOG.
Successfully loaded: ACTIVITY_LOG.csv
Data cleaning successful for ACTIVITY_LOG.
Columns renamed for ACTIVITY_LOG.
Successfully loaded: COMPONENT_CODES.csv
Data cleaning successful for COMPONENT_CODES.


In [8]:
# Check the column names after cleaning and renaming; the later merge joins
# the two logs on "User_ID", so both must expose that column.
print(user_log_df.columns)
print(activity_log_df.columns)
print(component_codes_df.columns)

Index(['Date', 'Time', 'User_ID'], dtype='object')
Index(['User_ID', 'Component', 'Action', 'Target'], dtype='object')
Index(['Component', 'Code'], dtype='object')


In [12]:
# (rows, columns) of the two cleaned logs before merging.
activity_log_df.shape, user_log_df.shape

((145262, 4), (150835, 3))

In [10]:
# Number of activity-log rows per user.
activity_log_df['User_ID'].value_counts()

User_ID
117    2831
83     2212
100    2052
11     1983
125    1949
       ... 
116     191
3       185
65      119
10       69
77       10
Name: count, Length: 152, dtype: int64

In [13]:
# Number of user-log rows per user.
user_log_df['User_ID'].value_counts()

User_ID
117    3078
83     2319
100    2157
11     2057
125    2023
       ... 
116     196
3       186
65      119
10       73
77       10
Name: count, Length: 152, dtype: int64

In [None]:
# Merge data
# NOTE(review): merge_data inner-joins on "User_ID" only. The value_counts
# outputs above show many rows per user in BOTH logs, so this is a
# many-to-many join: each user contributes (activity rows x user-log rows)
# rows, e.g. 2831 x 3078 for user 117. Confirm that "User_ID" alone is the
# intended key before running this on the full data.
merged_df = merge_data(activity_log_df, user_log_df)

In [None]:
# Reshape data: one row per user, one column per component, cells holding
# the number of actions.
pivot_df = reshape_data(merged_df)

# Calculate statistics
# Names that are not columns of pivot_df are left out of the result, so the
# printout may cover fewer components than are listed here.
components = ["Quiz", "Lecture", "Assignment", "Attendance", "Survey"]
stats = calculate_statistics(pivot_df, components)
print("\nStatistics:")
for component, stat in stats.items():
    print(f"{component}: {stat}")

# Plot correlation
# NOTE(review): this list differs from the statistics list above, and each
# name must match a Component value in the data to appear in the heatmap.
# TODO confirm all six exist in pivot_df.columns.
components_to_analyze = ["Assignment", "Quiz", "Lecture", "Book", "Project", "Course"]
plot_correlation(pivot_df, components_to_analyze)
