In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

: 

In [None]:
# Read the three raw CSV tables (paths are relative to the notebook's working directory).
# The reads happen in the order the paths are listed: demographics, sensing, EMA.
demographics_data, sensing_data, ema_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Dataset/Demographics/demographics.csv",
        "../Dataset/Sensing/sensing.csv",
        "../Dataset/EMA/general_ema.csv",
    )
)

## Data Preprocessing

In [None]:
# Combine the three source tables into a single DataFrame
def prepare_merged_dataset(sensing_data, demographics_data, ema_data):
    """
    Join the sensing, demographics, and EMA tables and label each row with a quarter.

    Parameters:
    - sensing_data: DataFrame with at least 'uid' and 'day' columns
    - demographics_data: DataFrame with at least a 'uid' column
    - ema_data: DataFrame with at least 'uid' and 'day' columns

    Returns:
    - A DataFrame holding only the rows present in all three inputs (inner joins),
      with an extra 'quarter' column computed from 'day'.

    Note: relies on a `get_academic_quarter` callable being available in the
    notebook namespace when this function is called.
    """
    return (
        sensing_data
        # Attach per-user demographic attributes to every sensing row
        .merge(demographics_data, how='inner', on='uid')
        # Keep only the (uid, day) pairs that also have an EMA record
        .merge(ema_data, how='inner', on=['uid', 'day'])
        # Derive the academic quarter from each row's 'day' value
        .assign(quarter=lambda frame: frame['day'].apply(get_academic_quarter))
    )

In [None]:
# Feature catalogue, organised by category and (where relevant) by platform.
#
# Categories: demographics, activity, location, phone, sleep, audio, calls,
#             sms, light, steps, quality
# Platforms:  all, android, ios
#
# A category maps either to a flat list of column names, or to a dict keyed by
# platform whose values are lists of column names.
feature_dict = {
    'demographics': [
        'gender',
        'race',
    ],

    'activity': {
        'all': [
            'act_in_vehicle_ep_0',
            'act_on_bike_ep_0',
            'act_still_ep_0',
        ],
        'android': [
            'act_on_foot_ep_0',
            'act_tilting_ep_0',
        ],
        'ios': [
            'act_running_ep_0',
            'act_walking_ep_0',
        ],
    },

    'location': [
        'loc_dist_ep_0',
        'loc_visit_num_ep_0',
        'loc_max_dis_from_campus_ep_0',
        'loc_food_dur',
        'loc_health_dur',
        'loc_home_dur',
        'loc_leisure_dur',
        'loc_other_dorm_dur',
        'loc_self_dorm_dur',
        'loc_social_dur',
        'loc_study_dur',
        'loc_workout_dur',
        'loc_worship_dur',
    ],

    'phone': {
        'all': [
            'unlock_duration_ep_0',
            'unlock_num_ep_0',
        ],
        'ios': [
            'other_playing_duration_ep_0',
            'other_playing_num_ep_0',
        ],
    },

    'sleep': {
        'all': [
            'sleep_duration',
            'sleep_start',
            'sleep_end',
        ],
        'ios': [
            # NOTE(review): spelled 'heathkit' (not 'healthkit') -- confirm this
            # matches the actual column name in the sensing table.
            'sleep_heathkit_dur',
        ],
    },

    'audio': {
        'android': [
            'audio_amp_mean_ep_0',
            'audio_amp_std_ep_0',
            'audio_convo_duration_ep_0',
            'audio_convo_num_ep_0',
            'audio_voice_ep_0',
        ],
    },

    'calls': {
        'android': [
            'call_in_duration_ep_0',
            'call_in_num_ep_0',
            'call_out_duration_ep_0',
            'call_out_num_ep_0',
            'call_miss_num_ep_0',
        ],
    },

    'sms': {
        'android': [
            'sms_in_num_ep_0',
            'sms_out_num_ep_0',
        ],
    },

    'light': {
        'android': [
            'light_mean_ep_0',
            'light_std_ep_0',
        ],
    },

    'steps': {
        'ios': [
            'step_ep_0',
        ],
    },

    'quality': {
        'all': [
            'quality_activity',
            'quality_loc',
        ],
        'android': [
            'quality_audio',
            'quality_light',
        ],
    },
}

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# The helpers below read the module-level `feature_dict` defined in an earlier cell.
# They expect the merged DataFrame produced by, for example:
# merged_data = prepare_merged_dataset(sensing_data, demographics_data, ema_data)

def select_features(platform, category_list, feature_map=None):
    """
    Select feature column names for a platform from the requested categories.

    For a category stored as a dict, the result contains that category's
    entries for `platform` (if any) followed by its 'all' entries (if any).
    For a category stored as a plain list, the whole list is used.

    Each feature name appears at most once in the result, in first-seen
    order. This matters for platform='all', where the platform-specific
    lookup and the cross-platform lookup resolve to the same list.

    Parameters:
    - platform: platform key to look up ('all', 'android', 'ios')
    - category_list: iterable of category names (keys of the feature map)
    - feature_map: optional mapping of category -> list, or
      category -> {platform: list}; defaults to the module-level `feature_dict`

    Returns:
    - A new list of unique feature names.

    Raises:
    - KeyError: if a requested category is not present in the feature map.
    """
    if feature_map is None:
        feature_map = feature_dict

    # A dict doubles as an insertion-ordered set: keys are the feature names.
    selected = {}
    for category in category_list:
        if category not in feature_map:
            raise KeyError(
                f"Unknown feature category {category!r}; "
                f"available categories: {list(feature_map)}"
            )

        entry = feature_map[category]
        if isinstance(entry, dict):
            # Platform-specific features first, then cross-platform ones.
            names = list(entry.get(platform, [])) + list(entry.get('all', []))
        else:
            names = entry

        selected.update(dict.fromkeys(names))

    return list(selected)

def build_pipeline(
    merged_data,
    platform,
    categories,
    outcome,
    model_type="RandomForest",
    random_state=42,
):
    """
    Train and evaluate a scaled ensemble classifier on the selected features.

    Selects feature columns via `select_features`, splits the data 70/30,
    fits a StandardScaler + ensemble classifier pipeline on the training part,
    prints the accuracy on the held-out part, and returns the fitted pipeline.

    Parameters:
    - merged_data: the merged DataFrame with all features
    - platform: platform-specific features to use ('all', 'android', 'ios')
    - categories: list of categories to use (e.g., ['demographics', 'activity'])
    - outcome: name of the target variable column
    - model_type: ensemble model to use ("RandomForest", "GradientBoosting", "AdaBoost")
    - random_state: seed used for both the train/test split and the model, so
      repeated runs give the same result; pass None for an unseeded run

    Returns:
    - The fitted scikit-learn Pipeline with steps 'scaler' and 'model'

    Raises:
    - ValueError: if model_type is unsupported or no features were selected
    - KeyError: if a selected feature or the outcome column is missing from merged_data
    """
    # Fail fast on a bad model name, before touching the data.
    supported_models = ("RandomForest", "GradientBoosting", "AdaBoost")
    if model_type not in supported_models:
        raise ValueError("Unsupported model_type. Choose 'RandomForest', 'GradientBoosting', or 'AdaBoost'.")

    # Drop repeated names so each column is selected only once.
    features = list(dict.fromkeys(select_features(platform, categories)))
    if not features:
        raise ValueError(
            f"No features selected for platform {platform!r} and categories {list(categories)!r}."
        )

    missing = [col for col in [*features, outcome] if col not in merged_data.columns]
    if missing:
        raise KeyError(f"Columns not found in merged_data: {missing}")

    # NOTE(review): StandardScaler needs numeric input. If columns such as
    # 'gender'/'race' are stored as strings they must be encoded first --
    # confirm the column dtypes before including those categories.
    X = merged_data[features]
    y = merged_data[outcome]

    # Split the data (70% train / 30% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state
    )

    # Choose the model; seeding it makes the reported accuracy repeatable
    model_classes = {
        "RandomForest": RandomForestClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "AdaBoost": AdaBoostClassifier,
    }
    model = model_classes[model_type](random_state=random_state)

    # Build the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict and evaluate on the held-out split
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_type} Model Accuracy: {accuracy:.2f}")
    return pipeline

In [None]:
# Build the modelling table from the three raw tables loaded earlier
merged_data = prepare_merged_dataset(
    sensing_data=sensing_data,
    demographics_data=demographics_data,
    ema_data=ema_data,
)

# Run configuration
categories = ['demographics', 'activity', 'location']  # feature_dict categories to include
platform = 'android'  # platform key used for categories with per-platform feature lists
outcome = 'target_variable'  # placeholder: set to the actual outcome column before running

# Train and evaluate the pipeline
pipeline = build_pipeline(
    merged_data=merged_data,
    platform=platform,
    categories=categories,
    outcome=outcome,
    model_type="RandomForest",
)