In [33]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Relative paths to the four raw CSV tables used by this notebook.
DEMOGRAPHICS_CSV = "../../Dataset/Demographics/demographics.csv"
SENSING_CSV = "../../Dataset/Sensing/sensing.csv"
GENERAL_EMA_CSV = "../../Dataset/EMA/general_ema.csv"
COVID_EMA_CSV = "../../Dataset/EMA/covid_ema.csv"

# Load every table in one pass; the unpacking order mirrors the path order.
# Note that `df` is the demographics table.
df, sensing_df, general_ema, covid_ema = (
    pd.read_csv(csv_path)
    for csv_path in (DEMOGRAPHICS_CSV, SENSING_CSV, GENERAL_EMA_CSV, COVID_EMA_CSV)
)

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def train_and_evaluate_models(df, feature_columns, target_columns, test_size=0.2, random_state=42):
    """Fit one linear regression per target column and report its held-out MSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the feature columns and the target columns.
    feature_columns : iterable of column labels
        Columns used as predictors; the same set is used for every target.
    target_columns : iterable of column labels
        Columns to predict; one independent model is trained for each.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    dict
        Maps each target label to ``{'model': fitted LinearRegression, 'mse': float}``.

    Raises
    ------
    KeyError
        If any requested feature or target column is absent from ``df``.
        Raised before any model is trained.
    """
    feature_columns = list(feature_columns)
    target_columns = list(target_columns)

    # Fail fast: otherwise a misspelled target would only surface after every
    # preceding model had already been trained.
    missing = [col for col in feature_columns + target_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    # The feature matrix is identical for every target, so build it once.
    X = df[feature_columns]

    results = {}
    for target in target_columns:
        print(f"Training model for target: {target}")

        y = df[target]

        # Same seed for every target, so all models see the same row split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate on the held-out rows only.
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        results[target] = {'model': model, 'mse': mse}
        print(f"Mean Squared Error for {target}: {mse}\n")

    return results


In [30]:
def drop_rare_categories(df, threshold=0.05, exclude_columns=None, verbose=True):
    """Drop rows whose categorical value occurs less often than ``threshold``.

    Columns of dtype ``object`` or ``category`` are processed one at a time, in
    column order. Frequencies for each column are computed on the rows that
    survived the previous columns, so earlier drops influence later ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    threshold : float, default 0.05
        A category is rare when its share of the column's non-null values is
        strictly below this proportion.
    exclude_columns : list of column labels, optional
        Categorical columns to leave untouched (e.g. an identifier column).
    verbose : bool, default True
        When True, print one line per column from which rows are removed.

    Returns
    -------
    pandas.DataFrame
        A filtered frame containing only the rows that were kept.
    """
    excluded = set(exclude_columns or ())

    categorical_columns = [
        col
        for col in df.select_dtypes(include=['object', 'category']).columns
        if col not in excluded
    ]

    for col in categorical_columns:
        # Share of each category among the rows that are still present.
        frequencies = df[col].value_counts(normalize=True)
        rare_categories = frequencies[frequencies < threshold].index

        keep = ~df[col].isin(rare_categories)

        # Report only real removals; columns with nothing rare stay silent.
        n_dropped = int((~keep).sum())
        if verbose and n_dropped:
            print(f"{col}: dropping {n_dropped} rows with rare categories {list(rare_categories)}")

        # Always filter (even when nothing is dropped) so the result is a new
        # frame rather than the caller's object.
        df = df[keep]

    return df


def one_hot_encode_categoricals(df, exclude_columns=None):
    """Return a copy of ``df`` with its categorical columns one-hot encoded.

    Columns of dtype ``object`` or ``category`` are expanded with
    ``pd.get_dummies(..., drop_first=True)``, so the first level of each
    column is dropped. Columns listed in ``exclude_columns`` are passed
    through unchanged.
    """
    skipped = [] if exclude_columns is None else exclude_columns

    # Candidate columns by dtype, minus the ones the caller wants left alone.
    candidates = df.select_dtypes(include=['object', 'category']).columns
    to_encode = [name for name in candidates if name not in skipped]

    return pd.get_dummies(df, columns=to_encode, drop_first=True)


In [45]:
# Inner-join the general and COVID EMA tables on the `uid` and `day` keys,
# keeping only (uid, day) pairs that appear in both.
ema_data = general_ema.merge(covid_ema, how='inner', on=['uid', 'day'])
print(ema_data.shape)

(16508, 29)


In [74]:
# Attach the demographics columns (`df`) to every sensing record via `uid`.
merged_data = sensing_df.merge(df, how='inner', on='uid')

# Remove rows holding very rare categorical values, then one-hot encode the
# remaining categorical columns; `uid` is left alone in both steps.
df_cleaned = one_hot_encode_categoricals(
    drop_rare_categories(merged_data, threshold=0.005, exclude_columns=['uid']),
    exclude_columns=['uid'],
)

# Median-impute missing values in every column except `uid`.
# DataFrame.update writes the filled values back into df_cleaned in place.
df_cleaned.update(
    df_cleaned.drop(columns=['uid']).pipe(lambda frame: frame.fillna(frame.median()))
)
df_cleaned.shape

Index([], dtype='object', name='gender')
Index([], dtype='object', name='race')


(214759, 660)

In [75]:
# Keep only EMA rows without any missing value.
ema_data = ema_data.dropna(axis=0, how='any')
print(ema_data.shape)

# Shape of the previous `merged_data` for comparison; the name is rebound
# just below to the join of the cleaned features with the EMA answers.
print(merged_data.shape)
merged_data = df_cleaned.merge(ema_data, how='inner', on=['uid', 'day'])
print(merged_data.shape)

(2024, 29)
(214759, 653)
(2019, 687)


In [76]:
# Display the joined feature + EMA table (the recorded output is truncated by pandas).
merged_data

Unnamed: 0,uid,is_ios,day,act_in_vehicle_ep_0,act_in_vehicle_ep_1,act_in_vehicle_ep_2,act_in_vehicle_ep_3,act_in_vehicle_hr_0,act_in_vehicle_hr_1,act_in_vehicle_hr_10,...,COVID-1,COVID-2,COVID-3,COVID-4,COVID-5,COVID-6,COVID-7,COVID-8,COVID-9,COVID-10
0,1ff6d7f34acb354430e7323a35ff7703,1,20200320,0,0,0,0,0,0,0,...,5.0,7.0,6.0,6.0,4.0,6.0,4.0,2.0,3.0,4.0
1,1ff6d7f34acb354430e7323a35ff7703,1,20200327,0,0,0,0,0,0,0,...,6.0,5.0,5.0,4.0,3.0,5.0,3.0,3.0,3.0,4.0
2,1ff6d7f34acb354430e7323a35ff7703,1,20200331,0,0,0,0,0,0,0,...,3.0,6.0,4.0,3.0,5.0,5.0,3.0,3.0,3.0,3.0
3,1ff6d7f34acb354430e7323a35ff7703,1,20200410,0,0,0,0,0,0,0,...,3.0,4.0,4.0,2.0,2.0,4.0,2.0,4.0,2.0,1.0
4,1ff6d7f34acb354430e7323a35ff7703,1,20200417,0,0,0,0,0,0,0,...,4.0,4.0,3.0,3.0,5.0,5.0,3.0,3.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,6b0083d00297f9c03e00b2cde889b666,1,20200408,0,0,0,0,0,0,0,...,3.0,5.0,6.0,2.0,2.0,3.0,2.0,5.0,3.0,5.0
2015,6b0083d00297f9c03e00b2cde889b666,1,20200416,0,0,0,0,0,0,0,...,4.0,5.0,5.0,3.0,3.0,3.0,2.0,5.0,3.0,5.0
2016,6b0083d00297f9c03e00b2cde889b666,1,20200418,0,0,0,0,0,0,0,...,3.0,4.0,4.0,2.0,2.0,2.0,2.0,5.0,2.0,4.0
2017,6b0083d00297f9c03e00b2cde889b666,1,20200423,0,0,0,0,0,0,0,...,3.0,4.0,4.0,2.0,2.0,2.0,2.0,5.0,3.0,5.0


In [80]:
# Every EMA column other than the two join keys is treated as a prediction target.
target_cols = ema_data.columns.tolist()
for join_key in ("uid", "day"):
    target_cols.remove(join_key)

# Features: every merged column that is neither a target nor the `uid` identifier.
# NOTE(review): `day` therefore stays in the feature set — confirm this is intended.
non_feature = set(target_cols) | {'uid'}
feature_cols = [name for name in merged_data.columns if name not in non_feature]
print(len(feature_cols))


659


In [81]:
# Inspect the one-hot `gender_M` column; the recorded output shows it has dtype bool.
print(merged_data['gender_M'])

0       False
1       False
2       False
3       False
4       False
        ...  
2014     True
2015     True
2016     True
2017     True
2018     True
Name: gender_M, Length: 2019, dtype: bool


In [82]:
# Fit one linear model per EMA target on the shared feature set.
results = train_and_evaluate_models(merged_data, feature_cols, target_cols)

# Summarise the held-out error of each fitted model.
for target, result in results.items():
    target_mse = result['mse']
    print(f"Target: {target}, MSE: {target_mse}")

Training model for target: pam
Mean Squared Error for pam: 201.7482976254648

Training model for target: phq4-1
Mean Squared Error for phq4-1: 3.2073719356443817

Training model for target: phq4-2
Mean Squared Error for phq4-2: 16.96795028180641

Training model for target: phq4-3
Mean Squared Error for phq4-3: 3.5522852708840205

Training model for target: phq4-4
Mean Squared Error for phq4-4: 2.0402228836856895

Training model for target: phq4_resp_mean
Mean Squared Error for phq4_resp_mean: 147893.7702559271

Training model for target: phq4_resp_median
Mean Squared Error for phq4_resp_median: 22.53138582480347

Training model for target: phq4_score
Mean Squared Error for phq4_score: 55.17430610003855

Training model for target: social_level
Mean Squared Error for social_level: 25.451549931745884

Training model for target: sse3-1
Mean Squared Error for sse3-1: 3.7798680643207696

Training model for target: sse3-2
Mean Squared Error for sse3-2: 3.8845590035556965

Training model for t

In [65]:
# Collect every non-numeric column of df_cleaned except the `uid` identifier.
# NOTE(review): this cell's execution count (65) is lower than that of the cell
# that builds df_cleaned (74) — confirm where it belongs in a top-to-bottom run.
non_numeric_columns = [
    name
    for name in df_cleaned.select_dtypes(exclude=['number']).columns
    if name != 'uid'
]

# Cast those columns to float64 so True/False indicators become 1.0/0.0.
df_cleaned[non_numeric_columns] = df_cleaned[non_numeric_columns].astype('float64')

# Preview the converted columns.
print(df_cleaned[non_numeric_columns].head())

   gender_M  gender_both  race_american indian/alaska native  \
0       0.0          0.0                                 0.0   
1       0.0          0.0                                 0.0   
2       0.0          0.0                                 0.0   
3       0.0          0.0                                 0.0   
4       0.0          0.0                                 0.0   

   race_american indian/white  race_asian  race_black  race_more than one  \
0                         0.0         1.0         0.0                 0.0   
1                         0.0         1.0         0.0                 0.0   
2                         0.0         1.0         0.0                 0.0   
3                         0.0         1.0         0.0                 0.0   
4                         0.0         1.0         0.0                 0.0   

   race_other/hispanic  race_white  
0                  0.0         0.0  
1                  0.0         0.0  
2                  0.0         0.0  
3   

In [67]:
# Display the merged EMA table (the recorded output is truncated by pandas).
ema_data

Unnamed: 0,uid,day,pam,phq4-1,phq4-2,phq4-3,phq4-4,phq4_resp_mean,phq4_resp_median,phq4_score,...,COVID-1,COVID-2,COVID-3,COVID-4,COVID-5,COVID-6,COVID-7,COVID-8,COVID-9,COVID-10
1,1ff6d7f34acb354430e7323a35ff7703,20200320,2.0,1.0,1.0,1.0,1.0,1.061805,0.950987,4.0,...,5.0,7.0,6.0,6.0,4.0,6.0,4.0,2.0,3.0,4.0
2,1ff6d7f34acb354430e7323a35ff7703,20200327,4.0,1.0,1.0,1.0,1.0,0.877866,0.607991,4.0,...,6.0,5.0,5.0,4.0,3.0,5.0,3.0,3.0,3.0,4.0
3,1ff6d7f34acb354430e7323a35ff7703,20200331,3.0,1.0,1.0,1.0,1.0,0.903385,0.642661,4.0,...,3.0,6.0,4.0,3.0,5.0,5.0,3.0,3.0,3.0,3.0
4,1ff6d7f34acb354430e7323a35ff7703,20200410,4.0,1.0,1.0,1.0,1.0,0.878460,0.738582,4.0,...,3.0,4.0,4.0,2.0,2.0,4.0,2.0,4.0,2.0,1.0
5,1ff6d7f34acb354430e7323a35ff7703,20200417,3.0,1.0,1.0,1.0,1.0,0.788469,0.508463,4.0,...,4.0,4.0,3.0,3.0,5.0,5.0,3.0,3.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16474,6b0083d00297f9c03e00b2cde889b666,20200408,13.0,0.0,0.0,1.0,0.0,1.842248,1.696259,1.0,...,3.0,5.0,6.0,2.0,2.0,3.0,2.0,5.0,3.0,5.0
16475,6b0083d00297f9c03e00b2cde889b666,20200416,7.0,0.0,0.0,1.0,0.0,3.019619,3.159822,1.0,...,4.0,5.0,5.0,3.0,3.0,3.0,2.0,5.0,3.0,5.0
16476,6b0083d00297f9c03e00b2cde889b666,20200418,7.0,0.0,1.0,0.0,0.0,11.109188,1.464542,1.0,...,3.0,4.0,4.0,2.0,2.0,2.0,2.0,5.0,2.0,4.0
16477,6b0083d00297f9c03e00b2cde889b666,20200423,1.0,0.0,0.0,0.0,0.0,1.833260,1.379733,0.0,...,3.0,4.0,4.0,2.0,2.0,2.0,2.0,5.0,3.0,5.0
