## Imports

In [163]:
import pandas as pd
import os

from src import files_import
from src import to_csv
from src import cleaning
import numpy as np

## Variables

In [164]:
# Root folder of the EMA part of the dataset
EMA_dictionary = "data/StudentLife/dataset/EMA"

# One response folder per survey type
mood_directory_path = "data/StudentLife/dataset/EMA/response/Mood"
mood1_directory_path = "data/StudentLife/dataset/EMA/response/Mood 1"
mood2_directory_path = "data/StudentLife/dataset/EMA/response/Mood 2"
exercise_directory_path = "data/StudentLife/dataset/EMA/response/Exercise"
stress_directory_path = "data/StudentLife/dataset/EMA/response/Stress"
social_directory_path = "data/StudentLife/dataset/EMA/response/Social"
sleep_directory_path = "data/StudentLife/dataset/EMA/response/Sleep"
activity_directory_path = "data/StudentLife/dataset/EMA/response/Activity"

# Stand-alone files
grades_file_path = "data/StudentLife/dataset/education/grades.csv"

# (survey name, response folder, column passed to the loader as 'Important_Column')
_survey_sources = (
    ("Mood", mood_directory_path, 'happyornot'),
    ("Mood1", mood1_directory_path, 'tomorrow'),
    ("Mood2", mood2_directory_path, 'how'),
    ("Exercise", exercise_directory_path, 'have'),
    ("Stress", stress_directory_path, 'level'),
    ("Social", social_directory_path, 'number'),
    ("Sleep", sleep_directory_path, 'hour'),
    ("Activity", activity_directory_path, 'working'),
)

# Survey name -> {'Path': folder, 'Important_Column': column}, in the order above
survey_datasets = {
    name: {'Path': folder, 'Important_Column': column}
    for name, folder, column in _survey_sources
}

# File the combined per-user table is written to
output_path = "output.csv"

# Number of days used to turn per-user totals into per-day averages
semester_length_in_days = 65




## Load the data

In [165]:
# Read every configured survey folder and keep the result under its survey name.
# NOTE(review): the trailing 0 is passed through to folder_to_csv unchanged; its
# meaning is defined in src/to_csv and is not visible from here.
all_data = {
    survey: to_csv.folder_to_csv(dictionary['Path'], dictionary['Important_Column'], 0)
    for survey, dictionary in survey_datasets.items()
}


## Analysing

### Making a combined csv with all types of data

In [None]:
# Accumulator for the final table: output column name -> per-user values
combined_csv = dict()

#### Social and Stress

In [167]:

# Lookup tables from raw answer codes to numeric values.
# Answers stored as a range are mapped to the midpoint of that range.
social_mapping = {
    1: 2,
    2: 7,
    3: 15,
    4: 30,
    5: 75,
    6: 150,
}
stress_mapping = {
    1: 3,
    2: 4,
    3: 5,
    4: 2,
    5: 1,
}

# Add the mapped columns in place on the loaded frames
# (codes missing from a mapping become NaN)
stress_data = all_data['Stress']
stress_data['stress_level'] = stress_data['level'].map(stress_mapping)

social_data = all_data['Social']
social_data['number_of_people'] = social_data['number'].map(social_mapping)

# Per-user means of the mapped values
average_social = social_data.groupby('User')['number_of_people'].mean()
average_stress = stress_data.groupby('User')['stress_level'].mean()

combined_csv.update(
    number_of_people=average_social,
    avg_stress_level=average_stress,
)


#### Exercise

In [169]:

# Answer code -> minutes of activity. Code 1 maps to 0 minutes, i.e. no activity.
exercise_mapping = {
    1:0, 2:15, 3:45, 4:75, 5:110
}

# Add the mapped duration columns in place (unmapped codes become NaN)
exercise_data = all_data['Exercise']
exercise_data['exercise_time'] = exercise_data['exercise'].map(exercise_mapping)
exercise_data['walk_time'] = exercise_data['walk'].map(exercise_mapping)

# Per-user mean durations
average_exercise_time = exercise_data.groupby('User')['exercise_time'].mean()
average_walk_time = exercise_data.groupby('User')['walk_time'].mean()

# Count workouts as entries with a non-zero exercise duration.
# The previous filter (`exercise == 1`) selected exactly the entries that
# exercise_mapping turns into 0 minutes, so it counted the non-workouts.
# Comparing NaN with 0 is False, so unanswered entries are not counted, and
# users without any workout get 0 instead of being absent from the result.
did_exercise = exercise_data['exercise_time'] > 0
exercise_amount = did_exercise.groupby(exercise_data['User']).sum()
exercise_avg_per_day = exercise_amount/semester_length_in_days

combined_csv['amount_of_workouts'] = exercise_amount
combined_csv['avg_workout_per_day'] = exercise_avg_per_day
combined_csv['avg_workout_time'] = average_exercise_time
combined_csv['avg_walk_time'] = average_walk_time

#### Sleep


In [170]:
# Answer code -> hours of sleep
sleep_hour_mapping = {
    1: 2,
    2: 3.5,
    3: 4,
    4: 4.5,
    5: 5,
    6: 5.5,
    7: 6,
    8: 6.5,
    9: 7,
    10: 7.5,
    11: 8,
    12: 8.5,
    13: 9,
    14: 9.5,
    15: 10,
    16: 10.5,
    17: 11,
    18: 11.5,
    19: 12,
}

# Add the mapped hours in place (unmapped codes become NaN)
sleep_data = all_data['Sleep']
sleep_data['sleep_hours'] = sleep_data['hour'].map(sleep_hour_mapping)

# Per-user means of the mapped hours and of the raw 'rate' answers
sleep_by_user = sleep_data.groupby('User')
average_sleep_hours = sleep_by_user['sleep_hours'].mean()
average_sleep_rating = sleep_by_user['rate'].mean()

combined_csv.update(
    avg_sleep_hours=average_sleep_hours,
    avg_sleep_rating=average_sleep_rating,
)


#### Mood

In [171]:
# Work on a copy so the cleaning below neither mutates the loaded frame nor
# triggers pandas' SettingWithCopy warning.
sad_happy_data = all_data['Mood'].copy()
feeling_data = all_data['Mood2']

# Cleaning the data
# Fill the rating with 0 where the yes/no answer is 2 BEFORE dropping missing
# ratings; dropping first would remove those rows before they could be filled.
# NOTE(review): assumes code 2 in 'happyornot'/'sadornot' means "no" - confirm
# against the EMA definition.
sad_happy_data.loc[sad_happy_data['happyornot'] == 2, 'happy'] = 0
sad_happy_data.loc[sad_happy_data['sadornot'] == 2, 'sad'] = 0
sad_happy_data = sad_happy_data.dropna(subset=['happy', 'sad'])

# Analysing
average_happy_rating = sad_happy_data.groupby('User')['happy'].mean()
average_sad_rating = sad_happy_data.groupby('User')['sad'].mean()

# Count entries rather than summing the answer codes: summing weighted code 2
# twice and code 3 three times, which distorted every percentage below.
# The boolean sums give 0 (not a missing value) for users who never chose a code.
# NOTE(review): the labels happy=1, stressed=2, sad=3 are taken from the
# existing variable names - confirm against the EMA definition.
mood_user = feeling_data['User']
total_mood_entries = feeling_data.groupby('User')['how'].count()
happy_entry_sum = (feeling_data['how'] == 1).groupby(mood_user).sum()
sad_entry_sum = (feeling_data['how'] == 3).groupby(mood_user).sum()
stressed_entry_sum = (feeling_data['how'] == 2).groupby(mood_user).sum()


# Adding to combined df
combined_csv['avg_happy_rating'] = average_happy_rating
combined_csv['avg_sad_rating'] = average_sad_rating
combined_csv['happy_percentage'] = (happy_entry_sum/total_mood_entries)*100
combined_csv['sad_percentage'] = (sad_entry_sum/total_mood_entries)*100
combined_csv['stressed_percentage'] = (stressed_entry_sum/total_mood_entries)*100

#### Study/Relax Time

In [172]:
# Answer code -> percentage value used for the calculations below
percentage_spent_mapping = {
    1: 5,
    2: 18,
    3: 38,
    4: 63,
    5: 88,
}
time_spent_data = all_data['Activity']

# (new mapped column, raw answer column); added in place, unmapped codes become NaN
_mapped_columns = (
    ('alone_working', 'working'),
    ('alone_relaxing', 'relaxing'),
    ('together_working', 'other_working'),
    ('together_relaxing', 'other_relaxing'),
)
for _mapped, _raw in _mapped_columns:
    time_spent_data[_mapped] = time_spent_data[_raw].map(percentage_spent_mapping)

# Sum of the four mapped values for each entry
time_spent_data['total_time_spent'] = (
    time_spent_data['together_relaxing']
    + time_spent_data['together_working']
    + time_spent_data['alone_relaxing']
    + time_spent_data['alone_working']
)

# Percentages: (share column, first part, second part, output column for the mean)
_share_columns = (
    ('alone_percentage', 'alone_working', 'alone_relaxing', 'avg_alone_percentage'),
    ('together_percentage', 'together_working', 'together_relaxing', 'avg_together_percentage'),
    ('working_percentage', 'alone_working', 'together_working', 'avg_working_percentage'),
    ('relaxing_percentage', 'alone_relaxing', 'together_relaxing', 'avg_relaxing_percentage'),
)
for _share, _first, _second, _ in _share_columns:
    _parts = time_spent_data[_first] + time_spent_data[_second]
    time_spent_data[_share] = _parts / time_spent_data['total_time_spent'] * 100

# adding averages (per-user mean of each share)
for _share, _, _, _avg_key in _share_columns:
    combined_csv[_avg_key] = time_spent_data.groupby('User')[_share].mean()




#### Creating the .csv

In [173]:

# Each entry of combined_csv becomes one column; the index is written to the
# file as well (to_csv's default).
df = pd.DataFrame(combined_csv)
df.to_csv(output_path)

#### Grades

  # Path to the grades file. Forward slashes are used throughout: the previous
  # backslashes were invalid escape sequences in a normal string literal and a
  # Windows-only separator. This matches grades_file_path defined earlier.
  file_path = "data/StudentLife/dataset/education/grades.csv"
