In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# The data 

In [None]:
DATA_DIR = '/Users/fletchercollis/Desktop/Machine learning for behavioral data/PROJECT_GoGYMi/GoGymi' ### put your path here


def _read_table(name):
    """Load `<DATA_DIR>/<name>.csv` into a DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{name}.csv')


# Platform activity log.
activity = _read_table('activity')

# User tables.
students = _read_table('students')
teachers = _read_table('teachers')

gymitrainer = _read_table('gymitrainer')

# Question / text content tables.
math_questions = _read_table('math_questions') ### NOT USEFUL HERE YET SO NOT LOOKED AT OR CLEANED
text_questions = _read_table('text_questions') ### NOT USEFUL HERE YET SO NOT LOOKED AT OR CLEANED
texts = _read_table('texts') ### NOT USEFUL HERE YET SO NOT LOOKED AT OR CLEANED

# Per-subject result tables.
math_results = _read_table('math_results')
essay_results = _read_table('essay_results')
text_results = _read_table('text_results')

all_scores = _read_table('all_scores')

# First, let's clean up the time columns in every dataframe to make them easier to manipulate!

In [None]:
# Every timestamp column below is parsed with unit='s' (seconds since the epoch).

for col in ('activity_started', 'activity_completed', 'activity_updated'):
    activity[col] = pd.to_datetime(activity[col], unit='s')

for users in (students, teachers):
    users['creation_time'] = pd.to_datetime(users['creation_time'], unit='s')

# gymitrainer keeps its raw camelCase columns; the parsed values go into new snake_case columns.
for parsed_col, raw_col in (('start_time', 'startTime'), ('end_time', 'endTime')):
    gymitrainer[parsed_col] = pd.to_datetime(gymitrainer[raw_col], unit='s')

for results in (math_results, essay_results, text_results):
    results['time'] = pd.to_datetime(results['time'], unit='s')

all_scores['time'] = pd.to_datetime(all_scores['time'], unit='s')

In [None]:
# Display the activity frame after timestamp conversion (bare last expression -> rich notebook output).
activity

# Cleaning the user data

### activity dataframe 

In [None]:
# Snapshot of the activity frame before any cleaning; used later as the baseline
# when measuring how much data the cleaning steps removed.
activity_og = activity.copy()

- First, let's see if there are any missing or problematic values

In [None]:
# Per-column count of missing values, shown as a two-column table.
activity.isna().sum().reset_index(name='Nb of NAN')

- `activity_completed` is the only column with NaN values, so we replace them with the last updated time

In [None]:
# Where the completion time is missing, fall back to the last update time of the same row.
completed = activity['activity_completed']
last_update = activity['activity_updated']
activity.loc[:, 'activity_completed'] = completed.where(completed.notna(), last_update)

- We also notice that some values in the `activity_completed` column are dated 1970-01-01, so we replace them with the last updated time as well

In [None]:
# Completion times before 2020 are treated as invalid and overwritten with the row's
# last update time (the assignment aligns on the index).
stale_completion = activity['activity_completed'] < '2020-01-01'
activity.loc[stale_completion, 'activity_completed'] = activity['activity_updated']

- Now we look at the time spent on activities

In [None]:
# Duration of each activity in seconds (completion time minus start time).
activity['time_spent'] = (activity['activity_completed'] - activity['activity_started']).dt.total_seconds()

plt.figure(figsize=(6, 4))
sns.boxplot(x=activity['time_spent'], color='blue')
plt.xlabel('Time Spent (seconds)')
plt.title('Box Plot of Time Spent on Activities')
plt.show()

# Count ROWS with a negative duration. `DataFrame.size` would return rows x columns
# and overstate the number, so sum the boolean mask instead.
nb_negative_rows = int((activity['time_spent'] < 0).sum())
print('Nb of rows where the length of the activity is negative :', nb_negative_rows)

- Since there are few we decided to remove the problematic rows 

In [None]:
# Keep only strictly positive durations; zero-length and NaN durations are dropped
# along with the negative ones.
has_positive_duration = activity['time_spent'].gt(0)
activity = activity[has_positive_duration]

- How much data did we lose?

In [None]:
# Fraction of the original ROWS still present. Row counts are compared rather than
# `.size` (rows x columns): `activity` gained a `time_spent` column after the
# `activity_og` snapshot was taken, so a `.size` ratio would understate the loss.
activity_data_loss = len(activity) / len(activity_og)
print('Percent of original data lost', 100 - activity_data_loss * 100)

- Now let's look at the distribution of time spent on activities

In [None]:
# Distribution of activity durations after removing non-positive values.
data = activity['time_spent']

plt.hist(data, bins=100, color='teal', edgecolor='black', alpha=0.7)
# Descriptive title / axis label (replaces the placeholder title) so the figure
# stands on its own and matches the box plot's labelling.
plt.title('Distribution of Time Spent on Activities')
plt.xlabel('Time Spent (seconds)')
plt.ylabel('Frequency')
plt.show()

- There seem to be some outliers, so let's remove activities that seem to be taking too long -----> NEED TO DECIDE WHAT TOO LONG IS

In [None]:
# Summary statistics of the duration, broken down by activity type.
time_spent_by_type = activity.groupby('activity_type')['time_spent']
time_spent_by_type.describe()

In [None]:
# 95th-percentile duration for each activity type (Series indexed by activity_type).
quantiles = activity.groupby(by='activity_type')['time_spent'].quantile(0.95)

# Look up each row's type-specific cutoff in one vectorized step instead of a
# row-wise `apply`. Rows whose activity_type has no cutoff (e.g. missing type) map
# to NaN and are dropped by the comparison rather than raising a KeyError.
cutoffs = activity['activity_type'].map(quantiles)

# NOTE: this cell is not idempotent -- re-running it recomputes the quantiles on the
# already-trimmed frame and trims again.
activity = activity[activity['time_spent'] <= cutoffs]

- Now let's see how much data was deleted overall:

In [None]:
# Fraction of the original ROWS that survived all activity cleaning steps. Row
# counts are used instead of `.size` (rows x columns) because `activity` now has an
# extra `time_spent` column that the `activity_og` snapshot lacks.
activity_data_loss = len(activity) / len(activity_og)
print('Percent of data lost', 100 - activity_data_loss * 100)

## Cleaning the test data

### all_scores dataframe

In [None]:
# Snapshot of the scores frame before cleaning; used later as the baseline when
# measuring how much data was removed.
all_scores_og = all_scores.copy()

- First, let's see if there are any missing or problematic values

In [None]:
# Per-column count of missing values, shown as a two-column table.
all_scores.isna().sum().reset_index(name='Nb of NAN')

In [None]:
# Show the score rows timestamped before 2023.
before_2023 = all_scores['time'] < '2023-01-01'
all_scores.loc[before_2023]

- Now let's see if some users redid the same exam more than once

In [None]:
# Number of recorded attempts for every (user, test, course) combination.
attempt_count = all_scores.groupby(['user_id', 'test_id','course']).size().reset_index(name='attempt_count')

# Number of combinations attempted more than once. Summing the boolean mask counts
# rows; `.size` would count cells (rows x columns), and `== 2` would miss
# combinations attempted three or more times.
(attempt_count['attempt_count'] > 1).sum()

- Let's get rid of everything past the first attempt ----> HERE DISCUSS

In [None]:
# Keep only the earliest attempt for each (user, test, course) combination.
#
# `GroupBy.first()` is deliberately avoided: it returns the first NON-NULL value of
# each column separately, so a first attempt with a missing field would be patched
# with values from a later attempt. `drop_duplicates` keeps whole rows intact.
attempt_keys = ['user_id', 'test_id', 'course']

all_scores = (
    all_scores
    .dropna(subset=attempt_keys)             # groupby also discarded rows with a missing key
    .sort_values(by='time', kind='stable')   # earliest attempt first; stable on tied times
    .drop_duplicates(subset=attempt_keys, keep='first')
    .sort_values(by=attempt_keys)            # same row order the groupby produced
    .reset_index(drop=True)
)

# Same column layout as before: key columns first, remaining columns in original order.
all_scores = all_scores[attempt_keys + [col for col in all_scores.columns if col not in attempt_keys]]

- Now let's see how much data was deleted overall:

In [None]:
# Ratio of remaining cells to original cells, reported as a percentage lost.
all_scores_data_loss = all_scores.size / all_scores_og.size
percent_lost = 100 - all_scores_data_loss * 100
print('Percent of data lost', percent_lost)

# Now let's export our cleaned data !

In [None]:
# Uncomment this to save the cleaned dataframes :)

# activity.to_csv('activity_cleaned.csv', index=False)    
# all_scores.to_csv('all_scores_cleaned.csv', index=False)        