# Preprocessing donor info
The original file is '/Users/dionnespaltman/Desktop/V3/FAINT_Info_Personality_timepoints.csv'. 

The clean file is '/Users/dionnespaltman/Desktop/V3/clean_donor_info.csv'.

Output is 'processed_donor_info' and 'VVR_scores'. 

In [2]:
import pandas as pd
import csv 

In [3]:
path_name = '/Users/dionnespaltman/Desktop/V3/clean_donor_info.csv'
clean_donor_info = pd.read_csv(path_name)

In [4]:
clean_donor_info.rename(columns={'Unnamed: 0': 'Index'}, inplace=True)

In [5]:
display(clean_donor_info.head(5))

Unnamed: 0,Index,ID,Stage,Gender,Age,Date,Location,Condition,Weight,Length,...,Dizziness,Weakness,Lightheadedness,Fear,Tension,Stress,Nervousness,Physical_sum,Psychological_sum,na.rm
0,0,5,1.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,2.0,2.0,2.0,2.0,4.0,8.0,True
1,1,5,2.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,True
2,2,5,3.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,True
3,3,5,4.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,True
4,4,5,5.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,True


## Adding sum scores per stage

In [6]:
# Sum the specified measurements to create the new column
clean_donor_info['sum_VVR'] = clean_donor_info[['Faintness', 'Dizziness', 'Weakness', 'Lightheadedness', 'Fear', 'Tension', 'Stress', 'Nervousness']].sum(axis=1, skipna=True)

# Get the maximum and minimum values of the sum_VVR column
max_sum_VVR = clean_donor_info['sum_VVR'].max()
min_sum_VVR = clean_donor_info['sum_VVR'].min()

print("Maximum value of sum_VVR:", max_sum_VVR)
print("Minimum value of sum_VVR:", min_sum_VVR)

display(clean_donor_info.head(5))

Maximum value of sum_VVR: 40.0
Minimum value of sum_VVR: 8.0


Unnamed: 0,Index,ID,Stage,Gender,Age,Date,Location,Condition,Weight,Length,...,Weakness,Lightheadedness,Fear,Tension,Stress,Nervousness,Physical_sum,Psychological_sum,na.rm,sum_VVR
0,0,5,1.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,2.0,2.0,2.0,2.0,4.0,8.0,True,12.0
1,1,5,2.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,True,10.0
2,2,5,3.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,True,10.0
3,3,5,4.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,True,9.0
4,4,5,5.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,True,9.0


## Creating feature: stage 1 and 2

In [7]:
# Filter rows where Stage is 1 or 2
filtered_df= clean_donor_info[clean_donor_info['Stage'].isin([1, 2])]

# Group by ID and calculate the sum of sum_VVR
VVR_scores = filtered_df.groupby('ID')['sum_VVR'].sum().reset_index()

# Rename the column to sum_12
VVR_scores.rename(columns={'sum_VVR': 'sum_12'}, inplace=True)

# Get the maximum and minimum values of the sum_VVR column
max_sum_VVR = VVR_scores['sum_12'].max()
min_sum_VVR = VVR_scores['sum_12'].min()

print("Maximum value of sum_12:", max_sum_VVR)
print("Minimum value of sum_12:", min_sum_VVR)

# Now sum_12 DataFrame contains ID and the sum of sum_VVR for Stage 1 or 2
display(VVR_scores.head(5))

Maximum value of sum_12: 42.0
Minimum value of sum_12: 16.0


Unnamed: 0,ID,sum_12
0,5,22.0
1,6,16.0
2,7,23.0
3,8,17.0
4,9,16.0


## Creating dependent variable: sum VVR stages 4, 5, 6, 7

In [8]:
# Filter rows where Stage is 4, 5, 6, or 7
filtered_df_4567 = clean_donor_info[clean_donor_info['Stage'].isin([4, 5, 6, 7])]

# Group by ID and calculate the sum of sum_VVR
sum_4567 = filtered_df_4567.groupby('ID')['sum_VVR'].sum().reset_index()

# Create a new column in VVR_scores and assign the sum_4567 values
VVR_scores['sum_4567'] = sum_4567['sum_VVR'].tolist()

# Get the maximum and minimum values of the sum_VVR column
max_sum_VVR = VVR_scores['sum_4567'].max()
min_sum_VVR = VVR_scores['sum_4567'].min()

print("Maximum value of sum_4567:", max_sum_VVR)
print("Minimum value of sum_4567:", min_sum_VVR)

display(VVR_scores)

Maximum value of sum_4567: 82.0
Minimum value of sum_4567: 32.0


Unnamed: 0,ID,sum_12,sum_4567
0,5,22.0,34.0
1,6,16.0,32.0
2,7,23.0,77.0
3,8,17.0,32.0
4,9,16.0,32.0
...,...,...,...
315,328,20.0,35.0
316,329,18.0,35.0
317,330,23.0,35.0
318,331,28.0,38.0


## EXTRA: creating sum of stage 4, 5, 6 (in the donation chair)

In [9]:
# Filter rows where Stage is 4, 5 or 6
filtered_df_456 = clean_donor_info[clean_donor_info['Stage'].isin([4, 5, 6])]

# Group by ID and calculate the sum of sum_VVR
sum_456 = filtered_df_456.groupby('ID')['sum_VVR'].sum().reset_index()

# Create a new column in VVR_scores and assign the sum_4567 values
VVR_scores['sum_456'] = sum_456['sum_VVR'].tolist()

# Get the maximum and minimum values of the sum_VVR column
max_sum_VVR = VVR_scores['sum_456'].max()
min_sum_VVR = VVR_scores['sum_456'].min()

print("Maximum value of sum_456:", max_sum_VVR)
print("Minimum value of sum_456:", min_sum_VVR)

display(VVR_scores)

Maximum value of sum_456: 69.0
Minimum value of sum_456: 24.0


Unnamed: 0,ID,sum_12,sum_4567,sum_456
0,5,22.0,34.0,26.0
1,6,16.0,32.0,24.0
2,7,23.0,77.0,65.0
3,8,17.0,32.0,24.0
4,9,16.0,32.0,24.0
...,...,...,...,...
315,328,20.0,35.0,27.0
316,329,18.0,35.0,27.0
317,330,23.0,35.0,26.0
318,331,28.0,38.0,30.0


## Mean and creation of low and high VVR group

In [13]:
# Calculate the mean of the 'sum_456' column, excluding NaN values
mean_sum_4567 = VVR_scores['sum_4567'].mean(skipna=False)

# Print the mean value
print("Mean of 'sumVVR_4567' column (excluding NaN values):", mean_sum_4567)

# Create a new column 'VVR_group' based on the condition
VVR_scores['VVR_group'] = VVR_scores['sum_456'].apply(lambda x: 0 if x < mean_sum_4567 else 1)

display(VVR_scores.head(5))

Mean of 'sumVVR_4567' column (excluding NaN values): 39.4


Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group
0,5,22.0,34.0,26.0,0
1,6,16.0,32.0,24.0,0
2,7,23.0,77.0,65.0,1
3,8,17.0,32.0,24.0,0
4,9,16.0,32.0,24.0,0


In [12]:
# Count the occurrences of each unique value in the 'VVR_group' column
vvr_group_counts = VVR_scores['VVR_group'].value_counts()

# Print the counts
print("Count of values in 'VVR_group' column:")
print(vvr_group_counts)


Count of values in 'VVR_group' column:
0    290
1     30
Name: VVR_group, dtype: int64


## Adding condition to VVR_scores

In [69]:
# Merge VVR_scores with clean_donor_info on the 'ID' column
VVR_scores = pd.merge(VVR_scores, clean_donor_info[['ID', 'Condition']], on='ID', how='left')

# Display VVR_scores
display(VVR_scores)

Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition
0,5,22.0,34.0,26.0,0,1
1,5,22.0,34.0,26.0,0,1
2,5,22.0,34.0,26.0,0,1
3,5,22.0,34.0,26.0,0,1
4,5,22.0,34.0,26.0,0,1
...,...,...,...,...,...,...
2044,332,30.0,57.0,47.0,1,2
2045,332,30.0,57.0,47.0,1,2
2046,332,30.0,57.0,47.0,1,2
2047,332,30.0,57.0,47.0,1,2


Double check if the count occurences of condition are the same in both dataframes. 

In [70]:
# Count of unique values in the 'Condition' column in clean_donor_info
clean_donor_info_condition_count = clean_donor_info['Condition'].value_counts()

# Count of unique values in the 'Condition' column in VVR_scores
VVR_scores_condition_count = VVR_scores['Condition'].value_counts()

# Display the counts
print("Count of Condition in clean_donor_info:")
print(clean_donor_info_condition_count)

print("\nCount of Condition in VVR_scores:")
print(VVR_scores_condition_count)

Count of Condition in clean_donor_info:
3    1084
1     534
2     431
Name: Condition, dtype: int64

Count of Condition in VVR_scores:
3    1084
1     534
2     431
Name: Condition, dtype: int64


## Reducing VVR_scores 

In [9]:
import pandas as pd 

VVR_scores = pd.read_csv('/Users/dionnespaltman/Desktop/V3/VVR_scores.csv')

In [13]:
# Keep only one row per ID
VVR_scores_filtered = VVR_scores.drop_duplicates(subset='ID', keep='first')

In [14]:
display(VVR_scores_filtered)

Unnamed: 0.1,Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition
0,0,5,22.0,34.0,26.0,0,1
7,7,6,16.0,32.0,24.0,0,2
14,14,7,23.0,77.0,65.0,1,2
21,21,8,17.0,32.0,24.0,0,1
28,28,9,16.0,32.0,24.0,0,1
...,...,...,...,...,...,...,...
2019,2019,328,20.0,35.0,27.0,0,3
2025,2025,329,18.0,35.0,27.0,0,3
2031,2031,330,23.0,35.0,26.0,0,3
2037,2037,331,28.0,38.0,30.0,0,3


## Saving the processed file

In [5]:
display(clean_donor_info.head(5))
display(VVR_scores.head(5))

NameError: name 'clean_donor_info' is not defined

In [15]:
display(VVR_scores_filtered.head(5))

Unnamed: 0.1,Unnamed: 0,ID,sum_12,sum_4567,sum_456,VVR_group,Condition
0,0,5,22.0,34.0,26.0,0,1
7,7,6,16.0,32.0,24.0,0,2
14,14,7,23.0,77.0,65.0,1,2
21,21,8,17.0,32.0,24.0,0,1
28,28,9,16.0,32.0,24.0,0,1


In [72]:
#clean_donor_info.to_csv('/Users/dionnespaltman/Desktop/V3/processed_donor_info.csv', sep=',')

In [73]:
#VVR_scores.to_csv('/Users/dionnespaltman/Desktop/V3/VVR_scores.csv', sep=',')

In [16]:
# VVR_scores_filtered.to_csv('/Users/dionnespaltman/Desktop/V3/VVR_scores_final.csv', sep=',')