# Preprocessing donor info

In [1]:
import pandas as pd
import csv 

First we load the data. 

In [2]:
# Location of the raw VVR measurements export
path_name = '/Users/dionnespaltman/Desktop/V4/VVR_measurements/VVR_measurements.csv'

# Read the file and give the unnamed leading column (the saved row index) a proper name
VVR_measurements = pd.read_csv(path_name).rename(columns={'Unnamed: 0': 'Index'})

In [3]:
# Preview the first five rows to check the load and the renamed 'Index' column
display(VVR_measurements.head(5))

Unnamed: 0,Index,ID,Stage,Gender,Age,Date,Location,Condition,Weight,Length,Faintness,Dizziness,Weakness,Lightheadedness,Fear,Tension,Stress,Nervousness,Physical_sum,Psychological_sum
0,0,5,1.0,2,33.0,2019-10-21,0,1,74.0,171.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,4.0,8.0
1,1,5,2.0,2,33.0,2019-10-21,0,1,74.0,171.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0
2,2,5,3.0,2,33.0,2019-10-21,0,1,74.0,171.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0
3,3,5,4.0,2,33.0,2019-10-21,0,1,74.0,171.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0
4,4,5,5.0,2,33.0,2019-10-21,0,1,74.0,171.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0


# Adding sum scores per stage

We'll compute a total VVR score for every measurement (one row per donor and stage) by summing the eight symptom ratings.

In [4]:
# The eight symptom ratings that are added up into the total score
symptom_columns = ['Faintness', 'Dizziness', 'Weakness', 'Lightheadedness', 'Fear', 'Tension', 'Stress', 'Nervousness']

# Row-wise total; with skipna=True a missing rating simply contributes nothing to the sum
VVR_measurements['Sum_VVR'] = VVR_measurements[symptom_columns].sum(axis=1, skipna=True)

# Report the observed range of the new column as a quick sanity check
sum_vvr_column = VVR_measurements['Sum_VVR']
max_sum_VVR = sum_vvr_column.max()
min_sum_VVR = sum_vvr_column.min()

print("Maximum value of sum_VVR:", max_sum_VVR)
print("Minimum value of sum_VVR:", min_sum_VVR)

display(VVR_measurements.head(5))

Maximum value of sum_VVR: 40.0
Minimum value of sum_VVR: 8.0


Unnamed: 0,Index,ID,Stage,Gender,Age,Date,Location,Condition,Weight,Length,...,Dizziness,Weakness,Lightheadedness,Fear,Tension,Stress,Nervousness,Physical_sum,Psychological_sum,Sum_VVR
0,0,5,1.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,2.0,2.0,2.0,2.0,4.0,8.0,12.0
1,1,5,2.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,10.0
2,2,5,3.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,6.0,10.0
3,3,5,4.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,9.0
4,4,5,5.0,2,33.0,2019-10-21,0,1,74.0,171.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,5.0,9.0


# Creating feature stage 1 and 2

In [5]:
# Keep only the measurements taken at Stage 1 or Stage 2
in_stage_1_or_2 = VVR_measurements['Stage'].isin([1, 2])
VVR_measurements_filtered = VVR_measurements[in_stage_1_or_2]

# One row per ID: the total of Sum_VVR over those two stages, stored as Sum_12
VVR_scores = (
    VVR_measurements_filtered
    .groupby('ID')['Sum_VVR']
    .sum()
    .reset_index()
    .rename(columns={'Sum_VVR': 'Sum_12'})
)

# Observed range of Sum_12
max_sum_VVR = VVR_scores['Sum_12'].max()
min_sum_VVR = VVR_scores['Sum_12'].min()

print("Maximum value of sum_12:", max_sum_VVR)
print("Minimum value of sum_12:", min_sum_VVR)

# VVR_scores now holds ID and Sum_12
display(VVR_scores.head(5))

Maximum value of sum_12: 42.0
Minimum value of sum_12: 16.0


Unnamed: 0,ID,Sum_12
0,5,22.0
1,6,16.0
2,7,23.0
3,8,17.0
4,9,16.0


# Creating dependent variable: sum VVR stages 4, 5, 6, 7

In [6]:
# Filter rows where Stage is 4, 5, 6, or 7
VVR_measurements_filtered = VVR_measurements[VVR_measurements['Stage'].isin([4, 5, 6, 7])]

# Per-donor total of Sum_VVR over stages 4-7 (one row per ID)
sum_4567 = VVR_measurements_filtered.groupby('ID')['Sum_VVR'].sum().reset_index()

# Attach the totals by matching on ID instead of by row position.
# A positional assignment is only correct when this groupby yields exactly the
# same IDs, in the same order, as the one that built VVR_scores; a donor without
# stage 4-7 rows would otherwise raise a length error or shift other donors' values.
VVR_scores['Sum_4567'] = VVR_scores['ID'].map(sum_4567.set_index('ID')['Sum_VVR'])

# Donors without any stage 4-7 measurement end up as NaN; make that visible
missing_ids = VVR_scores.loc[VVR_scores['Sum_4567'].isna(), 'ID'].tolist()
if missing_ids:
    print("IDs without stage 4-7 measurements:", missing_ids)

# Get the maximum and minimum values of the Sum_4567 column
max_sum_VVR = VVR_scores['Sum_4567'].max()
min_sum_VVR = VVR_scores['Sum_4567'].min()

print("Maximum value of sum_4567:", max_sum_VVR)
print("Minimum value of sum_4567:", min_sum_VVR)

display(VVR_scores)

Maximum value of sum_4567: 82.0
Minimum value of sum_4567: 32.0


Unnamed: 0,ID,Sum_12,Sum_4567
0,5,22.0,34.0
1,6,16.0,32.0
2,7,23.0,77.0
3,8,17.0,32.0
4,9,16.0,32.0
...,...,...,...
315,328,20.0,35.0
316,329,18.0,35.0
317,330,23.0,35.0
318,331,28.0,38.0


## Adding stage 1 and 2 separately

In [8]:
# Select the Stage 1 measurements (fresh 0..n-1 index, as before)
VVR_measurements_filtered = VVR_measurements[VVR_measurements['Stage'] == 1].reset_index(drop=True)

# Stage 1 score per donor, keyed by ID. Summing per ID mirrors how Sum_12 is
# built, so VVR_1 stays consistent with it even if an ID had more than one row.
stage_1_scores = VVR_measurements_filtered.groupby('ID')['Sum_VVR'].sum()

# Look each donor's score up by ID. Assigning the filtered column directly would
# align on row position, which silently mismatches donors as soon as one ID has
# no Stage 1 row or the row order differs from VVR_scores.
VVR_scores['VVR_1'] = VVR_scores['ID'].map(stage_1_scores)

# Display VVR_scores DataFrame
display(VVR_scores)

Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1
0,5,22.0,34.0,12.0
1,6,16.0,32.0,8.0
2,7,23.0,77.0,11.0
3,8,17.0,32.0,9.0
4,9,16.0,32.0,8.0
...,...,...,...,...
315,328,20.0,35.0,10.0
316,329,18.0,35.0,10.0
317,330,23.0,35.0,11.0
318,331,28.0,38.0,14.0


In [9]:
# Select the Stage 2 measurements (fresh 0..n-1 index, as before)
VVR_measurements_filtered = VVR_measurements[VVR_measurements['Stage'] == 2].reset_index(drop=True)

# Stage 2 score per donor, keyed by ID. Summing per ID mirrors how Sum_12 is
# built, so VVR_2 stays consistent with it even if an ID had more than one row.
stage_2_scores = VVR_measurements_filtered.groupby('ID')['Sum_VVR'].sum()

# Look each donor's score up by ID. Assigning the filtered column directly would
# align on row position, which silently mismatches donors as soon as one ID has
# no Stage 2 row or the row order differs from VVR_scores.
VVR_scores['VVR_2'] = VVR_scores['ID'].map(stage_2_scores)

# Display the first rows of VVR_scores
display(VVR_scores.head(5))

Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2
0,5,22.0,34.0,12.0,10.0
1,6,16.0,32.0,8.0,8.0
2,7,23.0,77.0,11.0,12.0
3,8,17.0,32.0,9.0,8.0
4,9,16.0,32.0,8.0,8.0


In [10]:
# Number of missing values per column of VVR_scores
nan_counts = VVR_scores.isnull().sum()

# Keep only the columns that actually contain missing values and report them
nan_columns = nan_counts[nan_counts.gt(0)]
print("Columns with NaN values:", nan_columns, sep="\n")

Columns with NaN values:
Series([], dtype: int64)


# Creating sum of stage 4, 5, 6 (in the donation chair)

In [11]:
# Filter rows where Stage is 4, 5 or 6
VVR_measurements_filtered = VVR_measurements[VVR_measurements['Stage'].isin([4, 5, 6])]

# Per-donor total of Sum_VVR over stages 4-6 (one row per ID)
sum_456 = VVR_measurements_filtered.groupby('ID')['Sum_VVR'].sum().reset_index()

# Attach the totals by matching on ID instead of by row position.
# A positional assignment is only correct when this groupby yields exactly the
# same IDs, in the same order, as VVR_scores; a donor without stage 4-6 rows
# would otherwise raise a length error or shift other donors' values.
VVR_scores['Sum_456'] = VVR_scores['ID'].map(sum_456.set_index('ID')['Sum_VVR'])

# Donors without any stage 4-6 measurement end up as NaN; make that visible
missing_ids = VVR_scores.loc[VVR_scores['Sum_456'].isna(), 'ID'].tolist()
if missing_ids:
    print("IDs without stage 4-6 measurements:", missing_ids)

# Get the maximum and minimum values of the Sum_456 column
max_sum_VVR = VVR_scores['Sum_456'].max()
min_sum_VVR = VVR_scores['Sum_456'].min()

print("Maximum value of sum_456:", max_sum_VVR)
print("Minimum value of sum_456:", min_sum_VVR)

display(VVR_scores)

Maximum value of sum_456: 69.0
Minimum value of sum_456: 24.0


Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2,Sum_456
0,5,22.0,34.0,12.0,10.0,26.0
1,6,16.0,32.0,8.0,8.0,24.0
2,7,23.0,77.0,11.0,12.0,65.0
3,8,17.0,32.0,9.0,8.0,24.0
4,9,16.0,32.0,8.0,8.0,24.0
...,...,...,...,...,...,...
315,328,20.0,35.0,10.0,10.0,27.0
316,329,18.0,35.0,10.0,8.0,27.0
317,330,23.0,35.0,11.0,12.0,26.0
318,331,28.0,38.0,14.0,14.0,30.0


# Mean and creation of low and high VVR group

We are still working with the complete file of all 320 participants. However, we're only going to be using the information from 104 participants. 

In [12]:
# Mean of the stage 4-7 total. With skipna=False a single missing score makes
# the mean NaN instead of being ignored.
mean_sum_4567 = VVR_scores['Sum_4567'].mean(skipna=False)

print("Mean of 'sumVVR_4567' column:", mean_sum_4567)

# VVR_group: 0 when the score is strictly below the mean (low group), 1 otherwise
# (high group). Negating the "<" test keeps the rule that every value that is not
# strictly below the mean is assigned 1.
below_mean = VVR_scores['Sum_4567'] < mean_sum_4567
VVR_scores['VVR_group'] = (~below_mean).astype('int64')

display(VVR_scores.head(5))

Mean of 'sumVVR_4567' column: 39.4


Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2,Sum_456,VVR_group
0,5,22.0,34.0,12.0,10.0,26.0,0
1,6,16.0,32.0,8.0,8.0,24.0,0
2,7,23.0,77.0,11.0,12.0,65.0,1
3,8,17.0,32.0,9.0,8.0,24.0,0
4,9,16.0,32.0,8.0,8.0,24.0,0


In [13]:
# How many donors fall into each VVR_group value
vvr_group_counts = VVR_scores['VVR_group'].value_counts()

# Header line followed by the counts
print("Count of values in 'VVR_group' column:", vvr_group_counts, sep="\n")

Count of values in 'VVR_group' column:
0    216
1    104
Name: VVR_group, dtype: int64


# Adding condition to VVR_scores

In [14]:
# Donor attributes to attach; VVR_measurements has several rows per ID (one per
# stage), so the left join repeats every VVR_scores row once per measurement row.
donor_details = VVR_measurements[['ID', 'Condition', 'Date', 'Gender']]
VVR_scores = VVR_scores.merge(donor_details, how='left', on='ID')

# Show the merged table
display(VVR_scores)

Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2,Sum_456,VVR_group,Condition,Date,Gender
0,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
1,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
2,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
3,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
4,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
...,...,...,...,...,...,...,...,...,...,...
2044,332,30.0,57.0,14.0,16.0,47.0,1,2,2022-12-19,2
2045,332,30.0,57.0,14.0,16.0,47.0,1,2,2022-12-19,2
2046,332,30.0,57.0,14.0,16.0,47.0,1,2,2022-12-19,2
2047,332,30.0,57.0,14.0,16.0,47.0,1,2,2022-12-19,2


Double-check that the counts per condition are the same in both dataframes.

In [15]:
# Rows per Condition value in the measurements table
clean_donor_info_condition_count = VVR_measurements['Condition'].value_counts()

# Rows per Condition value in the merged VVR_scores table
VVR_scores_condition_count = VVR_scores['Condition'].value_counts()

# Print both so they can be compared side by side
print("Count of Condition in clean_donor_info:", clean_donor_info_condition_count, sep="\n")
print("\nCount of Condition in VVR_scores:", VVR_scores_condition_count, sep="\n")

Count of Condition in clean_donor_info:
3    1084
1     534
2     431
Name: Condition, dtype: int64

Count of Condition in VVR_scores:
3    1084
1     534
2     431
Name: Condition, dtype: int64


## Reducing VVR_scores by dropping duplicates

In [16]:
# Flag every row whose ID already appeared earlier, then keep the first row of each ID
is_repeat_row = VVR_scores.duplicated(subset='ID', keep='first')
VVR_scores_filtered = VVR_scores[~is_repeat_row]

In [17]:
# Inspect the reduced table (one row per ID)
display(VVR_scores_filtered)

Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2,Sum_456,VVR_group,Condition,Date,Gender
0,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
7,6,16.0,32.0,8.0,8.0,24.0,0,2,2019-10-25,1
14,7,23.0,77.0,11.0,12.0,65.0,1,2,2019-10-28,2
21,8,17.0,32.0,9.0,8.0,24.0,0,1,2019-10-30,1
28,9,16.0,32.0,8.0,8.0,24.0,0,1,2019-10-31,1
...,...,...,...,...,...,...,...,...,...,...
2019,328,20.0,35.0,10.0,10.0,27.0,0,3,2022-11-23,2
2025,329,18.0,35.0,10.0,8.0,27.0,0,3,2022-11-28,2
2031,330,23.0,35.0,11.0,12.0,26.0,0,3,2022-11-30,2
2037,331,28.0,38.0,14.0,14.0,30.0,0,3,2022-11-30,2


# Saving VVR_scores_filtered

In [37]:
# Write the per-donor table to disk as comma-separated text; the row index is written as well
output_path = '/Users/dionnespaltman/Desktop/V4/VVR_measurements/VVR_measurements_clean.csv'
VVR_scores_filtered.to_csv(output_path, sep=',')

In [38]:
# Show the table that was just written to disk
display(VVR_scores_filtered)

Unnamed: 0,ID,Sum_12,Sum_4567,VVR_1,VVR_2,Sum_456,VVR_group,Condition,Date,Gender
0,5,22.0,34.0,12.0,10.0,26.0,0,1,2019-10-21,2
7,6,16.0,32.0,8.0,8.0,24.0,0,2,2019-10-25,1
14,7,23.0,77.0,11.0,12.0,65.0,1,2,2019-10-28,2
21,8,17.0,32.0,9.0,8.0,24.0,0,1,2019-10-30,1
28,9,16.0,32.0,8.0,8.0,24.0,0,1,2019-10-31,1
...,...,...,...,...,...,...,...,...,...,...
2019,328,20.0,35.0,10.0,10.0,27.0,0,3,2022-11-23,2
2025,329,18.0,35.0,10.0,8.0,27.0,0,3,2022-11-28,2
2031,330,23.0,35.0,11.0,12.0,26.0,0,3,2022-11-30,2
2037,331,28.0,38.0,14.0,14.0,30.0,0,3,2022-11-30,2
