In [1]:
import pandas as pd
import os
import json
import numpy as np
import numpy as np 
import statsmodels.api as sm 
import matplotlib.pyplot as plt 
import seaborn as sns

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# I - Pre-analysis

### Before proceeding to the MRI analyses, we must conduct a pre-analysis of the test subjects: their score in each category will be computed with machine learning algorithms.

## 1 - Preliminary data cleaning

In [2]:
# Load phenotype data associated with participants
phenotype_path = '/home/skander/ds004920/phenotype/'

# Spellings of "missing" that should be read as NaN in every TSV file
na_markers = ['n/a', 'NA', 'na', 'N/A', '']

# List all files in the phenotype directory.
# Sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the column order of the concatenated frame vary between runs.
files = sorted(os.listdir(phenotype_path))

# Read each TSV file once and concatenate in a single call: growing a
# DataFrame inside the loop re-copies all accumulated rows on every iteration.
phenotype_frames = [
    pd.read_csv(os.path.join(phenotype_path, file), sep='\t', na_values=na_markers)
    for file in files
    if file.endswith('.tsv')
]

# Fail early with a clear message instead of a KeyError on 'participant_id' below
if not phenotype_frames:
    raise FileNotFoundError(f"No .tsv phenotype files found in {phenotype_path}")

all_phenotype_data = pd.concat(phenotype_frames, ignore_index=True)

# Collapse the stacked rows (one per participant per file) into a single row
# per participant; sum() ignores the NaN cells contributed by the other files.
aggregated_phenotype_data = all_phenotype_data.groupby('participant_id').sum().reset_index()

# Load participants data
participants_path = '/home/skander/ds004920/participants.tsv'
participants_df = pd.read_csv(participants_path, sep='\t', na_values=na_markers)

# Left merge: keep every participant, even those without any phenotype row
combined_df = pd.merge(participants_df, aggregated_phenotype_data, on='participant_id', how='left')

# Display the first few rows of the combined DataFrame
print(combined_df.head())

# Define the file path for the CSV file in your /home directory
#csv_file_path = '/home/skander/combined1.csv'

# Save the combined DataFrame as a CSV file
#combined_df.to_csv(csv_file_path, index=False)


  participant_id  screen_bmi handedness             screen_race  \
0       sub-1001   22.128772       Left       Two or more races   
1       sub-1002   21.284602      Right                   White   
2       sub-1003   31.882086      Right  Black/African American   
3       sub-1004   22.312012      Right       Two or more races   
4       sub-1006   19.156142      Right                   White   

  appt1_drugtest  appt1_breathalyzer        age    sex  deception  \
0       negative                 0.0  19.294145  Woman         10   
1       negative                 0.0  21.139380    Man          6   
2       negative                 0.0  18.992861  Woman          5   
3       negative                 0.0  19.006550  Woman          3   
4       negative                 0.0  20.654885    Man          1   

   screen_meds_pastmonth  ...  audit_2  audit_3  audit_4  audit_5  audit_6  \
0                    1.0  ...      1.0      0.0      1.0      1.0      0.0   
1                    0.0  

We simplify the table by computing one overall score per group of related columns: 437 separate columns are too many to interpret directly.

In [3]:
# Cleaning pipeline: literal 'N/A' strings and NaN both become 0, then only the
# first row of each participant_id is kept.
combined_df = (
    combined_df
    .replace('N/A', 0)
    .fillna(0)
    .drop_duplicates(subset='participant_id', keep='first')
)

# Drop the fractional part of BMI and age (floor, no rounding).
# NOTE(review): flooring BMI moves values in [18.5, 19) down to 18, i.e. below
# the 18.5 cut-off that categorize_bmi applies later - confirm this is intended.
for numeric_column in ('screen_bmi', 'age'):
    combined_df[numeric_column] = combined_df[numeric_column].map(np.floor)

# Function to sum columns based on containing specific substrings
def sum_columns(df, substrings):
    """Collapse groups of related columns into one summed score column each.

    The frame is modified in place: for every ``substring -> new_col_name``
    entry, the columns assigned to that substring are summed row-wise into
    ``new_col_name`` and then dropped.

    A column is assigned to exactly one substring: the longest one it
    contains (ties go to the entry listed first). Plain "contains" matching
    in dictionary order would let a short key such as ``'aq'`` swallow the
    columns of a longer key such as ``'bpaq'``, leaving the latter with an
    all-zero score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the item-level columns; mutated in place.
    substrings : dict
        Maps a substring of the source column names to the name of the
        score column to create.

    Returns
    -------
    None
    """
    # Snapshot the columns first so score columns created during the loop can
    # never be matched (and summed/dropped) by a later substring.
    original_columns = list(df.columns)

    # Resolve ownership up front: each column goes to its most specific match.
    owner = {}
    for column in original_columns:
        matches = [substring for substring in substrings if substring in str(column)]
        if matches:
            owner[column] = max(matches, key=len)

    for substring, new_col_name in substrings.items():
        selected_columns = [c for c in original_columns if owner.get(c) == substring]

        # Sum these columns and create a new column (all zeros if none matched)
        df[new_col_name] = df[selected_columns].sum(axis=1)

        # Drop the original item-level columns
        df.drop(columns=selected_columns, inplace=True)

# Dictionary mapping substrings to new column names
substrings_to_merge = {
    'aadis': 'juvenile_drug_alcohol_use',
    'audit': 'Alcohol_use',
    'asrm': 'Mania_Scale',
    'aq': 'Autism_Quotient',
    'bdi': 'Back_depression_inventory',
    'bisbas': 'Inhibited',
    'bpaq': 'Aggressive',
    'ctqsf_adult_c': 'Childhood_trauma',
    'dudit': 'Drug_Use',
    'ios_computer': 'PC_use',
    'ios_p': 'Friend_behavior',
    'ios_fu_score': 'Stranger_behavior',
    'iri': 'Interpersonal_Reactivity_Index',
    'pnr': 'Social_reactivity',
    'pvss': 'Positivity',
    'quic_adult_cj': 'Unpredictability_in_Childhood',
    'rse': 'Self_esteem',
    'spsrq': 'Sensitivity_to_punishment_reward',
    'susd': '7_up_7_down',
    'seq_adult_cj': 'Social_Experience',
    'score_teps': 'Temporal_pleasure',
    'tei': 'Emotional_intelligence'
    # ... Add any additional mappings here
}

# Perform the column summing based on substrings
sum_columns(combined_df, substrings_to_merge)

# Display the DataFrame for verification
print(combined_df.head())

# Print all column names in the DataFrame
print(combined_df.columns.tolist())

  participant_id  screen_bmi handedness             screen_race  \
0       sub-1001        22.0       Left       Two or more races   
1       sub-1002        21.0      Right                   White   
2       sub-1003        31.0      Right  Black/African American   
3       sub-1004        22.0      Right       Two or more races   
4       sub-1006        19.0      Right                   White   

  appt1_drugtest  appt1_breathalyzer   age    sex  deception  \
0       negative                 0.0  19.0  Woman         10   
1       negative                 0.0  21.0    Man          6   
2       negative                 0.0  18.0  Woman          5   
3       negative                 0.0  19.0  Woman          3   
4       negative                 0.0  20.0    Man          1   

   screen_meds_pastmonth  ...  Interpersonal_Reactivity_Index  \
0                    1.0  ...                            66.0   
1                    0.0  ...                            56.0   
2                

We have one column per medication used; we transform them into 2 features: number of medications and medication type (TBD: one-hot encode).

In [4]:
# Per-medication screening columns that feed the two derived features
drug_columns = [
    'screen_meds_stabilizer',
    'screen_meds_ssri',
    'screen_meds_epilepsy',
    'screen_meds_psychosis',
    'screen_meds_antianxiety',
    'screen_meds_pain',
]

# Row-wise sum of the medication columns
# (presumably 0/1 indicators, making this a count - TODO confirm)
combined_df['total_drug_use'] = combined_df.loc[:, drug_columns].sum(axis=1)

# Function to create a list of medication names, removing 'screen_meds_'
def list_medications(row, column_names):
    """Return a comma-separated list of the medications flagged in ``row``.

    A medication counts as flagged when its cell is present (not NaN/None)
    and different from 0. Missing cells are skipped: ``NaN != 0`` is True, so
    without the explicit check a missing answer would be reported as a
    medication taken, while a row-wise ``sum()`` over the same columns
    ignores it.

    Parameters
    ----------
    row : mapping or pandas.Series
        One participant's values, indexable by column name.
    column_names : iterable of str
        Medication columns to inspect; the ``'screen_meds_'`` part is removed
        from each name in the output.

    Returns
    -------
    str
        Names joined by ``', '``, or the string ``'None'`` if nothing is flagged.
    """
    medications = []
    for col in column_names:
        value = row[col]
        if pd.notna(value) and value != 0:
            medications.append(col.replace('screen_meds_', ''))
    return ', '.join(medications) if medications else 'None'

# Build one readable medication label per participant (row-wise apply;
# drug_columns is forwarded as the second argument of list_medications)
combined_df['medication_names'] = combined_df.apply(
    list_medications, axis=1, args=(drug_columns,)
)

# The per-medication columns are now summarised by the two derived features,
# so remove exactly the columns listed in drug_columns
columns_to_remove = list(drug_columns)
combined_df = combined_df.drop(columns=columns_to_remove)

# Display the DataFrame for verification
#print(combined_df.head())

# Display the DataFrame for verification with a selection of columns
#print(combined_df.columns.tolist())

## 2 - Preliminary feature engineering

### A - Some raw numeric values would wrongly be interpreted as "better" or "worse" (an age of 40 would rank higher than an age of 20, which makes no sense), so let's transform these features into ranges used to categorize the relevant data

Transforming individual BMIs into BMI ranges

In [5]:
import pandas as pd
# Define a function to categorize BMI based on the provided ranges
def categorize_bmi(bmi):
    """Map a BMI value to its weight-category label.

    Bands are half-open so that every value belongs to exactly one of them:
    ``< 18.5`` Underweight, ``[18.5, 25)`` Healthy Weight, ``[25, 30)``
    Overweight, ``[30, 35)`` Obese, ``[35, 40)`` Severely Obese and
    ``>= 40`` Morbidly Obese. With closed upper bounds such as ``<= 24.9``,
    a value like 24.95 matches no band and falls through to the last label.

    Parameters
    ----------
    bmi : float
        Body-mass index; may be NaN/None.

    Returns
    -------
    str
        The category label, or ``'Unknown'`` for a missing value (which would
        otherwise fail every comparison and be labelled 'Morbidly Obese').
    """
    if pd.isna(bmi):
        return 'Unknown'
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Healthy Weight'
    if bmi < 30:
        return 'Overweight'
    if bmi < 35:
        return 'Obese'
    if bmi < 40:
        return 'Severely Obese'
    # Any BMI of 40 or above
    return 'Morbidly Obese'

# Label every participant's BMI with its category
combined_df['bmi_category'] = combined_df['screen_bmi'].map(categorize_bmi)

# The raw 'screen_bmi' values are not used after this point
combined_df = combined_df.drop(columns=['screen_bmi'])


Transforming age into age ranges

In [6]:
# Define the function to categorize age
def categorize_age(age):
    """Map an age in years to its age-group label.

    Each band is identified by its inclusive upper bound; the first band
    whose bound is not exceeded wins, and anything above 64 is 'Elderly'.

    Parameters
    ----------
    age : float
        Age in years; may be NaN/None.

    Returns
    -------
    str
        The age-group label, or ``'Unknown'`` for a missing value (which would
        otherwise fail every comparison and be labelled 'Elderly').
    """
    if pd.isna(age):
        return 'Unknown'

    # (inclusive upper bound, label), in ascending order
    age_bands = (
        (12, 'Child'),
        (17, 'Teen'),
        (24, 'Young Adult'),
        (34, 'Adult'),
        (44, 'Mature Adult'),
        (54, 'Middle Aged'),
        (64, 'Senior'),
    )
    for upper_bound, label in age_bands:
        if age <= upper_bound:
            return label
    return 'Elderly'

# Label every participant's age with its age group
combined_df['age_category'] = combined_df['age'].map(categorize_age)

# The raw 'age' values are not used after this point
combined_df = combined_df.drop(columns=['age'])


saving a csv file for tests

In [7]:
# Define the file path for the CSV file in your /home directory
#csv_file_path = '/home/skander/combined2.csv'

# Save the DataFrame as a CSV file
#combined_df.to_csv(csv_file_path, index=False)

check = ok, results intended, proceeding to the next step

### B - One-hot encoding categorical variables

#### One-hot encoding is chosen rather than ordinal encoding because there is no hierarchy between the categories (e.g. healthy and overweight may be interpreted as ordered, but we cannot say an obese person is hierarchically higher than a healthy or underweight person; the categories are hierarchy-independent)

In [8]:
# Columns holding category labels that must become indicator columns
categorical_columns = ['bmi_category', 'age_category', 'medication_names', 'sex', 'handedness', 'screen_race']

# One indicator frame per categorical column, each prefixed with its source name
indicator_frames = [
    pd.get_dummies(combined_df[name], prefix=name)
    for name in categorical_columns
]

# Swap the label columns for their indicators in a single concatenation;
# the indicators are appended after the remaining columns, in list order
combined_df = pd.concat(
    [combined_df.drop(columns=categorical_columns), *indicator_frames],
    axis=1,
)

### C - Gaussian distribution: normalization vs standardization

#### Determining whether we have a Gaussian distribution, which is critical to make our model tailored for deep learning