In [1]:
import os
import json
import pandas as pd
import requests
from pandas.core.frame import DataFrame

import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
#load the dataframe you want to clean
depr_runs = pd.read_csv('Data/raw_runs/depr_test')
depr_runs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Columns: 3172 entries, Runs to Depression
dtypes: float64(3170), int64(1), object(1)
memory usage: 8.6+ MB


In [110]:
class DataCleaningPipeline:

    def __init__(self):
        self.steps = []

    def add_step(self, name, function):
        self.steps.append({'name': name, 'function': function})

    def execute(self, df):
        results = []
        current_df = df.copy()

        for step in self.steps:
            try:
                current_df = step['function'](current_df)
                results.append(
                    {'step': step['name'],
                    'status': 'success',
                    'rows_affected': len(current_df)
                    }
                )
                
            except Exception as e:
                results.append(
                    {'step': step['name'],
                    'status': 'failed',
                    'error': str(e)
                    }
                )
                break
        return current_df, results

In [111]:
def filter_high_missing_count(df):
    threshold = 50
    filtered_df = df[df["-1"] <= threshold]
    filtered_df.drop(columns=["-1"], inplace=True)
    return filtered_df

In [112]:
def filter_zeros(df):
    #remove all columns that have zeros in more than 90% of samples
    threshold = 0.9
    zero_fraction = (df == 0).sum() / len(df)
    cols_with_many_zeros = zero_fraction[zero_fraction > threshold].index.tolist()
    df_cleaned = df.drop(columns=cols_with_many_zeros)
    return df_cleaned

In [113]:
def remove_low_abundance(df):
    #get the columns where 90% of the samples have an abundance of less than threshold
    abundance_threshold = 0.001
    percentage_threshold = 0.9

    low_abundance_fraction = (df < abundance_threshold).sum() / len(df)
    mostly_low_abundance_cols = low_abundance_fraction[low_abundance_fraction >= percentage_threshold].index.tolist()
    df_cleaned = df.drop(columns=mostly_low_abundance_cols)
    return df_cleaned

In [114]:
def log_transform(df):
    features_logged = np.log1p(df)
    return features_logged

In [115]:
def normalize_data(df):
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(df)
    return pd.DataFrame(features_scaled, columns=df.columns, index=df.index)

In [None]:


pipeline = DataCleaningPipeline()
pipeline.add_step('filter high missing count', filter_high_missing_count)
pipeline.add_step('filter zeros', filter_zeros)
pipeline.add_step('remove low abundance', remove_low_abundance)
#pipeline.add_step('log transform', log_transform)
#pipeline.add_step('normalize data', normalize_data)


In [117]:
#remove identifiers and labels if necessary, otherwise comment out
sample_ids = depr_runs.iloc[:, 0]        # ID
labels = depr_runs.iloc[:, -1]           # label
features = depr_runs.iloc[:, 1:-1]       # features

df, results = pipeline.execute(features) #input only features
print(results)

[{'step': 'filter high missing count', 'status': 'success', 'rows_affected': 217}, {'step': 'filter zeros', 'status': 'success', 'rows_affected': 217}, {'step': 'remove low abundance', 'status': 'success', 'rows_affected': 217}]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=["-1"], inplace=True)


In [None]:
#convert df to DataFrame if it is not already
if not isinstance(df, pd.DataFrame):
    df = pd.DataFrame(df, columns=features.columns, index=features.index)

#insert sample_ids as the first column
df.insert(0, 'SampleID', sample_ids)

#add labels as the last column
df['Schizophrenia'] = labels



In [119]:
df.describe()

Unnamed: 0,821,328813,823,817,818,239935,28116,351091,28118,1161942,...,301302,632,392412,234908,156980,54005,1530,587,83771,Schizophrenia
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,...,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,5.814278,0.931705,1.069564,0.673349,1.470364,2.979764,2.338707,1.158183,0.240945,0.116741,...,0.004515,0.008438,0.001146,0.042469,0.001692,0.027913,0.017605,0.334331,0.002241,1.0
std,7.379256,2.009961,1.448299,2.094324,1.913818,8.332079,3.033184,1.629267,0.506952,0.341835,...,0.021391,0.117484,0.006355,0.278858,0.010196,0.258727,0.065365,3.560276,0.016271,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.210937,0.046135,0.160971,0.004158,0.249017,0.007174,0.479362,0.181957,0.025723,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2.04104,0.249868,0.616461,0.01416,0.729705,0.138307,1.19654,0.582637,0.131486,0.005051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,9.67555,0.793108,1.4064,0.380079,1.84487,1.6349,2.99031,1.42224,0.267066,0.023599,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,30.7071,16.9385,11.4843,23.0109,11.9322,67.1951,19.0889,11.5982,5.72358,2.42874,...,0.218809,1.7309,0.087459,3.54515,0.1276,3.68542,0.473825,45.2668,0.199682,1.0


In [120]:
df.to_csv('Data/Cleaned_data/cleaned_schizo_runs', index = False)