In [1]:
from paths import *
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import multiprocessing as mp
import warnings


In [2]:
NCORES = 8

In [3]:

warnings.filterwarnings("ignore", message="`distplot` is a deprecated function")
warnings.filterwarnings("ignore", message="DeprecationWarning")

 


In [4]:
def glm_model(data_, analyte_):
    data_analyte = data_[data_["Analyte"] == analyte_]
    le = LabelEncoder()
    data_analyte.loc[:, 'Disease_label'] = le.fit_transform(data_analyte['Group Name'])
    data_analyte.loc[:, 'Gender_label'] = le.fit_transform(data_analyte['Gender'])
    data_analyte.loc[:, 'site'] = data_analyte.loc[:, 'Client Sample ID'].str[:3]
    data_analyte = data_analyte[["Result", "Disease_label", "Age", "BMI", "Gender_label", "site", "house"]]
    X = sm.add_constant(data_analyte[['Age', 'BMI', 'Gender_label', 'Disease_label']])
    site_dummies = pd.get_dummies(data_analyte['site'], prefix='site', drop_first=True)
    house_dummies = pd.get_dummies(data_analyte['house'], prefix='house', drop_first=True)
    X = pd.concat([X, site_dummies, house_dummies], axis=1)
    model = sm.GLM(data_analyte['Result'], X, family=sm.families.Gamma(link=sm.families.links.log()))
#     model = sm.GLM(data_analyte['Result'], X, family=sm.families.Gaussian(link=sm.families.links.identity()))
    mdf = model.fit()
    out_dict = {}
    out_dict["analyte"] = analyte_
    out_dict["sample"] = "serum_feces_ratio"
    out_dict["patient_sample_count"] = data_analyte.shape[0]
    out_dict["model"] = mdf
    return out_dict 

def get_model_parallel(data_, analytes_):
    data_list = [data_]*len(analytes_)
    analytes_list = list(analytes_)
    arg_list = list(zip(data_list, analytes_list))
    p = mp.Pool(NCORES)
    model_list = p.starmap(glm_model, arg_list)
    p.close()
    p.join()
    return model_list

In [7]:


filename = SHORT_CHAIN_FATTY_ACID_DATA_FILENAME
mapping_filename = "short_chain_fatty_acid_spoke_map.csv"
file_path = os.path.join(DATA_ROOT_PATH, filename)
mapping_filepath = os.path.join(OUTPUT_PATH, mapping_filename)
data = pd.read_excel(file_path, engine='openpyxl')
data_feces = data[data["Client Matrix"]=="feces"]
data_serum = data[data["Client Matrix"]=="serum"]
mapping_data = pd.read_csv(mapping_filepath)
analytes = mapping_data["name"].unique()


In [111]:
data_feces.loc[:, 'house'] = (data_feces['Client Sample ID'].str[:3] + data_feces['Client Sample ID'].str[-4:])
data_feces_exclude_outlier_threshold_column = data_feces.drop("Analysis Comment", axis=1)
house_to_exclude = data_feces_exclude_outlier_threshold_column[data_feces_exclude_outlier_threshold_column.isna().any(axis=1)].house.values
data_feces_nan_removed = data_feces[~data_feces["house"].isin(house_to_exclude)]
data_feces_outlier_removed = data_feces_nan_removed[data_feces_nan_removed["Analysis Comment"].isna()]
data_feces_outlier_removed.drop("Analysis Comment", axis=1, inplace=True)
group_counts = data_feces_outlier_removed.groupby(['Analyte', 'house']).size()
valid_index = group_counts[group_counts == 2].reset_index().set_index(['Analyte', 'house']).index
data_feces_outlier_removed = data_feces_outlier_removed.set_index(['Analyte', 'house']).loc[valid_index].reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [112]:
data_serum.loc[:, 'house'] = (data_serum['Client Sample ID'].str[:3] + data_serum['Client Sample ID'].str[-4:])
data_serum_exclude_outlier_threshold_column = data_serum.drop("Analysis Comment", axis=1)
house_to_exclude = data_serum_exclude_outlier_threshold_column[data_serum_exclude_outlier_threshold_column.isna().any(axis=1)].house.values
data_serum_nan_removed = data_serum[~data_serum["house"].isin(house_to_exclude)]
data_serum_outlier_removed = data_serum_nan_removed[data_serum_nan_removed["Analysis Comment"].isna()]
data_serum_outlier_removed.drop("Analysis Comment", axis=1, inplace=True)
group_counts = data_serum_outlier_removed.groupby(['Analyte', 'house']).size()
valid_index = group_counts[group_counts == 2].reset_index().set_index(['Analyte', 'house']).index
data_serum_outlier_removed = data_serum_outlier_removed.set_index(['Analyte', 'house']).loc[valid_index].reset_index()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
columns_list = list(data_serum_outlier_removed.columns)
columns_to_remove = ["Unique Tube Label ID", "Dilution Factor", "Unit", "Race / Ethnicity", "Sample Description", "Client Matrix"]

columns_to_merge = ["Client Sample ID", "Analyte", "Group Name"]
columns_list_x = list(map(lambda x:x+"_x", columns_list))
for index, item in enumerate(columns_list):
    if item in columns_to_remove:
        columns_list_x.remove(item+"_x")
for item in columns_to_merge:
    columns_list_x.remove(item+"_x")
    columns_list_x.append(item)
    
data_serum_outlier_removed.drop(columns_to_remove, axis=1, inplace=True)
data_feces_outlier_removed.drop(columns_to_remove, axis=1, inplace=True)

data_serum_feces_outlier_removed = pd.merge(data_serum_outlier_removed, data_feces_outlier_removed, on=columns_to_merge)
data_serum_feces_outlier_removed.loc[:, "Result"] = np.divide(data_serum_feces_outlier_removed.Result_x, data_serum_feces_outlier_removed.Result_y)
data_serum_feces_outlier_removed = data_serum_feces_outlier_removed[["Result"] + columns_list_x]
data_serum_feces_outlier_removed.drop("Result_x", axis=1, inplace=True)

for item in columns_list:
    if item+"_x" in list(data_serum_feces_outlier_removed.columns):
        data_serum_feces_outlier_removed.rename(columns={item+"_x":item}, inplace=True)
        



In [124]:
%%time

model_without_outlier_list = get_model_parallel(data_serum_feces_outlier_removed, analytes)
joblib.dump(model_without_outlier_list, os.path.join(OUTPUT_PATH, "GLM_shortChain_fattyAcid_without_outlier_sample_serum_feces_ratio.joblib"))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

CPU times: user 126 ms, sys: 133 ms, total: 259 ms
Wall time: 9.11 s


['/pool0/home/karthik/iMSMS_metabolomics/result/GLM_shortChain_fattyAcid_without_outlier_sample_serum_feces_ratio.joblib']