# Analise dos dados exclusivos de proteomica por regressao linear

Excel: regressao_linear.xlsx /
Abas

In [1]:
import pandas as pd
import statsmodels.api as sm
from joblib import Parallel, delayed

In [2]:
import threading
import psutil
import time

## P5 control

In [3]:
# Carregar os dados do Excel para um DataFrame
dados_excel = pd.read_excel('anoxia_controle_p0_p10_para_regressao.xlsx', sheet_name='p10_controle', index_col=0) 
print (dados_excel)

                      LFQ intensity P10-C2  LFQ intensity P10-C4  \
Majority protein IDs                                               
A0A096MJZ0                          873160                823630   
A0A0G2JTR4                         2254100               2205200   
A0A0G2JUG7                         3678400               3249000   
A0A0G2JV04                               0               1065700   
A0A0G2JXN2                         1082600               1043000   
...                                    ...                   ...   
Q9Z2Q1                             2457700               2263100   
Q9Z2S9                             2779200               2441000   
Q9Z2Z8                             3714000               3601700   
Q9Z336                             2650600               3192200   
Q9Z339                             4587600               5174700   

                      LFQ intensity P10-C5  LFQ intensity P10-C6  \
Majority protein IDs                           

In [4]:
dados_excel.dtypes

LFQ intensity P10-C2    int64
LFQ intensity P10-C4    int64
LFQ intensity P10-C5    int64
LFQ intensity P10-C6    int64
LFQ intensity P10-C7    int64
dtype: object

In [5]:
# Função para regressão linear
def regressao_linear(gene1, gene2, dados):
    x = dados.loc[gene1]
    y = dados.loc[gene2]

    # Adicionando uma constante para o termo linear
    X = sm.add_constant(x)

    # Ajustando o modelo de regressão linear
    modelo = sm.OLS(y, X).fit()

    # Coletando os resultados
    coef_angular = modelo.params[1]  # Coeficiente Angular
    coef_linear = modelo.params[0]   # Coeficiente Linear
    r2 = modelo.rsquared             # R^2
    pearson_corr = x.corr(y)         # Coeficiente de Correlação de Pearson
    f_statistic = modelo.fvalue      # F-statistic
    p_value = modelo.f_pvalue        # p-value do F-test
    coef_hypothesis_test = modelo.pvalues[1]  # Teste de Hipóteses dos Coeficientes
    std_error = modelo.bse[1]        # Erro Padrão da Estimativa

    return {
        'gene_x': gene1,
        'gene_y': gene2,
        'coef_angular': coef_angular,
        'coef_linear': coef_linear,
        'r2': r2,
        'pearson_corr': pearson_corr,
        'f_statistic': f_statistic,
        'p_value': p_value,
        'coef_hypothesis_test': coef_hypothesis_test,
        'std_error': std_error
    }

# Função para monitorar o uso da CPU
def monitor_cpu_usage(interval=2, duration=120):
    print("Monitoring CPU usage...")
    start_time = time.time()
    while time.time() - start_time < duration:
        cpu_percent = psutil.cpu_percent(interval=interval)
        print(f"CPU usage: {cpu_percent}%")
        time.sleep(interval)

# Configurando a paralelização
num_cores = 3  # Defina o número de núcleos a serem usados
resultados_regressao = []

# Iniciando o monitoramento da CPU em uma thread separada
monitor_thread = threading.Thread(target=monitor_cpu_usage, args=(1, 2000))
monitor_thread.start()

# Executando a regressão linear em paralelo
resultados_regressao = Parallel(n_jobs=num_cores)(
    delayed(regressao_linear)(gene1, gene2, dados_excel)
    for i, gene1 in enumerate(dados_excel.index)
    for gene2 in dados_excel.index[i+1:]
)

# Esperando o monitoramento da CPU terminar
monitor_thread.join()

# Criar um DataFrame com os resultados
df_resultados = pd.DataFrame(resultados_regressao)

# Definir o número máximo de linhas por aba
max_linhas_por_aba = 900000

# Dividir o DataFrame em partes menores
partes = [df_resultados[i:i+max_linhas_por_aba] for i in range(0, df_resultados.shape[0], max_linhas_por_aba)]

# Criar um escritor Excel
writer = pd.ExcelWriter('results_p10_controle.xlsx', engine='xlsxwriter')

# Salvar cada parte em uma aba (sheet) diferente
for i, parte in enumerate(partes):
    parte.to_excel(writer, sheet_name=f'Parte_{i+1}', index=False)

# Fechar o escritor Excel
writer.save()

Monitoring CPU usage...
CPU usage: 96.3%
CPU usage: 91.3%
CPU usage: 80.1%
CPU usage: 81.8%
CPU usage: 84.3%
CPU usage: 81.1%
CPU usage: 77.5%
CPU usage: 76.8%
CPU usage: 81.5%
CPU usage: 80.8%
CPU usage: 81.5%
CPU usage: 86.0%
CPU usage: 78.7%
CPU usage: 88.2%
CPU usage: 78.6%
CPU usage: 90.2%
CPU usage: 89.0%
CPU usage: 76.4%
CPU usage: 77.1%
CPU usage: 77.5%
CPU usage: 81.8%
CPU usage: 77.3%
CPU usage: 77.7%
CPU usage: 76.7%
CPU usage: 77.6%
CPU usage: 77.6%
CPU usage: 79.0%
CPU usage: 76.8%
CPU usage: 76.9%
CPU usage: 77.2%
CPU usage: 77.0%
CPU usage: 82.0%
CPU usage: 78.4%
CPU usage: 78.4%
CPU usage: 77.0%
CPU usage: 76.8%
CPU usage: 76.9%
CPU usage: 77.6%
CPU usage: 76.7%
CPU usage: 76.4%
CPU usage: 77.5%
CPU usage: 78.3%
CPU usage: 77.1%
CPU usage: 77.1%
CPU usage: 76.9%
CPU usage: 77.2%
CPU usage: 77.1%
CPU usage: 76.7%
CPU usage: 88.3%
CPU usage: 85.3%
CPU usage: 79.3%
CPU usage: 77.3%
CPU usage: 77.2%
CPU usage: 76.8%
CPU usage: 81.1%
CPU usage: 77.4%
CPU usage: 78.4%
CPU usa

CPU usage: 77.1%
CPU usage: 77.2%
CPU usage: 77.6%
CPU usage: 77.1%
CPU usage: 79.7%
CPU usage: 77.3%
CPU usage: 77.4%
CPU usage: 78.6%
CPU usage: 84.3%
CPU usage: 83.8%
CPU usage: 80.3%
CPU usage: 81.5%
CPU usage: 78.9%
CPU usage: 79.8%
CPU usage: 86.1%
CPU usage: 82.0%
CPU usage: 82.5%
CPU usage: 80.0%
CPU usage: 79.7%
CPU usage: 83.9%
CPU usage: 83.9%
CPU usage: 82.7%
CPU usage: 78.9%
CPU usage: 77.4%
CPU usage: 81.9%
CPU usage: 76.6%
CPU usage: 77.2%
CPU usage: 77.2%
CPU usage: 75.8%
CPU usage: 81.0%
CPU usage: 77.2%
CPU usage: 76.4%
CPU usage: 76.9%
CPU usage: 76.8%
CPU usage: 81.5%
CPU usage: 77.1%
CPU usage: 76.6%
CPU usage: 77.1%
CPU usage: 78.8%
CPU usage: 85.6%
CPU usage: 82.8%
CPU usage: 82.7%
CPU usage: 76.3%
CPU usage: 77.1%
CPU usage: 77.4%
CPU usage: 79.9%
CPU usage: 77.2%
CPU usage: 77.1%
CPU usage: 77.1%
CPU usage: 77.2%
CPU usage: 76.6%
CPU usage: 77.2%
CPU usage: 77.1%
CPU usage: 77.7%
CPU usage: 76.1%
CPU usage: 76.7%
CPU usage: 77.4%
CPU usage: 77.3%
CPU usage: 77.

CPU usage: 76.3%
CPU usage: 77.1%
CPU usage: 78.6%
CPU usage: 76.5%
CPU usage: 76.5%
CPU usage: 77.4%
CPU usage: 80.2%
CPU usage: 76.7%
CPU usage: 76.7%
CPU usage: 76.8%
CPU usage: 77.0%
CPU usage: 77.3%
CPU usage: 76.9%
CPU usage: 77.9%
CPU usage: 76.7%
CPU usage: 79.5%
CPU usage: 76.5%
CPU usage: 77.6%
CPU usage: 78.7%
CPU usage: 82.6%
CPU usage: 79.4%
CPU usage: 77.6%
CPU usage: 76.9%
CPU usage: 78.4%
CPU usage: 83.8%
CPU usage: 76.3%
CPU usage: 77.6%
CPU usage: 76.5%
CPU usage: 86.1%
CPU usage: 81.8%
CPU usage: 77.2%
CPU usage: 76.8%
CPU usage: 77.0%
CPU usage: 76.4%


In [None]:
# Loop para calcular a regressão linear e obter mais parâmetros
resultados_regressao = []
for i, gene1 in enumerate(dados_excel.index):
    for j, gene2 in enumerate(dados_excel.index[i+1:], start=i+1):
        x = dados_excel.loc[gene1]
        y = dados_excel.loc[gene2]

        # Adicionando uma constante para o termo linear
        X = sm.add_constant(x)

        # Ajustando o modelo de regressão linear
        modelo = sm.OLS(y, X).fit()

        # Coletando os resultados
        coef_angular = modelo.params[1]  # Coeficiente Angular
        coef_linear = modelo.params[0]   # Coeficiente Linear
        r2 = modelo.rsquared             # R^2
        pearson_corr = x.corr(y)         # Coeficiente de Correlação de Pearson
        f_statistic = modelo.fvalue      # F-statistic
        p_value = modelo.f_pvalue        # p-value do F-test
        coef_hypothesis_test = modelo.pvalues[1]  # Teste de Hipóteses dos Coeficientes
        std_error = modelo.bse[1]        # Erro Padrão da Estimativa

        resultados_regressao.append({
            'gene_x': gene1,
            'gene_y': gene2,
            'coef_angular': coef_angular,
            'coef_linear': coef_linear,
            'r2': r2,
            'pearson_corr': pearson_corr,
            'f_statistic': f_statistic,
            'p_value': p_value,
            'coef_hypothesis_test': coef_hypothesis_test,
            'std_error': std_error
        })

In [None]:
# Criar um DataFrame com os resultados
df_resultados = pd.DataFrame(resultados_regressao)

# Definir o número máximo de linhas por aba
max_linhas_por_aba = 900000

# Dividir o DataFrame em partes menores
partes = [df_resultados[i:i+max_linhas_por_aba] for i in range(0, df_resultados.shape[0], max_linhas_por_aba)]

# Criar um escritor Excel
writer = pd.ExcelWriter('results_p5_controle.xlsx', engine='xlsxwriter')

# Salvar cada parte em uma aba (sheet) diferente
for i, parte in enumerate(partes):
    parte.to_excel(writer, sheet_name=f'Parte_{i+1}', index=False)

# Fechar o escritor Excel
writer.save()