In [2]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from collections import defaultdict
from filenames import COMPLEX_BETWEENNESS, COMPLEX_CLOSENESS, COMPLEX_DEGREE, \
    GENE_ID_CONVERSION
from go_script import generate_matrix
from itertools import chain
from itertools import combinations
import json

In [3]:
# Names of the centrality measures; calculate_regression() iterates this list.
measures = ["betweenness", "closeness", "degree"]

# Maps each complex-level centrality JSON path (constants imported from
# `filenames`) to the key under which the records of that file store the
# centrality value. parse_json() reads record values through this table.
name_map = {
    COMPLEX_BETWEENNESS: "betweenness_centrality",
    COMPLEX_CLOSENESS: "closeness_centrality",
    COMPLEX_DEGREE: "degree_centrality"
}

In [4]:
def powerset(iterable):
    """Return an iterator over every non-empty combination of ``iterable``.

    Combinations are produced as tuples, grouped by size from 1 up to the
    full length of the input, in the order ``itertools.combinations`` yields
    them. Unlike a mathematical power set, the empty combination is left out.
    """
    items = list(iterable)
    sizes = range(1, len(items) + 1)
    return chain.from_iterable(map(lambda size: combinations(items, size), sizes))



In [5]:
def get_actual_map(path="convertor.txt"):
    """Build a gene-symbol -> gene-id lookup from a tab-separated table.

    The first line of the file is treated as a header and skipped. Every
    remaining line must consist of exactly three tab-separated fields
    (gene id, gene symbol, and a third field that is ignored); lines with any
    other number of fields, including blank lines, are skipped.

    Args:
        path: Location of the conversion table. Defaults to
            ``"convertor.txt"`` in the current working directory.

    Returns:
        dict mapping gene symbol (str) to gene id (str). When a symbol occurs
        on several lines, the last occurrence wins.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Read everything inside a context manager so the handle is always closed.
    with open(path) as handle:
        rows = handle.read().split('\n')[1:]

    symbol_to_id = {}
    for row in rows:
        fields = row.split('\t')
        # Explicit shape check instead of a bare ``except`` around unpacking:
        # only malformed rows are skipped, real errors are no longer hidden.
        if len(fields) != 3:
            continue
        gene_id, gene_symbol, _ = fields
        symbol_to_id[gene_symbol] = gene_id
    return symbol_to_id

In [6]:
def parse_json(measure):
    """Load one centrality measure and merge complex-level values into it.

    Two JSON files are read:

    * ``measure`` itself, a list of records that each carry a ``"nodes"``
      list and a value stored under ``name_map[measure]``;
    * a sibling file whose path is ``measure`` with ``".json"`` replaced by
      ``"_PPI.json"``, loaded as the base mapping (presumably node -> value;
      confirm against the data files).

    For every node mentioned in the first file, the maximum value over all
    records containing that node overwrites (or adds) the node's entry in the
    base mapping.

    Args:
        measure: Path of the complex-level JSON file; must be a key of the
            module-level ``name_map``.

    Returns:
        The mapping loaded from the ``_PPI.json`` file after the overrides.
    """
    # Context managers close both files; the original leaked the handles.
    with open(measure) as complex_file:
        records = json.load(complex_file)

    values_by_node = defaultdict(list)
    for record in records:
        for node in record["nodes"]:
            values_by_node[node].append(record[name_map[measure]])

    # A node may belong to several records; keep its largest value.
    max_by_node = {node: max(values) for node, values in values_by_node.items()}

    with open(measure.replace(".json", "_PPI.json")) as ppi_file:
        node_measure = json.load(ppi_file)

    # Complex-level values take precedence over the PPI-level ones.
    for node, value in max_by_node.items():
        node_measure[node] = value
    return node_measure

In [7]:
def generate_measure_matrix():
    """Assemble the GO matrix together with per-gene centrality rows.

    The GO matrix, gene list and GO ids come from ``generate_matrix()``. For
    each centrality file (betweenness, closeness, degree, in that order) one
    row is built holding a value for every gene in ``gene_list``: the gene is
    converted with the table from ``get_actual_map()`` and then looked up in
    the mapping returned by ``parse_json()``. Genes missing from either
    lookup get the value 0.

    Returns:
        Tuple ``(matrix, measure_matrix, go_ids, gene_list)`` where
        ``measure_matrix`` is a list of three lists ordered betweenness,
        closeness, degree, each aligned with ``gene_list``.
    """
    measure_files = {
        "betweenness": COMPLEX_BETWEENNESS,
        "closeness": COMPLEX_CLOSENESS,
        "degree": COMPLEX_DEGREE
    }
    gene_id_converter = get_actual_map()
    matrix, gene_list, go_ids = generate_matrix()

    measure_matrix = []
    # Dict insertion order fixes the row order callers rely on.
    for measure_file in measure_files.values():
        gene_measure = parse_json(measure_file)
        gene_measure_list = []
        for gene in gene_list:
            try:
                value = gene_measure[gene_id_converter[str(gene)]]
            except KeyError:
                # Best-effort default: the gene has no converted id or no
                # centrality entry. Only lookup misses are swallowed now, so
                # unrelated errors are no longer hidden by a bare ``except``.
                value = 0
            gene_measure_list.append(value)
        measure_matrix.append(gene_measure_list)
    return matrix, measure_matrix, go_ids, gene_list

In [8]:
def calculate_regression():
    """Print the coefficient of determination of each centrality measure.

    For every name in the module-level ``measures`` list, the matching row of
    centrality values from ``generate_measure_matrix()`` is regressed against
    the first row of the GO matrix with ``scipy.stats.linregress`` and the
    squared correlation coefficient is printed.

    The rows returned by ``generate_measure_matrix()`` are ordered
    betweenness, closeness, degree, which matches ``measures``.
    """
    # Imported locally because the notebook's import cell does not load scipy.
    import scipy.stats

    # Load the data once; the original re-assigned ``measures`` inside the
    # loop (making it an unbound local) and passed an argument that
    # generate_measure_matrix() does not accept.
    matrix, measure_matrix, _, _ = generate_measure_matrix()
    go_measures = matrix[0]
    for measure, measure_values in zip(measures, measure_matrix):
        slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
            np.asarray(measure_values), go_measures)
        print("Coefficient of determination for {}: {}".format(measure, r_value ** 2))

In [9]:
def calculate_multifit():
    """Fit a linear regression for every non-empty combination of measures.

    For each combination produced by ``powerset`` over betweenness, closeness
    and degree, the selected centrality rows form the feature matrix
    (one column per measure) and the first row of the GO matrix is the target.
    The value of ``LinearRegression.score`` on the training data is printed
    for each combination.
    """
    measures = ["betweenness", "closeness", "degree"]
    # generate_measure_matrix() returns four values; the original unpacked
    # only three and raised ValueError before doing any work.
    matrix, measure_list, _, _ = generate_measure_matrix()
    measure_dict = dict(zip(measures, measure_list))
    Y = matrix[0]

    for measure_comb in powerset(measures):
        print("Calculating coeff for {}:".format(" and ".join(measure_comb)))
        # Rows are measures, so transpose to get samples x features.
        X = np.asarray([measure_dict[measure] for measure in measure_comb]).T
        clf = linear_model.LinearRegression()
        clf.fit(X, Y)
        print("Coeff is: {}".format(clf.score(X, Y)))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

def get_model(train_data):
    """Train one logistic-regression model per combination of centralities.

    Args:
        train_data: Sequence whose first element holds the labels and whose
            remaining elements hold one list of centrality values each.
            Indices 0, 1 and 2 of those remaining elements are used, so at
            least three are required.

    Returns:
        List of ``(model, input_nums)`` tuples, one per non-empty combination
        of the indices 0-2, where ``input_nums`` is the tuple of centrality
        indices the fitted ``LogisticRegression`` was trained on.
    """
    Y, *centralities = train_data
    models = []
    for input_nums in powerset([0, 1, 2]):
        # Selected centralities are rows; transpose to samples x features.
        X = np.asarray([centralities[i] for i in input_nums]).T
        clf = LogisticRegression()
        # The original shuffled X and Y into variables that were never used
        # and then fitted on the unshuffled arrays; that dead call is dropped
        # and the fit keeps using the data exactly as given.
        clf.fit(X, Y)
        models.append((clf, input_nums))
    return models

In [None]:
def zip_measure_with_go(measure_matrix, go_matrix):
    """Pair every row of ``go_matrix`` element-wise with ``measure_matrix``.

    Returns a list with one entry per row of ``go_matrix``; each entry is the
    list of ``(row_element, measure_element)`` tuples produced by ``zip``, so
    pairing stops at the shorter of the two sequences.
    """
    return [list(zip(go_row, measure_matrix)) for go_row in go_matrix]

def remove_all_test_genes(index_list, go_id_to_measure_map):
    """Extract the entries at the given positions and delete them in place.

    Args:
        index_list: Positions (relative to the list as passed in) of the
            entries to move into the test set. Negative positions count from
            the end; duplicates are returned as often as they are listed but
            deleted only once.
        go_id_to_measure_map: List to take the entries from; it is mutated.

    Returns:
        List of the selected entries, in the order given by ``index_list``.

    Raises:
        IndexError: if a position is out of range.
    """
    indices = list(index_list)
    # Collect first: deleting while collecting would shift later positions.
    test_data = [go_id_to_measure_map[idx] for idx in indices]

    size = len(go_id_to_measure_map)
    positions = {idx + size if idx < 0 else idx for idx in indices}
    # Delete by position, highest first, so earlier deletions cannot move the
    # entries still to be removed. The original called ``.remove(idx)``,
    # which removes by value and therefore dropped the wrong element
    # (or raised ValueError).
    for position in sorted(positions, reverse=True):
        del go_id_to_measure_map[position]
    return test_data


In [42]:

def build_model_input(gene_data):
    """Split row-wise records into four column lists.

    Each item of ``gene_data`` is indexed at positions 0-3; the result is a
    list of four lists where list ``c`` holds element ``c`` of every item, in
    input order. Elements beyond index 3 are ignored.
    """
    return [[row[column] for row in gene_data] for column in range(4)]

def build_prediction_data(idx, test_data):
    """Build model input columns for one entry of ``test_data``.

    Args:
        idx: Position (or key) of the entry to use.
        test_data: Indexable collection whose entries are sequences of rows
            accepted by ``build_model_input``.

    Returns:
        The four column lists produced by ``build_model_input`` for
        ``test_data[idx]``.
    """
    # The original used ``test_data.index(idx)``, which returns an integer
    # position rather than the entry itself, so build_model_input() then
    # failed trying to iterate over an int.
    go_id_data = test_data[idx]
    prediction_data = build_model_input(go_id_data)
    return prediction_data


def predict_lin_regression(test_size=0.2, random_state=None):
    """Train and evaluate per-GO-term classifiers on centrality measures.

    For every row of the GO matrix the samples (label plus the three
    centrality values per gene) are split into train and test parts. GO terms
    whose training labels contain fewer than two distinct classes are
    skipped, since a classifier cannot be fitted on them. For the others,
    ``get_model`` trains one model per combination of centralities and each
    model is evaluated on the held-out part.

    Args:
        test_size: Fraction of samples held out for testing (default 0.2,
            the value previously hard-coded).
        random_state: Seed forwarded to ``train_test_split``; ``None``
            (default) keeps the previous unseeded behaviour.

    Returns:
        dict mapping GO id -> {centrality index tuple -> result}, where each
        result is ``{"prediction": predicted labels for the test part,
        "rms": value of model.score on the test part}``. Note that despite
        its key name, ``"rms"`` holds whatever ``model.score`` returns, not a
        root-mean-square error.

    Side effects:
        Prints ``"<skipped>/<total>"``: the number of GO terms skipped for
        having a single training class, and the number of GO terms seen.
    """
    go_matrix, measure_matrix, go_ids, _ = generate_measure_matrix()
    predictions = {}
    # These counters were never updated before, so the summary was always 0/0.
    zero_class, total = 0, 0
    for ind, go_row in enumerate(go_matrix):
        total += 1
        samples = list(zip(go_row, measure_matrix[0], measure_matrix[1], measure_matrix[2]))
        train_data, test_data = train_test_split(
            samples, test_size=test_size, random_state=random_state)
        input_data = build_model_input(train_data)
        pred_data = build_model_input(test_data)

        # Ignore trainings with only one class. The previous check
        # (``not sum(labels)``) only caught the all-zero case and let an
        # all-positive split crash inside the classifier's fit().
        if len(set(input_data[0])) < 2:
            zero_class += 1
            continue

        go_id_models_pred = {}
        for clf, centralities in get_model(input_data):
            # Column 0 of pred_data holds the labels, so centrality c lives
            # at index c + 1.
            pred_input = np.asarray([pred_data[c + 1] for c in centralities]).T
            go_id_models_pred[tuple(centralities)] = {
                "prediction": clf.predict(pred_input),
                "rms": clf.score(pred_input, pred_data[0]),
            }
        predictions[go_ids[ind]] = go_id_models_pred
    print("{}/{}".format(zero_class, total))
    return predictions

In [43]:
# Run the per-GO-term prediction pipeline; `a` is the dict it returns,
# keyed by GO id.
a = predict_lin_regression()


  if __name__ == '__main__':


12/100
