In [74]:
import fileinput
import re
from itertools import combinations

import numpy as np
import pandas as pd

In [75]:
# Regex alternation of the quartiles to keep; matched against the lower-cased
# quartile column, hence the normalisation to lower case.
quartiles = 'Q1|Q2|Q3'.lower() #arg
# Rows without a quartile are dropped unless 'q4' is part of the selection.
drop_without_quartile = 'q4' not in quartiles

In [76]:
# Load the publication table; the literal 'Q-нет' is read as a missing value.
wanted_columns = ['Авторы', 'Название публикации', 'Квартиль', 'AuthorID']
core_df = pd.read_excel('Final.xlsx', na_values='Q-нет').loc[:, wanted_columns]

# Keep only rows that have an author.
has_authors = core_df.loc[:, 'Авторы'].notnull()
core_df = core_df.loc[has_authors, :]
print(len(core_df))

# Keep rows whose quartile matches the selection; rows with a missing quartile
# are kept only when drop_without_quartile is False.
quartile_lower = core_df.loc[:, 'Квартиль'].str.lower()
in_selected_quartiles = quartile_lower.str.contains(quartiles, na=(not drop_without_quartile))
core_df = core_df.loc[in_selected_quartiles, :]
print(len(core_df))
core_df.head()

7859
3771


Unnamed: 0,Авторы,Название публикации,Квартиль,AuthorID
0,"Il'ichev, A.T.",Dynamics of front-like water evaporation phase...,Q1,7006402366
7,"Alexandrova, D.",Evolution of detonation wave and parameters of...,Q1,57203815035
9,"Rusanov, A.",Evolution of tribologically induced chemical a...,Q1,55553512600
10,"Rusin, M.Y.",Transient radiative-conductive heat transfer m...,Q1,7003908260
11,"Orlikovsky, N.A.",Interfering surface and localized plasmon: Tun...,Q2,54080136200


# NOTE(review): the column 'Издание' is not among the columns selected when
# core_df is loaded above, so this cell looks like it needs that column to be
# kept first -- confirm before running.
def has_scopus_name(row):
    """Return True when the row's 'Издание' text contains 'Scopus'."""
    return row['Издание'].find('Scopus') != -1


core_df['Scopus'] = core_df.apply(has_scopus_name, axis=1)
core_df = core_df[core_df['Scopus']]

In [77]:
# Number of publication rows per AuthorID.
publication_counts = (
    core_df.loc[:, ['AuthorID', 'Название публикации']]
    .groupby(by=['AuthorID'])
    .count()
)
author_count = publication_counts.rename(columns={'Название публикации': 'Публикаций'})
print(len(author_count))
author_count.head()

1450


Unnamed: 0_level_0,Публикаций
AuthorID,Unnamed: 1_level_1
6503967581,3
6504176114,1
6504204735,3
6504208418,2
6504215409,4


In [78]:
# Attach one author name to every AuthorID (the first name seen for that id).
author_names = core_df.loc[:, ['Авторы', 'AuthorID']].drop_duplicates('AuthorID')
author_count = author_count.merge(
    author_names,
    left_index=True,
    right_on='AuthorID',
    validate='one_to_one',
)
print(len(author_count))
author_count.head()

1450


Unnamed: 0,Публикаций,Авторы,AuthorID
6842,3,"Skorobogatskii, V.N.",6503967581
2582,1,"Kotiev, G.O.",6504176114
1029,3,"Khorokhorov, A.M.",6504204735
399,2,"Slukin, G.P.",6504208418
1742,4,"Strazdovskii, V.V.",6504215409


In [79]:
# Rows grouped by publication title: each group holds the authors of one title.
group_pubs = (
    core_df.loc[:, ['AuthorID', 'Название публикации']]
    .groupby(by=['Название публикации'])
)

# Maps a sorted (AuthorID, AuthorID) tuple to the number of shared publications.
pairs_pub = {}


def process_group_of_authors(df):
    """Count every unordered pair of authors found in one publication group.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single publication; only the 'AuthorID' column is read.

    Side effects
    ------------
    Increments the module-level ``pairs_pub`` counter for each pair; the pair
    is stored as a sorted tuple so (a, b) and (b, a) share one key.
    """
    ids = df.loc[:, 'AuthorID']
    for pair in combinations(ids, 2):
        pair = tuple(sorted(pair))
        pairs_pub[pair] = pairs_pub.get(pair, 0) + 1


# Iterate over the groups explicitly instead of using GroupBy.apply: apply is
# meant for functions that return a value, and some pandas versions evaluate
# the first group more than once, which would double-count its pairs here.
for _title, publication_rows in group_pubs:
    process_group_of_authors(publication_rows)

In [80]:
# One row per author pair; the tuple key lands in the 'index' column.
pairs_count = pd.DataFrame.from_dict(
    pairs_pub, orient='index', columns=['count']
).reset_index()
# Split the (id1, id2) tuple into two separate columns, then drop the tuple.
pairs_count[['id1', 'id2']] = pairs_count.loc[:, 'index'].apply(pd.Series)
pairs_count = pairs_count.drop(columns=['index'])
pairs_count.head()

Unnamed: 0,count,id1,id2
0,2,55538667500,56004471700
1,2,6603262781,7003810125
2,1,7003810125,55890594600
3,1,6603262781,55890594600
4,1,8670550300,57128559400


In [81]:
#convert dataset to vis.js format
edges = pairs_count.rename(columns={'count':'value', 'id1':'from', 'id2':'to'})
edges.loc[:, 'title'] = 'Совместных работ: ' + edges.loc[:, 'value'].astype(str)

nodes = author_count.rename(columns={'Авторы':'label', 'AuthorID':'id', 'Публикаций':'mass'}) #title
# Radius of a circle whose area equals the publication count.
nodes.loc[:, 'value'] = np.sqrt(nodes.loc[:, 'mass'] / np.pi)
nodes.loc[:, 'title'] = 'Публикаций: ' + nodes.loc[:, 'mass'].astype(str)
# Random group in 1..8 (upper bound is exclusive). A dedicated, fixed-seed
# generator keeps the exported files identical between runs without touching
# numpy's global random state.
group_rng = np.random.RandomState(0)
nodes.loc[:, 'group'] = group_rng.randint(1, 9, nodes.shape[0])
print(nodes.head())
print(edges.head())

      mass                 label          id     value          title  group
6842     3  Skorobogatskii, V.N.  6503967581  0.977205  Публикаций: 3      3
2582     1          Kotiev, G.O.  6504176114  0.564190  Публикаций: 1      6
1029     3     Khorokhorov, A.M.  6504204735  0.977205  Публикаций: 3      6
399      2          Slukin, G.P.  6504208418  0.797885  Публикаций: 2      4
1742     4    Strazdovskii, V.V.  6504215409  1.128379  Публикаций: 4      4
   value         from           to                title
0      2  55538667500  56004471700  Совместных работ: 2
1      2   6603262781   7003810125  Совместных работ: 2
2      1   7003810125  55890594600  Совместных работ: 1
3      1   6603262781  55890594600  Совместных работ: 1
4      1   8670550300  57128559400  Совместных работ: 1


In [82]:
def replace_str(filename, text_to_search, replacement_text):
    """Replace every occurrence of a substring inside a file, in place.

    The file is processed line by line: while the FileInput object is open
    with ``inplace=True`` standard output is redirected into the file, so
    each printed line overwrites the corresponding original line.

    Parameters
    ----------
    filename : str
        Path of the file to rewrite.
    text_to_search : str
        Substring to look for in each line.
    replacement_text : str
        Text written instead of every match.
    """
    rewriter = fileinput.FileInput(filename, inplace=True)
    with rewriter:
        for current_line in rewriter:
            updated_line = current_line.replace(text_to_search, replacement_text)
            # Lines keep their own newline, so none is appended here.
            print(updated_line, end='')

In [83]:
edges_filename = 'edges {}.json'.format(quartiles)
nodes_filename = 'nodes {}.json'.format(quartiles)

edges.to_json(edges_filename, orient='records', force_ascii=True)
nodes.to_json(nodes_filename, orient='records', force_ascii=True)

# Strip the quotes around the property names in the exported files, one key
# at a time (each call rewrites the whole file).
for key in ('title', 'value', 'group', 'id', 'label', 'mass', 'x', 'y'):
    replace_str(nodes_filename, '"' + key + '"', ' ' + key + ' ')

for key in ('value', 'from', 'title', 'to', 'hidden'):
    replace_str(edges_filename, '"' + key + '"', ' ' + key + ' ')

## Setting up node positions

In [84]:
# Node coordinates saved for the current quartile selection.
positions_path = 'positions_data/' + quartiles + '_positions.json'
positions = pd.read_json(positions_path, orient='records')
print(len(positions))
positions.head()

1450


Unnamed: 0,id,x,y
0,6503967581,31404,-15817
1,6504176114,13623,-10255
2,6504204735,1804,617
3,6504208418,-7644,23182
4,6504215409,30912,-15054


In [85]:
# Attach the coordinates to each node (exactly one position row per node id)
# and list the nodes with the largest 'mass' first.
nodes_with_coord = (
    nodes
    .merge(right=positions, left_on='id', right_on='id', validate='one_to_one')
    .sort_values('mass', ascending=False)
)

In [86]:
edges_filename = 'edges {}.json'.format(quartiles)
nodes_filename = 'nodes_coord {}.json'.format(quartiles)

edges.to_json(edges_filename, orient='records', force_ascii=True)
nodes_with_coord.to_json(nodes_filename, orient='records', force_ascii=True)

# Strip the quotes around the property names in the exported files, one key
# at a time (each call rewrites the whole file).
for key in ('title', 'value', 'group', 'id', 'label', 'mass', 'x', 'y'):
    replace_str(nodes_filename, '"' + key + '"', ' ' + key + ' ')

for key in ('value', 'from', 'title', 'to', 'hidden'):
    replace_str(edges_filename, '"' + key + '"', ' ' + key + ' ')

# Preview of the raw author column that the name-splitting helpers work on.
stucked_names = core_df['Авторы']
stucked_names.head()

def split_names(stucked_names_entry):
    """Split a multi-line string into a list of cleaned, non-empty lines.

    Each line has its leading/trailing whitespace removed and inner runs of
    whitespace collapsed to a single space; lines that end up empty are
    dropped.
    """
    cleaned_lines = (
        ' '.join(raw_line.split())
        for raw_line in stucked_names_entry.split('\n')
    )
    return [line for line in cleaned_lines if line]

def pairwise(t):
    """Group consecutive items into non-overlapping pairs: (t0, t1), (t2, t3), ...

    A trailing item without a partner is dropped. The result is a lazy
    ``zip`` iterator.
    """
    cursor = iter(t)
    # The same iterator is passed twice, so each pair consumes two items.
    return zip(*[cursor] * 2)

# Known faculty codes.
_FACULTY_CODES = (
    'МТ', 'ИУ', 'РЛ', 'ФН', 'СМ', 'Э', 'РК', 'БМТ', 'Л', 'ИБМ',
    'СГН', 'РКТ', 'АК', 'ПС', 'РТ', 'ОЭП', 'ЮР', 'ГУИМЦ', 'ФМОП', 'ФВО',
)

# A code only counts when it is not glued to another letter on either side,
# so 'Л' is not found inside 'РЛ' and 'МТ' is not found inside 'БМТ'; digits
# and punctuation next to the code are fine. Longer codes are listed first so
# that e.g. 'РКТ' is tried before 'РК'.
_FACULTY_PATTERN = re.compile(
    r'(?<![^\W\d_])(?:'
    + '|'.join(sorted(_FACULTY_CODES, key=len, reverse=True))
    + r')(?![^\W\d_])'
)


def detect_faculty(str_with_faculty):
    """Return the faculty code mentioned in a string, or 'Other'.

    The previous implementation keyed a dict by ``str.find`` offsets and then
    looked up ``True`` (i.e. key ``1``), so it only reported a code that
    happened to start at index 1 of the string.

    Parameters
    ----------
    str_with_faculty : str
        Text that may contain one of the known faculty codes.

    Returns
    -------
    str
        The first (leftmost) faculty code that appears as a standalone
        token, or ``'Other'`` when none is found.
    """
    found = _FACULTY_PATTERN.search(str_with_faculty)
    return found.group(0) if found else 'Other'

def process_group_of_names_df(group_of_names, personal_publications_df, pair_publications_df):
    """Unfinished variant of ``process_group_of_names``.

    Only builds the list of (name, position) pairs whose position text does
    not contain 'Не МГТУ'; the list is not used afterwards, the two DataFrame
    arguments are never read, and the function returns ``None``.
    """
    tokens = split_names(group_of_names)
    bmstu_members = [
        entry for entry in pairwise(tokens)
        if 'Не МГТУ' not in entry[1]
    ]
    
    
    
def process_group_of_names(group_of_names):
    """Update the per-person and per-pair publication counters for one entry.

    The entry is split into cleaned lines which are paired up as
    (name, position text); pairs whose position text contains 'Не МГТУ' are
    skipped.

    Side effects
    ------------
    * ``personal_publications_num[name]`` is a ``[count, faculty]`` list; the
      count is incremented and the faculty is the one detected the first
      time the name is seen.
    * ``pair_publications_num[(name_a, name_b)]`` (names sorted) is
      incremented for every pair of kept members.
    """
    tokens = split_names(group_of_names)
    bmstu_members = [
        entry for entry in pairwise(tokens)
        if 'Не МГТУ' not in entry[1]
    ]

    for name, position in bmstu_members:
        faculty = detect_faculty(position)
        record = personal_publications_num.setdefault(name, [0, faculty])
        record[0] += 1

    for first, second in combinations(bmstu_members, 2):
        names_comb = tuple(sorted((first[0], second[0])))
        pair_publications_num[names_comb] = pair_publications_num.get(names_comb, 0) + 1

# Fresh accumulators, filled by process_group_of_names for every entry.
personal_publications_num, pair_publications_num = {}, {}
stucked_names = core_df['Авторы']
for authors_entry in stucked_names:
    process_group_of_names(authors_entry)

# One row per person: full name, publication count and faculty.
personal_publications_num_df = pd.DataFrame.from_dict(personal_publications_num, orient='index',
                                                      columns=['pub_num', 'faculty'])
personal_publications_num_df.index.names = ['full_name']
personal_publications_num_df = personal_publications_num_df.reset_index()

# Node ids are derived from the name itself (1, 2, 3, ... in row order).
# The built-in id() must not be used for this: it returns the identity of a
# particular string object, so two equal names stored in different objects
# get different ids and the edge endpoints stop matching the node ids.
name_to_id = {
    name: number
    for number, name in enumerate(personal_publications_num_df['full_name'], start=1)
}
personal_publications_num_df['id'] = personal_publications_num_df['full_name'].map(name_to_id)


# One row per pair of co-authors with the number of shared publications.
pair_publications_num_df = pd.DataFrame.from_dict(pair_publications_num, orient='index', columns=['pair_pub_num'])
pair_publications_num_df.index.names = ['names_pair']
pair_publications_num_df = pair_publications_num_df.reset_index()


def get_ids(row):
    """Return the node ids of both names stored in the row's 'names_pair' tuple."""
    first_name, second_name = row['names_pair']
    return pd.Series([name_to_id[first_name], name_to_id[second_name]])


pair_publications_num_df[['name1_id', 'name2_id']] = pair_publications_num_df.apply(get_ids, axis=1)

def append_key_data(row):
    """Build the text '<full_name> (<pub_num>, <faculty>)' for one row."""
    pieces = [
        row.loc['full_name'],
        ' (',
        str(row.loc['pub_num']),
        ', ',
        row.loc['faculty'],
        ')',
    ]
    return ''.join(pieces)

# Text for each node: name, publication count and faculty.
personal_publications_num_df['key_data'] = personal_publications_num_df.apply(append_key_data, axis=1)

# Rename the columns to the property names used in the exported network files
# and keep only those properties.
node_columns = {'key_data': 'title', 'pub_num': 'value', 'faculty': 'group'}
personal_publications_num_network_format_df = (
    personal_publications_num_df
    .rename(columns=node_columns)
    .loc[:, ['title', 'value', 'group', 'id']]
)

edge_columns = {'pair_pub_num': 'value', 'name1_id': 'from', 'name2_id': 'to'}
pair_publications_num_network_format_df = (
    pair_publications_num_df
    .rename(columns=edge_columns)
    .loc[:, ['value', 'from', 'to']]
)

personal_publications_num_network_format_df.to_json('pers_pub_net.json', orient='records')
pair_publications_num_network_format_df.to_json('pair_pub_net.json', orient='records')