In [27]:
import re
import textdistance
import jellyfish
import itertools
import editdistance
import pandas as pd
from collections import Counter
from cdifflib import CSequenceMatcher
from tqdm import tqdm
import pickle

In [101]:
chars=[',',";","\.",":","-","_"]

In [40]:
def rm_splChar(name):
    name = str(name)
    name1 = re.sub(" +","",name)
    regex = "|".join(chars)
    name1 = re.sub(regex,"", name1)
    name2 = re.sub(regex,"", name)
    #val = re.sub('[^A-Za-z]+', '', val)
    return name1, name2

In [4]:
def char_dist(name1, name2):
    name1=rm_splChar(name1)
    name2=rm_splChar(name2)
    return int(Counter(name1)==Counter(name2))

In [31]:
def diff_lib(name1, name2):
    name1=name1.lower()
    name2=name2.lower()
    ratio=CSequenceMatcher(lambda x: x == ' ', name1, name2).ratio()
    return ratio

In [5]:
def jaro_winkler_score(name1, name2):
    jw_score=textdistance.jaro_winkler.normalized_similarity(name1,name2)
    return jw_score

In [6]:
def levenshtein_score(name1, name2):
    leven_score = textdistance.levenshtein.normalized_similarity(name1,name2)
    return leven_score

In [7]:
def hamming_similarity(name1, name2):
    h_score=textdistance.hamming.normalized_similarity(name1,name2)
    return h_score

In [8]:
def jaccard_similarity(name1, name2):
    j_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return j_score

In [9]:
def damerau_levenshtein_similarity(name1, name2):
    dl_score=textdistance.damerau_levenshtein.normalized_similarity(name1,name2)
    return dl_score

In [10]:
def sorensen_dice_similarity(name1, name2):
    sd_score=textdistance.sorensen_dice.normalized_similarity(name1,name2)
    return sd_score

In [11]:
def cosine_similarity(name1, name2):
    c_score=textdistance.jaccard.normalized_similarity(name1,name2)
    return c_score

In [87]:
def calculate_feats(name1, name2):
    sim_score=[]
    #sim_score.append(char_dist(name1, name2))
    sim_score.append(diff_lib(name1,name2))
    #sim_score.append(jaro_winkler_score(name1, name2))
    sim_score.append(levenshtein_score(name1, name2))
    #sim_score.append(hamming_similarity(name1, name2))
    sim_score.append(jaccard_similarity(name1, name2))
    sim_score.append(cosine_similarity(name1, name2))
    sim_score.append(damerau_levenshtein_similarity(name1, name2))
    sim_score.append(sorensen_dice_similarity(name1, name2))
    return sim_score

In [76]:
def lcs(name1, name2):
    match = CSequenceMatcher(None, name1, name2).find_longest_match(0, len(name1), 0, len(name2))
    common_subs=name1[match.a: match.a + match.size]
    name1=re.sub(re.escape(common_subs),"",name1)
    name2=re.sub(re.escape(common_subs),"",name2)
    return name1,name2

In [77]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [57]:
def load_obj(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [16]:
def Merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [17]:
thesis1 = load_obj('./index_files4/similar_thesis/similar_thesis_0_100000.pkl')

In [18]:
thesis2 = load_obj('./index_files4/similar_thesis/similar_thesis_100000_200000.pkl')

In [19]:
thesis3 = load_obj('./index_files4/similar_thesis/similar_thesis_200000_281425.pkl')

In [20]:
tmp1 = Merge(thesis1,thesis2)

In [21]:
final_thesis = Merge(tmp1,thesis3)

In [22]:
f_final_thesis = [(t[0][0],t[1][0],t[2], final_thesis[t][0],final_thesis[t][1]) for t in final_thesis if t[0][0]!=t[1][0]]

In [121]:
f_final_thesis

[('T138459',
  'T2109',
  0.9527476285584368,
  'A comparative study of public and private sector in general insurance',
  'Dynamics of life insurance business: A study of private and public sector in India'),
 ('T172826',
  'T195466',
  0.9554287577373454,
  'Karyomorphometrical analysis and exploration of major essential oil constituents in zingiberaceae',
  'Karyomorphometrical analysis and exploration of major chemical constituents in Dioscoreaceae'),
 ('T155527',
  'T132833',
  0.9928626803485257,
  'Fabrication electrical and structural characterization of metal schottky contacts on n type indium phosphide inp',
  'Fabrication electrical and structural caracterization of metal schottky contacts on n type indium phosphide inp'),
 ('T106152',
  'T73079',
  0.9163396150386152,
  'Studies on structural electrical and dielectric properties of pani metal oxide conducting nanopolymer composites',
  'Studies on structural and electrical properties of poly ethylene oxide nanocomposite ele

In [23]:
thesis_99 = [a for a in f_final_thesis if a[2] > 0.99]

In [24]:
thesis_99[0]

('T155527',
 'T132833',
 0.9928626803485257,
 'Fabrication electrical and structural characterization of metal schottky contacts on n type indium phosphide inp',
 'Fabrication electrical and structural caracterization of metal schottky contacts on n type indium phosphide inp')

In [102]:
tmp={}
for a in thesis_99:
    n1, n10 = rm_splChar(a[3])
    n2, n20 = rm_splChar(a[4])
    n11, n21 = lcs(n1, n2)
    n101, n201 = lcs(n10,n20)
    if (len(n11) > 5 and len(n21)>5): 
        vec1 = calculate_feats(n11, n21)
        vec2 = calculate_feats(n11.lower(), n21.lower())
    else:
        vec1 = calculate_feats(n1, n2)
        vec2 = calculate_feats(n1.lower(), n2.lower())
    if (len(n101)>5 and len(n201)>5): 
        vec3=calculate_feats(n101, n201)
        vec4=calculate_feats(n101.lower(), n201.lower())
    else:
        vec3=calculate_feats(n10, n20)
        vec4=calculate_feats(n10.lower(), n20.lower())
    tmp[a]=(sum(vec1),sum(vec2),sum(vec3),sum(vec4))

In [103]:
tmp

{('T155527',
  'T132833',
  0.9928626803485257,
  'Fabrication electrical and structural characterization of metal schottky contacts on n type indium phosphide inp',
  'Fabrication electrical and structural caracterization of metal schottky contacts on n type indium phosphide inp'): (5.860719874804381,
  5.860719874804381,
  5.874683544303798,
  5.874683544303798),
 ('T4865',
  'T32989',
  1.0000001429913685,
  'Toxicity Evaluation and Hypoglycemic Activity Of Indigenous Medicinal Plants Of Gujarat',
  'Toxicity Evaluation And Hypoglycemic Activity Of Indigenous Medicinal Plants Of Gujarat'): (5.642105263157895,
  6.0,
  5.675324675324675,
  6.0),
 ('T82551',
  'T91408',
  0.9999999588854146,
  'ETHNOMEDICINAL SURVEY OF TRADITIONAL MEDICINAL PLANTS USED BY FOLK VILLAGE PEOPLES OF WESTERN GHATS SOUTHERN INDIA AND ASSESSMENT OF NUTRITIVE VALUES OF THE FRUITS OF ELEOCARPUS OBLONGUS Gartn ELAEOCARPACEAE',
  'THNOMEDICINAL SURVEY OF TRADITIONAL MEDICINAL PLANTS USED BY FOLK VILLAGE PEOPLES 

In [104]:
for a in tmp:
    if len([b for b in tmp[a] if b > 5.00]) >0:
        print(str(a)+":"+str(tmp[a]))
        print("\n")

('T155527', 'T132833', 0.9928626803485257, 'Fabrication electrical and structural characterization of metal schottky contacts on n type indium phosphide inp', 'Fabrication electrical and structural caracterization of metal schottky contacts on n type indium phosphide inp'):(5.860719874804381, 5.860719874804381, 5.874683544303798, 5.874683544303798)


('T4865', 'T32989', 1.0000001429913685, 'Toxicity Evaluation and Hypoglycemic Activity Of Indigenous Medicinal Plants Of Gujarat', 'Toxicity Evaluation And Hypoglycemic Activity Of Indigenous Medicinal Plants Of Gujarat'):(5.642105263157895, 6.0, 5.675324675324675, 6.0)


('T82551', 'T91408', 0.9999999588854146, 'ETHNOMEDICINAL SURVEY OF TRADITIONAL MEDICINAL PLANTS USED BY FOLK VILLAGE PEOPLES OF WESTERN GHATS SOUTHERN INDIA AND ASSESSMENT OF NUTRITIVE VALUES OF THE FRUITS OF ELEOCARPUS OBLONGUS Gartn ELAEOCARPACEAE', 'THNOMEDICINAL SURVEY OF TRADITIONAL MEDICINAL PLANTS USED BY FOLK VILLAGE PEOPLES OF WESTERN GHATS SOUTHERN INDIA AND ASS

('T11622', 'T93605', 0.9999999457992674, 'Contribution of micro finance in economic development through women empowerment', 'Contribution of micro finance in Economic development through Women empowerment'):(5.675324675324675, 6.0, 5.702898550724638, 6.0)


('T163876', 'T175038', 0.9958681912953906, 'ENHANCED PHOTOCATALYTIC DEGRADATION OF CERTAIN ENDOCRINE DISRUPTORS USING ZnO AND M DOPED ZnO M Zr La Ce Mg and Ba NANOSTRUCTURES PREPARED BY A LOW TEMPERATURE SIMPLE PRECIPITATION ROUTE', 'ENHANCED PHOTOCATALYTIC DEGRADATION OF CERTAIN ENDOCRINE DISRUPTORS USING ZnO AND MDOPED ZnO M Zr La Ce Mg and Ba NANOSTRUCTURES PREPARED BY A LOW TEMPERATURE SIMPLE PRECIPITATION ROUTE'):(6.0, 6.0, 5.940404904476761, 5.940404904476761)


('T82602', 'T17515', 1.0000001095383793, 'Kinetic and Mechanistic studies on oxidations with Bromamine T', 'Kinetic and mechanistic studies on oxidations with Bromamine T'):(5.3939393939393945, 6.0, 5.483516483516484, 6.0)


('T180793', 'T68861', 0.9927897970992782, 'S

('T22204', 'T145402', 1.0000000328953575, 'Assessment of groundwater quality in Shirur taluka Pune district Maharashtra', 'Assessment of groundwater quality in Shirur taluka Pune District Maharashtra'):(5.642105263157895, 6.0, 5.65952380952381, 6.0)


('T128165', 'T197136', 0.9999999476198929, 'Bastar ke pathar ka jansankhya bhugol', 'Bastar ke padhar ka janshnkhaya bhugol'):(5.311352657004831, 5.311352657004831, 5.390145801910508, 5.390145801910508)


('T135675', 'T52020', 1.0000000565083074, 'Biostratigraphy of the marine neogene sediments of east coast of india', 'Biostratigraphy of the marine neogene sediments of east coast of India'):(5.884426229508197, 6.0, 5.900804828973843, 6.0)


('T144647', 'T8602', 0.9999999930414603, 'Studies on liquid crystals', 'Studies on Liquid crystals'):(5.336363636363637, 6.0, 5.4423076923076925, 6.0)


('T47989', 'T82223', 0.9951093178863617, 'Investigation on the electronic spectra of some organic compounds', 'Investigations on the electronic spect

('T92834', 'T42309', 0.9999999194712254, 'Studies in ascomycetous fungi', 'Studies in discomycetous fungi'):(4.85064935064935, 4.85064935064935, 5.0237362637362635, 5.0237362637362635)


('T141317', 'T69472', 0.9999999506640512, 'Physiology of salt tolerance of plants', 'Physiology of salt tolerance of Plants'):(4.928571428571429, 6.0, 4.928571428571429, 6.0)


('T58745', 'T135825', 0.9924562576019968, 'Studies on surface deformation of copper chalcogenide thin films by double exposure holographic interferometry technique', 'Studies on surface deformation of silver chalcogenide thin films by double exposure holographic interferometry technique'):(5.082719082719082, 5.082719082719082, 5.197994987468671, 5.197994987468671)


('T189134', 'T108797', 0.9999999885718108, "Preparation and properties of liquid - crystalline schiff's base esters", "Preparation and properties of liquid - crystalline Schiff's base esters"):(5.62280701754386, 6.0, 5.65952380952381, 6.0)


('T107177', 'T39251', 0.9


('T29308', 'T29310', 0.9984533692716772, 'The dance drama tradition of kuchipudi bhagvata mela nataka and kuravanji with special reference to rasa theory as expounded in bharatas natyasastra Volume 1', 'The dance drama tradition of kuchipudi bhagvata mela nataka and kuravanji with special reference to rasa theory as expounded in bharatas natyasastra Volume 3'):(5.9405196241017135, 5.9405196241017135, 5.949205837297428, 5.949205837297428)


('T179895', 'T179894', 0.9957702850993259, 'Some issues in the phonology of Gujarati Volume 1', 'Some issues in the phonology of Gujarati Volume 2'):(5.807200929152149, 5.807200929152149, 5.838367346938775, 5.838367346938775)


('T53018', 'T53016', 0.9921289962591046, 'Wooden architecture of Gujarat Volume 1', 'Wooden architecture of Gujarat Volume 2'):(5.768067226890756, 5.768067226890756, 5.797435897435897, 5.797435897435897)


('T163656', 'T163655', 0.9991116853394685, 'Dangi culture through museums collection documentation and interpretation of 


('T110553', 'T57205', 0.9921069212181761, 'Synthesis spectral and Biological studies of some metal complexes', 'Synthesis spectral and biological studies of metal complexes'):(5.271083113188377, 5.453281853281854, 5.2368055847596615, 5.39722724532851)


('T192161', 'T160199', 0.9950561119443895, 'Synthetic analytical and mechanistic investigations of some carbohydrates and bioactive compounds by unusual oxidants', 'Synthetic, analytical and mechanistic investigations of some carbohydrates and bioactive compounds by unusual oxidants'):(6.0, 6.0, 6.0, 6.0)


('T12481', 'T63588', 0.9955911806964912, 'Synthesis and characterization of polymeric membranes for the pervaporation separation of aqueous organic mixtures', 'Synthesis and characterization of new polymeric membranes for pervaporation separation of aqueous organic mixtures'):(5.542328042328042, 5.542328042328042, 5.512360135310955, 5.512360135310955)


('T96953', 'T101156', 1.0000000107031686, 'A Correlational Study Of Academic Sel

In [98]:
ment = pd.read_csv("index_files4/final_mod_ment_w_baseline_gen4_2.csv",sep=",")

In [117]:
ment[ment['thesisId']=='T160261']

Unnamed: 0,advisorId,researcherId,advisor_name,researcher_name,dc.date.submitted[],dc.language.iso[],publisher_dept,DepartmentId,publisher_institution,instituteId,...,dc.source.uri,dc.description.abstract,dc.title.alternative,advisor_inst_dept,stud_inst_dept,adv_names_rid,res_names_rid,advId,resId,N_DepartmentId
149065,71851,235382,"Gupta, Madhu",Seema,2012-01-01,eng,department of education,D1418,Maharshi Dayanand University,I35,...,http://shodhganga.inflibnet.ac.in/,,,"Gupta, Madhu@I35@D1418",Seema@I35@D1418,"Gupta, Madhu@71851",Seema@235382,71851,235382,D1418


In [118]:
ment[ment['thesisId']=='T177031']

Unnamed: 0,advisorId,researcherId,advisor_name,researcher_name,dc.date.submitted[],dc.language.iso[],publisher_dept,DepartmentId,publisher_institution,instituteId,...,dc.source.uri,dc.description.abstract,dc.title.alternative,advisor_inst_dept,stud_inst_dept,adv_names_rid,res_names_rid,advId,resId,N_DepartmentId
149052,71851,235241,"Gupta, Madhu","Seema, Seema",2012-12-31,eng,department of education,D1418,Maharshi Dayanand University,I35,...,http://shodhganga.inflibnet.ac.in/,,,"Gupta, Madhu@I35@D1418","Seema, Seema@I35@D1418","Gupta, Madhu@71851","Seema, Seema@235241",71851,235241,D1418
