# XML Parse Eden Notebook

## Tek bir xml dosyasında test

In [None]:
from bs4 import BeautifulSoup
import os
import pickle
from Levenshtein import distance
from fuzzywuzzy import fuzz
import itertools
import copy
import pandas as pd

In [None]:

tei_doc = 'grobid_xmls/dergi1_24.grobid.tei.xml'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere and would garble non-ASCII text.
# NOTE(review): assumes the TEI files are UTF-8 encoded — confirm.
with open(tei_doc, 'r', encoding='utf-8') as tei:
    soup = BeautifulSoup(tei, 'lxml')


In [None]:
# Display the `title` element that BeautifulSoup exposes as `soup.title`.
soup.title

In [None]:
# Display the whole parsed document.
soup

In [None]:
# Display only the text content of the document's `title` element.
soup.title.getText()

## Gerekli Fonksiyonlar

In [None]:
def read_files_in_directory_quoted(directory_path):
    """Map every openable ``.txt`` file in *directory_path* to an empty list.

    The key is the file name without its ``.txt`` suffix. The file is only
    opened, never read: a file that cannot be opened is reported on stdout
    and left out of the result.

    Args:
        directory_path: Directory whose direct entries are inspected.

    Returns:
        dict mapping each base name to a fresh empty list.
    """
    suffix = '.txt'
    collected = {}
    for filename in os.listdir(directory_path):
        if not filename.endswith(suffix):
            continue
        candidate = os.path.join(directory_path, filename)
        try:
            # Opening is used purely as a readability check.
            with open(candidate, 'r'):
                collected[filename[:-len(suffix)]] = []
        except Exception as e:
            print(f"Error reading '{filename}': {e}")
    return collected

In [None]:
def read_files_in_directory_quoters(directory_path):
    """Collect the ``.txt`` listing of every sub-directory of *directory_path*.

    Each sub-directory is passed to ``read_files_in_directory_quoted``;
    sub-directories whose listing comes back empty are left out.

    Args:
        directory_path: Directory whose direct sub-directories are scanned.

    Returns:
        dict mapping sub-directory name to that sub-directory's listing dict.
    """
    listings = {}
    for foldername in os.listdir(directory_path):
        subdir = os.path.join(directory_path, foldername)
        if not os.path.isdir(subdir):
            continue
        listing = read_files_in_directory_quoted(subdir)
        # Only keep sub-directories that produced at least one entry.
        if listing:
            listings[foldername] = listing
    return listings


In [None]:
def read_files_in_directory_quoters_alternative(directory_path):
    """Return ``{sub-directory name: []}`` for *directory_path*.

    Only direct children that are directories become keys; each key gets its
    own new empty list.
    """
    return {
        entry: []
        for entry in os.listdir(directory_path)
        if os.path.isdir(os.path.join(directory_path, entry))
    }

In [None]:
def get_surrounding_text(tag, context_size=30):
    """Return the text of *tag* plus up to *context_size* characters each side.

    The window is cut from the text of ``tag.parent``. The tag's own text is
    located with ``str.find``, so when the same text occurs more than once in
    the parent, the window is taken around the first occurrence.

    Args:
        tag: Element offering ``get_text()`` and a ``parent`` that does too.
        context_size: Number of characters to keep before and after the match.

    Returns:
        The slice of the parent's text around the located match.
    """
    parent_text = tag.parent.get_text()
    needle = tag.get_text()
    hit = parent_text.find(needle)

    # Clamp the left edge so the window never starts before the text does.
    window_start = hit - context_size
    if window_start < 0:
        window_start = 0
    window_end = hit + len(needle) + context_size

    return parent_text[window_start:window_end]

In [None]:
def extract_bibl_info(bibl_struct):
    """Pull title, author names and date out of one bibliography element.

    Args:
        bibl_struct: Element offering ``find`` / ``find_all``; its children
            must offer ``get_text``.

    Returns:
        ``(title, authors, date)`` where *title* and *date* are the text of
        the first ``title`` / ``date`` child ('' when absent) and *authors*
        is a list of "forename surname" strings; authors whose name ends up
        empty are left out.
    """
    def _child_text(parent, tag_name):
        # Text of the first matching child, or '' when there is none.
        child = parent.find(tag_name)
        return child.get_text() if child else ''

    title = _child_text(bibl_struct, 'title')

    full_names = (
        f"{_child_text(author, 'forename')} {_child_text(author, 'surname')}".strip()
        for author in bibl_struct.find_all('author')
    )
    authors = [name for name in full_names if name]

    date = _child_text(bibl_struct, 'date')

    return (title, authors, date)

In [None]:
# Every <ref type="bibr"> element of the parsed document (in-text references).
filtered_ref_tags = soup.find_all('ref', {'type': 'bibr'})
# Every bibliography entry. The tag name is written in lowercase here —
# presumably because the 'lxml' parser lowercases tag names; verify.
filtered_ref_tags_on_ref = soup.find_all('biblstruct')

In [None]:
# Plain text of each bibliography entry found above.
filtered_ref_tags_on_ref_text = [e.getText() for e in filtered_ref_tags_on_ref]

In [None]:
# NOTE(review): not referenced by any other cell visible here — possibly a leftover.
context_texts = []

## Tek xml'i parse etmek için loop

In [None]:
# (context, bibliographic info) pairs for the single parsed document.
context_and_bibl_info_single = []

for ref in filtered_ref_tags:
    # Text window around the in-text reference.
    context_text = get_surrounding_text(ref)

    # The reference points at its bibliography entry through `target`.
    ref_id = ref.get('target')
    if not ref_id:
        continue

    ref_id = ref_id.lstrip('#')
    print(ref_id)

    # Look up the bibliography entry carrying that id; skip dangling targets.
    bibl_struct = soup.find('biblstruct', {'xml:id': ref_id})
    if not bibl_struct:
        continue

    bibl_info = extract_bibl_info(bibl_struct)
    context_and_bibl_info_single.append((context_text, bibl_info))

In [None]:
# Print each collected pair as a labelled, separator-terminated record.
for context, bibl_info in context_and_bibl_info_single:
    print(
        "Context:",
        context,
        "Bibliographic Information:",
        bibl_info,
        "-----",
        sep="\n",
    )

## Birden Fazla XML için Parse Etmek

In [None]:
# One list per parsed file: [document title, (context, bibl_info), ...].
context_and_bibl_info_all = []
folder_path = 'grobid_xmls'

# sorted() makes the output order independent of the file system's
# arbitrary directory-listing order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # Skip macOS metadata and anything that is not a regular file, as the
    # quoters loop in this notebook already does for '.DS_Store'.
    if filename == '.DS_Store' or not os.path.isfile(file_path):
        continue

    # Decode explicitly as UTF-8 rather than with the platform default.
    # NOTE(review): assumes the TEI files are UTF-8 encoded — confirm.
    with open(file_path, 'r', encoding='utf-8') as tei:
        soup = BeautifulSoup(tei, 'lxml')

    filtered_ref_tags = soup.find_all('ref', {'type': 'bibr'})

    # The first element of every per-file list is the document's own title.
    title_tag_xml = soup.find('title')
    title_of_xml = title_tag_xml.get_text() if title_tag_xml else ''
    context_and_bibl_info = [title_of_xml]

    for ref in filtered_ref_tags:
        # Text window around the in-text reference.
        context_text = get_surrounding_text(ref, context_size=50)

        # The reference points at its bibliography entry through `target`.
        ref_id = ref.get('target')
        if not ref_id:
            continue
        ref_id = ref_id.lstrip('#')

        # Look up the bibliography entry carrying that id; skip dangling targets.
        bibl_struct = soup.find('biblstruct', {'xml:id': ref_id})
        if not bibl_struct:
            continue

        bibl_info = extract_bibl_info(bibl_struct)
        context_and_bibl_info.append((context_text, bibl_info))

    context_and_bibl_info_all.append(context_and_bibl_info)

In [None]:
# Display the per-file results collected above.
context_and_bibl_info_all

## Quoters Yapısına Uygun Şekilde Parse Etmek

In [None]:
# One empty list per sub-directory of 'all_all'.
dict_0 = read_files_in_directory_quoters_alternative('all_all')
# Independent deep copies, both taken while every list is still empty.
# NOTE(review): later cells fill dict_0 and dict_0_clone, but no visible cell
# fills dict_0_final before it is read when the DataFrame rows are built — confirm.
dict_0_clone = copy.deepcopy(dict_0)
dict_0_final = copy.deepcopy(dict_0)

In [None]:
# Display the (still empty) clone.
dict_0_clone

In [None]:
# Not filled in this cell; kept because other cells may reference the name.
context_and_bibl_info_all_0 = []
folder_path_0 = 'all_all'
# A cited title counts as "this folder's article" when its fuzz.ratio score
# against the folder name is strictly above this value.
TITLE_MATCH_THRESHOLD = 50

# Each sub-directory of folder_path_0 is named after a quoted article and
# holds the files of the documents that quote it.
for folder in os.listdir(folder_path_0):
    folder_path = os.path.join(folder_path_0, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)

        # Skip macOS metadata, .txt files and anything that is not a
        # regular file (e.g. nested directories).
        if file == '.DS_Store' or file.endswith('.txt') or not os.path.isfile(file_path):
            continue

        # Open the TEI XML file and parse it. Decode explicitly as UTF-8
        # rather than with the platform default.
        # NOTE(review): assumes the TEI files are UTF-8 encoded — confirm.
        with open(file_path, 'r', encoding='utf-8') as tei:
            soup = BeautifulSoup(tei, 'lxml')

        # Find the in-text references.
        filtered_ref_tags = soup.find_all('ref', {'type': 'bibr'})

        # Walk the in-text references.
        for ref in filtered_ref_tags:
            context_text = get_surrounding_text(ref, context_size=50)

            ref_id = ref.get('target')
            if not ref_id:
                continue
            ref_id = ref_id.lstrip('#')

            bibl_struct = soup.find('biblstruct', {'xml:id': ref_id})
            if not bibl_struct:
                continue

            quoted_title_tag = bibl_struct.find('title')
            quoted_title = quoted_title_tag.get_text() if quoted_title_tag else ''

            # Keep the context only when the cited title resembles the folder name.
            if fuzz.ratio(quoted_title.lower(), folder.lower()) > TITLE_MATCH_THRESHOLD:
                dict_0[folder].append(context_text)
                            

            


## Abstractı yanına eklemek için kod

In [None]:
# Absolute, machine-specific path to the pickled article metadata.
articles_2700 = '/Users/borankahraman/ITU/lcr/articles_2700.pickle'

# NOTE: unpickling can execute arbitrary code contained in the file; only
# load pickles that come from a trusted source.
with open(articles_2700, 'rb') as file:
    articles = pickle.load(file)

In [None]:
# Display the loaded article records.
articles

In [None]:
# List of keys to keep
keys_to_keep = ['ArticleTitle', 'Author', 'Date', 'Abstract']

# Field order of every output tuple: (title, author, abstract, date).
# Note that this differs from the order in keys_to_keep.
_TUPLE_FIELDS = ('ArticleTitle', 'Author', 'Abstract', 'Date')

# For each article take the first element stored under every field; a
# missing field contributes a single blank.
articles_filtered = [
    tuple(article.get(field, [' '])[0] for field in _TUPLE_FIELDS)
    for article in articles
]

In [None]:
# Display the (title, author, abstract, date) tuples.
articles_filtered

In [None]:
# Snapshot of dict_0's keys as a list.
dict_0_keys = list(dict_0)

In [None]:
def _articles_starting_with(initials):
    """Articles whose title (tuple element 0) starts with one of *initials*."""
    return [article for article in articles_filtered if article[0][0] in initials]


def _keys_starting_with(initials):
    """dict_0 keys whose first character is one of *initials*."""
    return [key for key in dict_0_keys if key[0] in initials]


# Split both collections by first character so that matching only has to
# compare titles sharing an initial.
int_articles_filtered = _articles_starting_with('0123456789')
int_dict_0_keys = _keys_starting_with('0123456789')
a_articles_filtered = _articles_starting_with(('a', 'A'))
a_dict_0_keys = _keys_starting_with(('a', 'A'))
b_articles_filtered = _articles_starting_with(('b', 'B'))
b_dict_0_keys = _keys_starting_with(('b', 'B'))
c_articles_filtered = _articles_starting_with(('c', 'C'))
c_dict_0_keys = _keys_starting_with(('c', 'C'))
ç_dict_0_keys = _keys_starting_with(('ç', 'Ç'))
ç_articles_filtered = _articles_starting_with(('ç', 'Ç'))
d_articles_filtered = _articles_starting_with(('d', 'D'))
d_dict_0_keys = _keys_starting_with(('d', 'D'))
e_articles_filtered = _articles_starting_with(('e', 'E'))
e_dict_0_keys = _keys_starting_with(('e', 'E'))
f_articles_filtered = _articles_starting_with(('f', 'F'))
f_dict_0_keys = _keys_starting_with(('f', 'F'))
g_articles_filtered = _articles_starting_with(('g', 'G'))
g_dict_0_keys = _keys_starting_with(('g', 'G'))
h_articles_filtered = _articles_starting_with(('h', 'H'))
h_dict_0_keys = _keys_starting_with(('h', 'H'))
i_articles_filtered = _articles_starting_with(('i', 'I'))
i_dict_0_keys = _keys_starting_with(('i', 'I'))
j_articles_filtered = _articles_starting_with(('j', 'J'))
j_dict_0_keys = _keys_starting_with(('j', 'J'))
k_articles_filtered = _articles_starting_with(('k', 'K'))
k_dict_0_keys = _keys_starting_with(('k', 'K'))
l_articles_filtered = _articles_starting_with(('l', 'L'))
l_dict_0_keys = _keys_starting_with(('l', 'L'))
m_articles_filtered = _articles_starting_with(('m', 'M'))
m_dict_0_keys = _keys_starting_with(('m', 'M'))
n_articles_filtered = _articles_starting_with(('n', 'N'))
n_dict_0_keys = _keys_starting_with(('n', 'N'))
o_articles_filtered = _articles_starting_with(('o', 'O'))
o_dict_0_keys = _keys_starting_with(('o', 'O'))
p_articles_filtered = _articles_starting_with(('p', 'P'))
p_dict_0_keys = _keys_starting_with(('p', 'P'))
q_articles_filtered = _articles_starting_with(('q', 'Q'))
q_dict_0_keys = _keys_starting_with(('q', 'Q'))
r_articles_filtered = _articles_starting_with(('r', 'R'))
r_dict_0_keys = _keys_starting_with(('r', 'R'))
s_articles_filtered = _articles_starting_with(('s', 'S'))
s_dict_0_keys = _keys_starting_with(('s', 'S'))
t_articles_filtered = _articles_starting_with(('t', 'T'))
t_dict_0_keys = _keys_starting_with(('t', 'T'))
u_articles_filtered = _articles_starting_with(('u', 'U'))
u_dict_0_keys = _keys_starting_with(('u', 'U'))
v_articles_filtered = _articles_starting_with(('v', 'V'))
v_dict_0_keys = _keys_starting_with(('v', 'V'))
w_articles_filtered = _articles_starting_with(('w', 'W'))
w_dict_0_keys = _keys_starting_with(('w', 'W'))
x_articles_filtered = _articles_starting_with(('x', 'X'))
x_dict_0_keys = _keys_starting_with(('x', 'X'))
y_articles_filtered = _articles_starting_with(('y', 'Y'))
y_dict_0_keys = _keys_starting_with(('y', 'Y'))
z_articles_filtered = _articles_starting_with(('z', 'Z'))
z_dict_0_keys = _keys_starting_with(('z', 'Z'))

In [None]:
def match_strings_alt(list1, list2, in_dict):
    """Attach article metadata to the keys whose text best matches a title.

    Every (article, key) combination is scored with a normalised
    Levenshtein similarity on the lowercased strings. The
    ``min(len(list1), len(list2))`` best-scoring combinations are kept, and
    for each of them the article's author, abstract and date are appended,
    in that order, to ``in_dict[key]``.

    The selection is over combinations, not a one-to-one assignment: the
    same key (or the same article) can be picked more than once.

    Args:
        list1: Article tuples ``(title, author, abstract, date)``.
        list2: Key strings; each must already be a key of *in_dict*.
        in_dict: dict of lists, mutated in place.

    Returns:
        None.
    """
    keep = min(len(list1), len(list2))

    scored = []
    for article, key in itertools.product(list1, list2):
        title_lc = article[0].lower()
        key_lc = key.lower()
        edit_distance = distance(title_lc, key_lc)
        score = 1 - (edit_distance / max(len(title_lc), len(key_lc)))
        scored.append((score, key, article[1], article[2], article[3]))

    # Best score first; the sort is stable, so ties keep product order.
    ranked = sorted(scored, key=lambda row: row[0], reverse=True)

    for _, key, author, abstract, date in ranked[:keep]:
        in_dict[key].extend((author, abstract, date))
    


In [None]:
def _initial_bucket(text):
    """Return the bucket label for *text*'s first character, or None.

    Digits share the label 'integers'; letters are labelled by their
    lowercase form; anything else (blank, punctuation, empty) has no bucket.
    """
    if not text:
        return None
    first = text[0]
    if first in '0123456789':
        return 'integers'
    if first in 'İı':
        # 'İ'.lower() is two code points, so it is folded by hand; both
        # Turkish i-variants go into the same bucket as 'i' / 'I'.
        return 'i'
    if first.isalpha():
        return first.lower()
    return None


# Buckets are derived here directly from articles_filtered / dict_0_keys
# rather than from per-letter variables, so that every initial is matched:
# the 'ç' lists were built but never passed to match_strings_alt, and titles
# starting with ö, ş, ü, İ or ı had no list at all.
article_buckets = {}
for article in articles_filtered:
    label = _initial_bucket(article[0])
    if label is not None:
        article_buckets.setdefault(label, []).append(article)

key_buckets = {}
for key in dict_0_keys:
    label = _initial_bucket(key)
    if label is not None:
        key_buckets.setdefault(label, []).append(key)

# Only initials present on both sides can produce a match. Digits go first,
# then letters in code-point order; buckets never share a key, so the order
# does not influence the result.
labels = sorted(
    set(article_buckets) & set(key_buckets),
    key=lambda name: (name != 'integers', name),
)
for label in labels:
    print(f"on {label}")
    match_strings_alt(article_buckets[label], key_buckets[label], dict_0_clone)

In [None]:
# Print every key that received at least one value, with its values and count.
for key, values in dict_0_clone.items():
    if not values:
        continue
    print("***********")
    print(key)
    print(values)
    print(len(values))
    print("-----------------")

In [None]:
# Column names must equal the keys of the row dicts appended to `df` later
# in the notebook; any mismatch leaves extra columns that stay all-NaN.
column_names = [
    'quoted_title',
    'quoter_context',
    'quoted_author',
    'quoted_abstract',
    'quoted_date',
]
df = pd.DataFrame(columns=column_names)

In [None]:
# Display the still-empty frame.
df

In [None]:
# Each usable dict_0_final value is read as a pair:
#   element 0 -> [author, abstract, date] of the quoted article
#   element 1 -> list of quoter contexts
# NOTE(review): in the cells visible here dict_0_final is only ever a deep
# copy of empty lists, so a cell that fills it seems to be missing — confirm.
rows = []
skipped_keys = 0

for key in dict_0_final.keys():
    e_tuple = dict_0_final[key]

    # Entries that are not a (metadata, contexts) pair cannot be unpacked;
    # count them instead of failing with an IndexError.
    if len(e_tuple) < 2:
        skipped_keys += 1
        continue

    first_e_tuple = e_tuple[0]
    second_e_tuple = e_tuple[1]

    # Author, abstract and date are all required for a row.
    if len(first_e_tuple) < 3:
        continue

    # NOTE(review): titles with exactly one context are left out by this
    # condition — confirm that is intended.
    if len(second_e_tuple) > 1:
        for e in second_e_tuple:
            rows.append({
                'quoted_title': key,
                'quoter_context': e,
                'quoted_author': first_e_tuple[0],
                'quoted_abstract': first_e_tuple[1],
                'quoted_date': first_e_tuple[2],
            })

if skipped_keys:
    print(f"Skipped {skipped_keys} entries without (metadata, contexts) data")

# One concatenation instead of a private DataFrame._append call per row.
if rows:
    df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)


In [None]:
# Display the assembled frame.
df

In [None]:
#df.to_csv('lcr_input.csv', index=False)