In [1]:
import pickle
import re
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from snowballstemmer import TurkishStemmer


In [2]:
# Load `queries` from a local pickle file; later cells treat its items as article
# dicts with keys such as 'ArticleTitle', 'Author', 'Date' and 'References'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('articles_2700.pickle', 'rb') as f:
    queries = pickle.load(f)

In [3]:
# Copy of every article in which each 'ArticleTitle' entry is lower-cased, so
# titles can later be compared case-insensitively. Articles without an
# 'ArticleTitle' key get an empty title list. The originals are not modified.
articles_with_lowercase_titles = [
    dict(
        article,
        ArticleTitle=[raw_title.lower() for raw_title in article.get('ArticleTitle', [])],
    )
    for article in queries
]

In [5]:
def info_exc(article):
    """Extract the title, author, year and reference list from an article record.

    Args:
        article: Mapping with the keys "ArticleTitle", "Author", "Date" and
            "References". The first three are joined with ``''.join`` and so
            must be strings or iterables of strings.

    Returns:
        Tuple ``(title, author, year, references)``: title and author have
        newlines turned into spaces and surrounding whitespace stripped, year
        is the first four characters of the joined date, and references is
        returned exactly as stored.

    Raises:
        KeyError: If any of the four keys is missing.
    """
    def _flatten(parts):
        # Concatenate the fragments, turn newlines into spaces, trim the ends.
        return ''.join(parts).replace('\n', ' ').strip()

    title = _flatten(article["ArticleTitle"])
    author = _flatten(article["Author"])
    year = ''.join(article["Date"])[:4]
    references = article["References"]
    return title, author, year, references

In [6]:
# Extract title, author, year and reference list of the article under analysis
# (the same 295312 identifier names the PDF parsed in a later cell).
# NOTE(review): `queries` is iterated directly as article dicts when building
# articles_with_lowercase_titles, but is indexed with 295312 here — confirm the
# container type of `queries`.
title, author,year,references = info_exc(queries[295312])

In [7]:
def pdf_to_text(pdf_file_path):
    """Return the concatenated extracted text of every page of a PDF file.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        One string with the page texts joined in page order, with no separator
        added between pages.
    """
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file handle is still open.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return "".join(page_texts)

In [8]:
# Extract the raw text of the article PDF (path is relative to the working directory).
text=pdf_to_text("295312.pdf")

In [9]:
def split_article(article_text):
    """Return the article body without its reference section, newlines flattened.

    The text is cut at the first reference heading (Kaynakça, Kaynaklar,
    References, Bibliography, Citations or Sonnotlar, case-insensitive) that is
    immediately followed by a newline. If no heading is found the whole text is
    kept. Every newline in the result is replaced by a single space.

    Args:
        article_text: Raw article text, e.g. as extracted from a PDF.

    Returns:
        The body text as a single line.
    """
    # The leading \b stops a heading word from matching inside a longer word:
    # without it "preferences\n" in the body matches "references\n" and the
    # article is truncated there.
    reference_pattern = (
        r"\b(?:Kaynakça|Kaynaklar|References|Bibliography|Citations|Sonnotlar)\n"
    )
    reference_match = re.search(reference_pattern, article_text, re.IGNORECASE)

    if reference_match:
        article_text = article_text[:reference_match.start()]

    return article_text.replace('\n', ' ')

# Keep only the article body: drop the reference section and flatten newlines to spaces.
text = split_article(text)

In [11]:
# Footnote-style journal references embedded in the body text, e.g.
# "12 Author Name (2020). Title. Journal, 4 (1), 1-35."
# The pattern ends at the final period. A trailing \b after that period would
# only match when a word character follows it directly, so references followed
# by a space or the end of the text would never be found.
pattern_to_remove = r'\b(\d+)\s+([\w\s]+)\((\d{4})\)\.\s*([^\.]+)\.\s*([^,]+),\s*(\d+)\s*\((\d+)\),\s*(\d+)-(\d+)\.'
matches = list(re.finditer(pattern_to_remove, text))

# Print every matched reference so the pattern can be checked by eye.
for match in matches:
    matched_sentence = match.group()
    print(matched_sentence)
    print("\n--------------\n")

# Print the total count of matching sentences
count = len(matches)
print(f"Total matches found: {count}")

Total matches found: 0


In [12]:
# Strip the footnote-style references matched by pattern_to_remove from the body text.
text = re.sub(pattern_to_remove, '', text)
# Show only a short preview; echoing the whole article floods the notebook output.
text[:500]

'194 muhafazakâr düşünce  • muhafazakârlık ve iktisatABSTRACT It can be said that the Islamic world, which has to adopt the institutions and rules imposed by the  international monetary system, has largely lost its connection with the traditional Islamic institutions  and practices in this context. The number of studies on the Islamic monetary system in the field  of Islamic economics, which has emerged as a new discipline, is increasing day by day. Some  of them can be described as practical studies to run the system on the same basis by adapting  institutions, rules and tools in the conventional system according to Islamic norms. Some other  studies show the effort to draw a plan for all aspects of a monetary system unique to Islam. In all  these studies, the subject that does not receive the attention it deserves is the monetary standard.  Money is a measure of value. Since the value of every economic asset is measured in money, it is  inevitable that there should be a stability in 

In [14]:
def window(text, loc, target_str, left_window_size=150, right_window_size=100):
    """Return the slice of ``text`` surrounding position ``loc``.

    Args:
        text: String to cut the window from.
        loc: Index around which the window is placed.
        target_str: Not used by the computation; kept for the call signature.
        left_window_size: Number of characters to keep before ``loc``.
        right_window_size: Number of characters to keep from ``loc`` onwards.

    Returns:
        ``text[loc - left_window_size : loc + right_window_size]`` with both
        bounds clamped to the limits of ``text``.
    """
    start = loc - left_window_size
    if start < 0:
        start = 0

    end = loc + right_window_size
    if end > len(text):
        end = len(text)

    return text[start:end]


def find_references_with_context(text, title, author, year):
    """Collect the surrounding text of every parenthesised citation in ``text``.

    A citation is any parenthesised span containing four consecutive digits.
    A citation is skipped when it equals ``title`` (case-insensitively), or
    contains ``author``, or contains ``year``: these are treated as mentions of
    the article itself.

    Args:
        text: Article body to scan.
        title: Title of the article being analysed.
        author: Author string of the article; the author check is skipped
            when empty.
        year: Publication year of the article (converted with ``str``); the
            year check is skipped when empty.

    Returns:
        List of context strings, one per kept citation, in order of
        appearance. Each context spans 150 characters before and 100
        characters after the start of its citation.
    """
    citation_pattern = re.compile(r'\(.*?\d{4}.*?\)')
    title_lower = title.lower()
    year_str = str(year)
    references_with_context = []

    for match in citation_pattern.finditer(text):
        citation = match.group()
        if citation.lower() == title_lower:
            continue
        # An empty string is contained in every string, so an empty author or
        # year would otherwise reject every citation.
        if author and author in citation:
            continue
        if year_str and year_str in citation:
            continue
        # Use this match's own position: text.find(citation) would return the
        # first occurrence and give repeated citations the wrong context.
        context = window(text, match.start(), citation, 150, 100)
        references_with_context.append(context)

    return references_with_context

In [15]:
# Collect the text window around every in-text citation that is not filtered
# out as a mention of the article's own title, author or year.
sentences_with_references=find_references_with_context(text, title, author, year)

In [17]:
# Drop duplicate context windows while keeping first-seen order. A set would
# give an arbitrary order that can differ between kernel runs, so downstream
# results would not be reproducible.
sentences_with_references = list(dict.fromkeys(sentences_with_references))

In [23]:
def extract_titles(references):
    """Extract a lower-cased title from each bibliography entry.

    The primary rule takes the text between "). " (the end of the
    parenthesised year) and the next period. If that rule finds nothing, the
    fallback patterns are tried in order and the first match wins. Entries
    for which nothing matches contribute no title, so the result may be
    shorter than ``references``.

    Args:
        references: Iterable of reference strings.

    Returns:
        List of lower-cased title strings.
    """
    # Primary rule: text following "(year). " up to the next period.
    primary_pattern = r'\)\. (.+?)\.'

    # Fallback patterns for various formats. Built once, not per reference.
    # The "following word" class must list the lowercase Turkish letters;
    # with the uppercase ones a word such as "Öğreti" can never match.
    fallback_patterns = [
        r'\)\. (.*?)\. [A-ZÇĞİÖŞÜ][a-zçğıöşü]+',  # Title after author/year
        r'(?<=\.\s)([A-ZÇĞİÖŞÜ][^,:()]+?)(?:, çev\.|, ed\.|\s\(\d{4}\))',   # Format 1
        r'\. ([^\.]+)\. The Canadian Journal',                              # Format 2
        r'\. ([^\.]+)\. Journal of [A-Z][a-z]+',                           # Format 3
        r'\. ([^\.]+?)\. [A-ZÇĞİÖŞÜ][a-zçğıöşü]+',                        # Format 4
        r'\. (.+?)\. [A-Z][a-z]+'                                          # General fallback
    ]

    titles = []
    for ref in references:
        match = re.search(primary_pattern, ref)
        if match:
            titles.append(match.group(1).lower())
            continue

        for pattern in fallback_patterns:
            match = re.search(pattern, ref)
            if match:
                titles.append(match.group(1).lower())
                break

    return titles

In [27]:
def find_matching_articles(titles, articles=None):
    """Return the abstracts of articles whose title list contains one of ``titles``.

    Args:
        titles: Titles to look up. They are compared by equality against the
            entries of each article's 'ArticleTitle' list, so they must have
            the same casing as the stored titles.
        articles: Iterable of article dicts to search. Defaults to the
            notebook-level ``articles_with_lowercase_titles``. Passing it
            explicitly avoids relying on that global.

    Returns:
        List of the 'Abstract' values of every matching article, in corpus
        order. Articles without an 'ArticleTitle' key are skipped.

    Raises:
        KeyError: If a matching article has no 'Abstract' key.
    """
    if articles is None:
        articles = articles_with_lowercase_titles

    return [
        query["Abstract"]
        for query in articles
        if 'ArticleTitle' in query
        and any(ref in query['ArticleTitle'] for ref in titles)
    ]

In [28]:
def integrated_function(sentences_with_references, references, similarity_threshold=0.25):
    """Link in-text citations to bibliography entries and to corpus abstracts.

    For every citation found in every context window, the most similar entry
    of ``references`` is chosen by TF-IDF cosine similarity. Matches scoring
    above ``similarity_threshold`` are kept; for each one the reference title
    is extracted with ``extract_titles`` and looked up with
    ``find_matching_articles``.

    Args:
        sentences_with_references: Iterable of context strings containing
            parenthesised citations.
        references: List of bibliography entry strings.
        similarity_threshold: Minimum cosine similarity, exclusive, for a
            citation/reference pair to be kept. Defaults to 0.25.

    Returns:
        List of dicts with the keys "window", "citation", "reference",
        "cosine_similarity_score", "title" and "abstract". Empty when there
        are no references to match against.
    """
    citation_pattern = re.compile(r'\(.*?\d{4}.*?\)')

    def extract_citations(context):
        # Every parenthesised span that contains four consecutive digits.
        return citation_pattern.findall(context)

    def calculate_cosine_similarity_tfidf(citation, references):
        # Fit TF-IDF on the citation together with all references, then pick
        # the reference most similar to the citation (row 0).
        all_strings = [citation] + references
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(all_strings)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        best_match_index = similarity_matrix[0, 1:].argmax()
        best_match_reference = references[best_match_index]
        similarity_score = similarity_matrix[0, 1 + best_match_index]

        return best_match_reference, similarity_score

    results = []

    # With no references the similarity row is empty and argmax would raise;
    # there is nothing to match, so return an empty result instead.
    if not references:
        return results

    # The loop variable is named `context` so that it does not shadow the
    # notebook-level window() function.
    for context in sentences_with_references:
        for citation in extract_citations(context):
            matched_reference, similarity_score = calculate_cosine_similarity_tfidf(citation, references)
            if similarity_score > similarity_threshold:
                title = extract_titles([matched_reference])
                matched_article = find_matching_articles(title)
                results.append({
                    "window": context,
                    "citation": citation,
                    "reference": matched_reference,
                    "cosine_similarity_score": similarity_score,
                    "title": title,
                    "abstract": matched_article,  # abstracts of corpus articles with this title
                })

    return results

In [29]:
# Link each in-text citation to its best-matching reference entry and to any
# corpus abstract whose title matches that reference.
results = integrated_function(sentences_with_references, references)

In [30]:
# Display the matched citation records.
results

[{'window': 'toplumun refahının  elde edilmesi zorlaşmaktadır. Bu doğrultuda, paranın değerinde istikrarı sağla - mak para sisteminde en öncelikli konu olmaktadır (Chapra, 1996) Bu çalışmanın amacı, para sisteminde bir ölçüt mekanizması olarak  para standartların',
  'citation': '(Chapra, 1996)',
  'reference': 'Chapra, M. U. (1996). Moneraty Management in An Islamic Economy. Islamic Economic Studies. 4(1). ss.  1–35.',
  'cosine_similarity_score': 0.42545095190410426,
  'title': ['moneraty management in an islamic economy'],
  'abstract': []},
 {'window': 'dartlarını incelemekte dinarın günümüzde nasıl uygulanabileceğini tar - tışmaktadır. Burada para için öne çıkan özellik değer ölçme birimi olmasıdır  (Meera, 2018). Paranın ve para kurumlarının tarihi ve ideolojik değişim sürecini İslami  açıdan ve A',
  'citation': '(Meera, 2018)',
  'reference': 'Meera, A. K. M. (2018). Islamic Gold Dinar: the Historical Standard. International Journal of Islamic Economics and Finance. 1(1). ss.  