In [None]:
!pip -q install transformers

nltk.download('punkt') # if necessary...

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk, string
from google.colab import drive
import torch
from keras_preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,  AutoModelForSequenceClassification
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


In [None]:
# Show full abstracts instead of truncating long text columns
pd.set_option('display.max_colwidth', None)

# The dataset lives on Google Drive; mount it so the path below resolves on a
# fresh Colab runtime (a repeated call on an already-mounted drive is harmless)
drive.mount('/content/drive')

data_path = "/content/drive/MyDrive/Data/combined.csv"
df = pd.read_csv(data_path, index_col = 0)
print(df.shape)

(49801, 1)


In [None]:
def preprocess_data(data_path, sample_size):
  """Load the article corpus and keep only the rows that have an abstract.

  Args:
    data_path: Location of the CSV file; its first column is used as the index.
    sample_size: Currently unused -- random sub-sampling is disabled, so every
      row that has an abstract is returned.

  Returns:
    A DataFrame re-indexed from 0 with no missing values in 'abstract'.
  """
  return (
      pd.read_csv(data_path, index_col=0)
      .dropna(subset=['abstract'])   # articles without an abstract are dropped
      .reset_index(drop=True)
  )
  
# Build the reference corpus. The sample size is passed through, but
# preprocess_data currently ignores it, so every abstract is kept.
source_data = preprocess_data(data_path, sample_size=20000)
print(source_data.shape)
display(source_data.head(2))

(49801, 1)


Unnamed: 0,abstract
0,"Transformers have shown great potential in computer vision tasks. A common\nbelief is their attention-based token mixer module contributes most to their\ncompetence. However, recent works show the attention-based module in\ntransformers can be replaced by spatial MLPs and the resulted models still\nperform quite well. Based on this observation, we hypothesize that the general\narchitecture of the transformers, instead of the specific token mixer module,\nis more essential to the model's performance. To verify this, we deliberately\nreplace the attention module in transformers with an embarrassingly simple\nspatial pooling operator to conduct only the most basic token mixing.\nSurprisingly, we observe that the derived model, termed as PoolFormer, achieves\ncompetitive performance on multiple computer vision tasks. For example, on\nImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned\nvision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy\nwith 35%/52% fewer parameters and 48%/60% fewer MACs. The effectiveness of\nPoolFormer verifies our hypothesis and urges us to initiate the concept of\n""MetaFormer"", a general architecture abstracted from transformers without\nspecifying the token mixer. Based on the extensive experiments, we argue that\nMetaFormer is the key player in achieving superior results for recent\ntransformer and MLP-like models on vision tasks. This work calls for more\nfuture research dedicated to improving MetaFormer instead of focusing on the\ntoken mixer modules. Additionally, our proposed PoolFormer could serve as a\nstarting baseline for future MetaFormer architecture design. Code is available\nat https://github.com/sail-sg/poolformer"
1,"A critical aspect of reliable communication involves the design of codes that\nallow transmissions to be robustly and computationally efficiently decoded\nunder noisy conditions. Advances in the design of reliable codes have been\ndriven by coding theory and have been sporadic. Recently, it is shown that\nchannel codes that are comparable to modern codes can be learned solely via\ndeep learning. In particular, Turbo Autoencoder (TURBOAE), introduced by Jiang\net al., is shown to achieve the reliability of Turbo codes for Additive White\nGaussian Noise channels. In this paper, we focus on applying the idea of\nTURBOAE to various practical channels, such as fading channels and chirp noise\nchannels. We introduce TURBOAE-TI, a novel neural architecture that combines\nTURBOAE with a trainable interleaver design. We develop a carefully-designed\ntraining procedure and a novel interleaver penalty function that are crucial in\nlearning the interleaver and TURBOAE jointly. We demonstrate that TURBOAE-TI\noutperforms TURBOAE and LTE Turbo codes for several channels of interest. We\nalso provide interpretation analysis to better understand TURBOAE-TI."


In [None]:
# Notebook-level text-processing objects; preprocess_text reads `stemmer` and `tokenizer`
stemmer = PorterStemmer()
tokenizer = word_tokenize
# TF-IDF vectorizer: fitted once on the preprocessed corpus, then reused to transform each query
tfidf = TfidfVectorizer()

# Normalise raw text into the stemmed form that is passed to the TF-IDF vectorizer
def preprocess_text(text):
    """Tokenize `text`, stem every token and re-join them with single spaces.

    Uses the notebook-level `tokenizer` and `stemmer` objects.

    Returns:
        One string holding the stemmed tokens separated by ' '.
    """
    return ' '.join(stemmer.stem(word) for word in tokenizer(text))

In [None]:
# Stem every abstract, then fit the TF-IDF vocabulary and build the corpus matrix
preprocessed_data = [preprocess_text(text) for text in source_data['abstract'].tolist()]
training_data = tfidf.fit_transform(preprocessed_data)

# Preview a couple of entries only -- displaying the full list would write
# every processed abstract into the notebook output
display(preprocessed_data[:2])

["transform have shown great potenti in comput vision task . a common belief is their attention-bas token mixer modul contribut most to their compet . howev , recent work show the attention-bas modul in transform can be replac by spatial mlp and the result model still perform quit well . base on thi observ , we hypothes that the gener architectur of the transform , instead of the specif token mixer modul , is more essenti to the model 's perform . to verifi thi , we deliber replac the attent modul in transform with an embarrassingli simpl spatial pool oper to conduct onli the most basic token mix . surprisingli , we observ that the deriv model , term as poolform , achiev competit perform on multipl comput vision task . for exampl , on imagenet-1k , poolform achiev 82.1 % top-1 accuraci , surpass well-tun vision transformer/mlp-lik baselin deit-b/resmlp-b24 by 0.3 % /1.1 % accuraci with 35 % /52 % fewer paramet and 48 % /60 % fewer mac . the effect of poolform verifi our hypothesi and u

In [None]:
def is_plagiarism(similarity_score, plagiarism_threshold):
  """Tell whether a similarity score reaches the plagiarism threshold.

  Both arguments must be expressed on the same scale; the comparison is
  inclusive, so a score equal to the threshold counts as plagiarism.

  Returns:
    True when similarity_score >= plagiarism_threshold, otherwise False.
  """
  return bool(similarity_score >= plagiarism_threshold)

def run_plagiarism_analysis(query_text, plagiarism_threshold=0.8, top_n=10):
    """Rank the corpus abstracts by TF-IDF cosine similarity to `query_text`.

    Relies on notebook-level state: the fitted `tfidf` vectorizer, the
    `training_data` matrix it produced, and `source_data`, whose 'abstract'
    rows are in the same order as the rows of `training_data`.

    Args:
        query_text: Raw text to check.
        plagiarism_threshold: Cosine similarity, as a fraction (default 0.8),
            at or above which a match is flagged as plagiarism.
        top_n: Number of most similar abstracts to report.

    Returns:
        A list of at most `top_n` dicts, most similar first, each with:
          'similarity_score'      -- cosine similarity multiplied by 100,
          'similarity_percentage' -- that value rounded, with a '%' suffix,
          'similar_article'       -- the matching abstract text,
          'is_plagiarism'         -- True when the match reaches the threshold.

    Raises:
        ValueError: if `top_n` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    # Apply the same stemming used for the corpus, then project the query
    # into the already-fitted TF-IDF space
    preprocessed_document = preprocess_text(query_text)
    query_vector = tfidf.transform([preprocessed_document])

    # Column vector holding one cosine similarity per corpus abstract
    similarities = cosine_similarity(training_data, query_vector)

    # argsort is ascending, so reverse it before keeping the best top_n rows
    top_n_indices = np.argsort(similarities, axis=0)[::-1][:top_n, 0]

    abstracts = source_data['abstract'].tolist()

    similarity_decision = []
    for index in top_n_indices:
        cosine = float(similarities[index, 0])
        percentage = cosine * 100
        similarity_decision.append({
            'similarity_score': percentage,
            'similarity_percentage': str(round(percentage)) + '%',
            'similar_article': abstracts[index],
            # The threshold is a fraction, so it is compared with the raw
            # cosine value rather than with the percentage
            'is_plagiarism': is_plagiarism(cosine, plagiarism_threshold),
        })
    return similarity_decision

In [None]:
# Sanity check: a short conversational sentence rather than a research abstract
new_incoming_text = "hi, hello, how are you?"

# Run the plagiarism detection and show the ranked matches
analysis_result = run_plagiarism_analysis(new_incoming_text, plagiarism_threshold=0.8)
analysis_result

[{'similarity_score': 24.753818061915094,
  'similarity_percentage': '25%',
  'similar_article': "How do you persuade philanthropists to pay $1 million for every pathogenic human virus you discover? Anjali Nayar talks to 'virus hunter' Nathan Wolfe in Cameroon to find out."},
 {'similarity_score': 21.506302762940383,
  'similarity_percentage': '22%',
  'similar_article': 'gas velocity dispersion measures a amount of disordered motions of the rotating disk. accurate estimates of this parameter are of a utmost importance because it was directly linked to disk stability and star formation. the global measure of a gas velocity dispersion should be inferred from a width of a atomic hydrogen hi 21 cm line. we explore how several systematic effects involved inside a production of hi cubes affect a approximate of hi velocity dispersion. we do so by comparing a hi velocity dispersion derived from different types of data cubes provided by a hi nearby galaxy survey (things). we find that residual

In [None]:
# Sanity check: a query whose words are not expected to match the corpus vocabulary
new_incoming_text = "Kaluwan kalu"

# Run the plagiarism detection and show the ranked matches
analysis_result = run_plagiarism_analysis(new_incoming_text, plagiarism_threshold=0.8)
analysis_result

[{'similarity_score': 0.0,
  'similarity_percentage': '0%',
  'similar_article': 'Abstract The complement system is a family of serum and cell surface proteins that recognize pathogen-associated molecular patterns, altered-self ligands, and immune complexes. Activation of the complement cascade triggers several antiviral functions including pathogen opsonization and/or lysis, and priming of adaptive immune responses. In this review, we will examine the role of complement activation in protection and/or pathogenesis against infection by Flaviviruses, with an emphasis on experiments with West Nile and Dengue viruses.'},
 {'similarity_score': 0.0,
  'similarity_percentage': '0%',
  'similar_article': 'a eu solvency ii directive recommends insurance companies to pay more attention to a risk management methods. a sense of risk management was a ability to quantify risk and apply methods that reduce uncertainty. inside life insurance, a risk was the consequence of a random variable describing

In [None]:
new_incoming_text = "BACKGROUND: Human infection studies (HIS) are valuable in vaccine development. Deliberate infection, however, creates challenging questions, particularly in low and middle-income countries (LMICs) where HIS are new and ethical challenges may be heightened. Consultation with stakeholders is needed to support contextually appropriate and acceptable study design. We examined stakeholder perceptions about the acceptability and ethics of HIS in Malawi, to inform decisions about planned pneumococcal challenge research and wider understanding of HIS ethics in LMICs. METHODS: We conducted 6 deliberative focus groups and 15 follow-up interviews with research staff, medical students, and community representatives from rural and urban Blantyre. We also conducted 5 key informant interviews with clinicians, ethics committee members, and district health government officials. RESULTS: Stakeholders perceived HIS research to have potential population health benefits, but they also had concerns, particularly related to the safety of volunteers and negative community reactions. Acceptability depended on a range of conditions related to procedures for voluntary and informed consent, inclusion criteria, medical care or support, compensation, regulation, and robust community engagement. These conditions largely mirror those in existing guidelines for HIS and biomedical research in LMICs. Stakeholder perceptions pointed to potential tensions, for example, balancing equity, safety, and relevance in inclusion criteria. CONCLUSIONS: Our findings suggest HIS research could be acceptable in Malawi, provided certain conditions are in place. Ongoing assessment of participant experiences and stakeholder perceptions will be required to strengthen HIS research during development and roll-out."

# Run the plagiarism detection on text copied verbatim from a corpus abstract;
# the best match is expected to score 100%
analysis_result = run_plagiarism_analysis(new_incoming_text, plagiarism_threshold=0.8)
analysis_result

[{'similarity_score': 100.0,
  'similarity_percentage': '100%',
  'similar_article': 'BACKGROUND: Human infection studies (HIS) are valuable in vaccine development. Deliberate infection, however, creates challenging questions, particularly in low and middle-income countries (LMICs) where HIS are new and ethical challenges may be heightened. Consultation with stakeholders is needed to support contextually appropriate and acceptable study design. We examined stakeholder perceptions about the acceptability and ethics of HIS in Malawi, to inform decisions about planned pneumococcal challenge research and wider understanding of HIS ethics in LMICs. METHODS: We conducted 6 deliberative focus groups and 15 follow-up interviews with research staff, medical students, and community representatives from rural and urban Blantyre. We also conducted 5 key informant interviews with clinicians, ethics committee members, and district health government officials. RESULTS: Stakeholders perceived HIS rese