In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

In [56]:
df = pd.read_csv('train.csv')

In [59]:
df.head()

Unnamed: 0,abstract,title
0,we consider the problem of utility maximizatio...,on optimal investment with processes of long o...
1,in this paper we provide an explicit formula f...,boolean complexes for ferrers graphs
2,"kinesin-5, also known as eg5 in vertebrates is...",relative velocity of sliding of microtubules b...
3,we discuss the transition paths in a coupled b...,bifurcation of transition paths induced by cou...
4,two types of room temperature detectors of ter...,all-electric detectors of the polarization sta...


In [2]:
# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [3]:
# Function to generate article names
def generate_article_name(article_text, model, tokenizer):
    input_ids = tokenizer.encode(article_text, return_tensors="pt")
    # Generate names
    output = model.generate(input_ids, max_length=128, top_p=0.95, do_sample=True, pad_token_id=128)
    text = tokenizer.decode(output[0], skip_special_tokens=True)
    return text

In [4]:
# Test the function
article_text = "This article discusses the latest research on artificial intelligence and its potential applications in the field of medicine. TL;DR"
article_name = generate_article_name(article_text, model, tokenizer)
print(article_name)

This article discusses the latest research on artificial intelligence and its potential applications in the field of medicine. TL;DR: You can read a lot about AI using various different methods that we discussed.

This article also discusses the latest research on Artificial Intelligence and its potential applications in the field of medicine. TL;DR: You can read a lot about AI using various different methods that we discussed.

What will be a novel method of artificial intelligence being used will be the "brain scan", as discussed in this article.

What will be a novel method of artificial intelligence being used will be the "brain scan", as discussed in


In [63]:
articles = df.abstract.values
titles = df.title.values

In [67]:
# Prepare the data
# Assume that `articles` is a list of strings, each representing the text of a scientific article
# and `titles` is a list of corresponding titles
input_ids = [tokenizer.encode(article, return_tensors="pt") for article in articles]
labels = [tokenizer.encode(title, return_tensors="pt") for title in titles]
dataset = TensorDataset(input_ids, labels)
data_loader = DataLoader(dataset, batch_size=1000, shuffle=True)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=-1)

# Fine-tune the model
for epoch in range(1, 5):
    total_loss = 0
    for input_ids, labels in data_loader:
        optimizer.zero_grad()
        logits = model(input_ids, labels=labels)
        loss = logits[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")


AttributeError: 'list' object has no attribute 'size'

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Prepare the data
# Assume that `articles` is a list of strings, each representing the text of a scientific article
# and `titles` is a list of corresponding titles
input_ids = [tokenizer.encode(article, return_tensors="pt") for article in articles]
labels = [[1] for _ in range(len(articles))] # 1 for title, 0 for non-title
dataset = TensorDataset(input_ids, labels)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=-1)

# Fine-tune the model
for epoch in range(1, 5):
    total_loss = 0
    for input_ids, labels in data_loader:
        optimizer.zero_grad()
        logits = model(input_ids, labels=labels)
        loss = logits[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load the RuBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model = BertForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=2)

# Prepare the data
# Assume that `articles` is a list of strings, each representing the text of a scientific article
# and `titles` is a list of corresponding titles
input_ids = [tokenizer.encode(article, return_tensors="pt") for article in articles]
labels = [[1] for _ in range(len(articles))] # 1 for title, 0 for non-title
dataset = TensorDataset(input_ids, labels)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=-1)

# Fine-tune the model
for epoch in range(1, 5):
    total_loss = 0
    for input_ids, labels in data_loader:
        optimizer.zero_grad()
        logits = model(input_ids, labels=labels)
        loss = logits[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch}, Loss: {total_loss / len(data_loader)}")


In [None]:
import torch

def generate_article_name(article_text, model, tokenizer):
    input_ids = tokenizer.encode(article_text, return_tensors="pt")
    logits = model(input_ids)[0]
    # Get the index of the highest probability
    max_index = torch.argmax(logits).item()
    # Get the corresponding token
    predicted_token = tokenizer.convert_ids_to_tokens([max_index])[0]
    return predicted_token

# Test the function
article_text = "This article discusses the latest research on artificial intelligence and its potential applications in the field of medicine."
article_name = generate_article_name(article_text, model, tokenizer)
print(article_name)


In [None]:
import torch

def generate_article_name_rubert(article_text, model, tokenizer):
    input_ids = tokenizer.encode(article_text, return_tensors="pt")
    logits = model(input_ids)[0]
    # Get the index of the highest probability
    max_index = torch.argmax(logits).item()
    # Get the corresponding token
    predicted_token = tokenizer.convert_ids_to_tokens([max_index])[0]
    return predicted_token

# Test the function
article_text = "Эта статья обсуждает последние исследования в области искусственного интеллекта и его возможные приложения в медицине."
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
article_name = generate_article_name_rubert(article_text, model, tokenizer)
print(article_name)


In [None]:
import torch

def generate_article_name_rubert(article_text, model, tokenizer):
    input_ids = tokenizer.encode(article_text, return_tensors="pt")
    logits = model(input_ids)[0]
    # Get the index of the highest probability
    max_index = torch.argmax(logits).item()
    # Get the corresponding token
    predicted_token = tokenizer.convert_ids_to_tokens([max_index])[0]
    # Decode the token
    decoded_title = tokenizer.decode(predicted_token)
    return decoded_title

# Test the function
article_text = "Эта статья обсуждает последние исследования в области искусственного интеллекта и его возможные приложения в медицине."
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
article_name = generate_article_name_rubert(article_text, model, tokenizer)
print(article_name)


In [6]:
# Import required libraries
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

# Pre-process data
descriptions = ["This article explores the impact of global warming on the environment.","This study examines the effects of climate change on the Arctic."]
titles = ["The Impact of Global Warming on the Environment","The Effects of Climate Change on the Arctic"]

# Convert strings to tokens
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
encodings = [tokenizer.encode(description, return_tensors='pt', max_length=64, truncation=True) for description in descriptions]

# Load BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Generate titles
generated_titles = []
for encoding in encodings:
  outputs = model.generate(encoding, num_beams=4, min_length=4, max_length=16, early_stopping=True)
  generated_titles.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Print generated titles
print('Generated titles:', generated_titles)

# Print original titles
print('Original titles:', titles)

Generated titles: ['This article explores the impact', 'This study examines the effects']
Original titles: ['The Impact of Global Warming on the Environment', 'The Effects of Climate Change on the Arctic']


In [None]:
# Import required libraries
import torch
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

# Pre-process data
descriptions = ["This article explores the impact of global warming on the environment.","This study examines the effects of climate change on the Arctic."]
titles = ["The Impact of Global Warming on the Environment","The Effects of Climate Change on the Arctic"]

# Convert strings to tokens
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
encodings = [tokenizer.encode(description, return_tensors='pt', max_length=64, truncation=True) for description in descriptions]
targets = [tokenizer.encode(title, return_tensors='pt', max_length=32, truncation=True) for title in titles]

# Load BART model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
for _ in range(10):
  loss = 0
  for encoding, target in zip(encodings, targets):
    outputs = model(encoding, decoder_input_ids=target[:, :-1])
    loss += outputs[0]
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()

# Generate titles
generated_titles = []
for encoding in encodings:
  outputs = model.generate(encoding, num_beams=4, max_length=32, early_stopping=True)
  generated_titles.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Print generated titles
print('Generated titles:', generated_titles)

# Print original titles
print('Original titles:', titles)