In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
!mkdir -p ./kaggle
!cp kaggle.json ~/.kaggle/

cp: cannot stat 'kaggle.json': No such file or directory


In [4]:
!kaggle datasets download -d asad1m9a9h6mood/news-articles

Dataset URL: https://www.kaggle.com/datasets/asad1m9a9h6mood/news-articles
License(s): CC0-1.0
Downloading news-articles.zip to /content
  0% 0.00/1.73M [00:00<?, ?B/s]
100% 1.73M/1.73M [00:00<00:00, 61.0MB/s]


In [5]:
!unzip /content/news-articles.zip

Archive:  /content/news-articles.zip
  inflating: Articles.csv            


In [6]:
df = pd.read_csv("/content/Articles.csv",encoding = "ISO-8859-1")
df = df.dropna()

In [7]:
df.sample(5)

Unnamed: 0,Article,Date,Heading,NewsType
2411,strong>RIO DE JANEIRO: Pakistan flag has offic...,8/5/2016,Pakistan flag hoisted Rio Olympics villag,sports
2601,strong>ISLAMABAD: Pakistan economy is all set ...,1/17/2017,Pakistans economy set to grow further in 2017 ...,business
1201,KARACHI: Sana Mir will lead a 15-member Pakist...,2/11/2016,Sana Mir to lead Pakistan in Womens World T20,sports
948,"strong>PARIS: Militant attacks, strikes and fl...",8/23/2016,Paris tourism lost 750 mln euros after,business
1026,CAPE TOWN: Ben Stokes scored the second fastes...,1/3/2016,Stokes batters South Africa in blistering double,sports


In [8]:
df.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [9]:
df.sample(4)

Unnamed: 0,Article,Date,Heading,NewsType
279,KARACHI: Pakistan is set to sign a 40-year-lea...,9/9/2015,china pakistan set to sign cpec zone d,business
2028,"Malahide, Ireland: Sri Lanka captain Angelo Ma...",6/19/2016,Mathews tells Sri Lanka to up their game again...,sports
446,strong>TOKYO: Asian stocks moved further away ...,1/25/2016,Asia stocks rise after US snowstorm rescues oi...,business
2599,"strong>DAVOS, SWITZERLAND: Just eight individu...",1/16/2017,World eight richest wealthy half humanity Oxfa...,business


In [10]:
def cleaning(s):
    s = str(s)  # Ensure the input is a string
    s = re.sub(r'\s\W', ' ', s)  # Replace whitespace followed by a non-word character with a space
    s = re.sub(r'\W,\s', ' ', s)  # Replace a non-word character followed by a comma and whitespace with a space
    s = re.sub(r"\d+", "", s)  # Remove all digits
    s = re.sub(r'\s+', ' ', s)  # Replace multiple whitespace characters with a single space
    s = re.sub(r'[!@#$_]', '', s)  # Remove specific special characters (!, @, #, $, _)
    s = s.replace("co", "")  # Remove occurrences of the substring "co"
    s = s.replace("https", "")  # Remove occurrences of the substring "https"
    s = s.replace("[\w*", " ")  # Replace the literal string "[\w*" with a space
    return s  # Return the cleaned string


In [11]:
# applying the preprocessing function on the given textual dataset
text_data = open("/content/Articles.csv",'w')
for idx,item in df.iterrows():
  article = cleaning(item['Article'])
  text_data.write(article)
text_data.close()

In [13]:
text_data

<_io.TextIOWrapper name='/content/Articles.csv' mode='w' encoding='UTF-8'>

In [24]:
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer,GPT2LMHeadModel
from transformers import Trainer,TrainingArguments

In [13]:
# code for loading dataset
def load_dataset(file_path,tokenizer,block_size = 128):
  dataset = TextDataset(
      tokenizer = tokenizer,
      file_path = file_path,
      block_size = block_size
  )
  return dataset

In [14]:
# loading datacollator
def load_data_collator(tokenizer,mlm = False):
  data_collator = DataCollatorForLanguageModeling(
      tokenizer = tokenizer,
      mlm = mlm
  )
  return data_collator

In [25]:
# defining the function for training
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name) # defining the tokenizer
  train_dataset = load_dataset(train_file_path,tokenizer) # loading the textual dataset
  data_collator = load_data_collator(tokenizer) # defining the datacollator
  tokenizer.save_pretrained(output_dir) # saving the tokenizer
  model = GPT2LMHeadModel.from_pretrained(model_name)
  model.save_pretrained(output_dir)
  training_args = TrainingArguments(
      output_dir = output_dir,
      overwrite_output_dir = overwrite_output_dir,
      per_device_train_batch_size = per_device_train_batch_size,
      num_train_epochs = num_train_epochs,
  )
  my_trainer = Trainer(
      model = model,
      args = training_args,
      data_collator = data_collator,
      train_dataset = train_dataset,
  )
  my_trainer.train()
  my_trainer.save_model()



In [26]:
# setting parameters
train_file_path = "/content/Articles.csv"
model_name = "gpt2"
output_dir = "/content/results"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 3
save_steps = 500

In [27]:
# now its training time
train(train_file_path = train_file_path,
      model_name = model_name,
      output_dir = output_dir,
      overwrite_output_dir = overwrite_output_dir,
      per_device_train_batch_size = per_device_train_batch_size,
      num_train_epochs = num_train_epochs,
      save_steps = save_steps)



Step,Training Loss
500,3.6909
1000,3.4251
1500,3.1911
2000,3.1443
2500,3.0392
3000,3.0155


In [28]:
# now comes the testing part
def load_model(model_path):
  model = GPT2LMHeadModel.from_pretrained(model_path)
  return model
def load_tokenizer(tokenizer_path):
  tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
  return tokenizer


In [29]:
def generate_text(sequence,max_length):
  model_path = "/content/results"
  model = load_model(model_path)
  tokenizer = load_tokenizer(model_path)
  ids = tokenizer.encode(f'{sequence}',return_tensors = 'pt')
  final_outputs = model.generate(
      ids,
      do_sample = True,
      max_length = max_length,
       pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,

  )

In [32]:
def generate_text(sequence,max_length):
  model_path = "/content/results"
  model = load_model(model_path)
  tokenizer = load_tokenizer(model_path)

  # Set pad token id to eos token id if not already set
  if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

  ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
  attention_mask = ids.ne(tokenizer.pad_token_id).long()

  final_outputs = model.generate(
      ids,
      attention_mask=attention_mask,
      do_sample=True,
      max_length=max_length,
      pad_token_id=model.config.eos_token_id,
      top_k=50,
      top_p=0.95,
  )
   print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [36]:
def generate_text(sequence,max_length):
  model_path = "/content/results"
  model = load_model(model_path)
  tokenizer = load_tokenizer(model_path)

  # Set pad token id to eos token id if not already set
  if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

  ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
  attention_mask = ids.ne(tokenizer.pad_token_id).long()

  final_outputs = model.generate(
      ids,
      attention_mask=attention_mask,
      do_sample=True,
      max_length=max_length,
      pad_token_id=model.config.eos_token_id,
      top_k=50,
      top_p=0.95,
  )
  return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [43]:
sequence = input()
max_len = int(input())
generate_text(sequence,max_len)


machine learning a
30


'machine learning a billion euro tax has been introduced for foreigners living in the EU and to be used for tax evasion.It has also been included in a'