## Importing Moduls

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
!pip install transformers



In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopw = stopwords.words('english')


from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data_path = "/content/drive/MyDrive/bbc_data.csv"

In [9]:
df = pd.read_csv(data_path, usecols = ["clean_text", "label"])

In [10]:
df.head()

Unnamed: 0,label,clean_text
0,1,quarterly profits at us media giant timewarn...
1,1,the dollar has hit its highest level against...
2,1,the owners of embattled russian oil giant yu...
3,1,british airways has blamed high fuel prices ...
4,1,shares in uk drinks and food firm allied dom...


In [11]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,511
1,510
2,417
3,401
4,386


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_train, df_test = train_test_split(df, test_size= 0.30, stratify=df.label)

In [14]:
df_train.shape

(1557, 2)

In [15]:
df_test.shape

(668, 2)

In [16]:
%pip install datasets



## Turn pandas dataframe into dataset

In [17]:
from datasets import Dataset, DatasetDict
raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "eval": Dataset.from_pandas(df_test)
})

In [18]:
# Check the datasets
print("Dataset Dict:\n", raw_datasets)

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['label', 'clean_text', '__index_level_0__'],
        num_rows: 1557
    })
    eval: Dataset({
        features: ['label', 'clean_text', '__index_level_0__'],
        num_rows: 668
    })
})


In [19]:
from transformers import  AutoModelForSequenceClassification, AutoTokenizer
checkpoint = "distilbert-base-uncased" # Define which pre-trained model we will be using
classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5) # Get the classifier
tokenizer = AutoTokenizer.from_pretrained(checkpoint) # Get the tokenizer

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
tokenizer_dataset = raw_datasets.map(lambda dataset: tokenizer(dataset["clean_text"], truncation= True), batched= True)

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/668 [00:00<?, ? examples/s]

In [21]:
tokenizer_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'clean_text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1557
    })
    eval: Dataset({
        features: ['label', 'clean_text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 668
    })
})

In [22]:
print(tokenizer_dataset["train"][0])

{'label': 4, 'clean_text': '  actor michael douglas is to coproduce and star in an adventure film about a diamond robbery set in india  the new picture is expected to be similar to douglass action films of the  romancing the stone and the jewel of the nile another hollywood star is being lined up to costar while the rest of the cast will be indian aishwarya rai star of bride and prejudice is the preferred choice of the indian studio involved in the film  on a visit to india the  actor said he hoped to start shooting racing the monsoon next year douglas added that it had been inspired by a wall street journal article about indias angadias who courier money and diamonds around india  the actors own production company further films is working in partnership with two indian filmmaking concerns to bring the picture to the screen shailendra singh the founder of indias percept films said there would be a lot of india in the movie  and that an indian train would play a big role the train will 

In [23]:
tokenizer_dataset = tokenizer_dataset.remove_columns(["__index_level_0__"])
# tokenizer_dataset = tokenizer_dataset.rename_column("label", "labels")
print(tokenizer_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'clean_text', 'input_ids', 'attention_mask'],
        num_rows: 1557
    })
    eval: Dataset({
        features: ['label', 'clean_text', 'input_ids', 'attention_mask'],
        num_rows: 668
    })
})


In [24]:
%pip -q install evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np
import evaluate

In [26]:
# Padding for batch of data that will be fed into model for training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [27]:
# Training args
training_args = TrainingArguments("test-trainer", num_train_epochs=10, evaluation_strategy="epoch",
                                  weight_decay=5e-4, save_strategy="no", report_to="none")



In [28]:
# Metric for validation error
def compute_metrics(eval_preds):
    metric = evaluate.load("accuracy", "f1") # F1 and Accuracy
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [29]:
# Define trainer
trainer = Trainer(
    classifier,
    training_args,
    train_dataset=tokenizer_dataset["train"],
    eval_dataset=tokenizer_dataset["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
# Start the fine-tuning
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.14923,0.959581
2,No log,0.090571,0.983533
3,0.163300,0.209898,0.964072
4,0.163300,0.094527,0.980539
5,0.163300,0.100792,0.982036
6,0.021200,0.126203,0.974551
7,0.021200,0.096604,0.980539
8,0.005400,0.109355,0.980539
9,0.005400,0.101085,0.983533
10,0.005400,0.101263,0.983533


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=1950, training_loss=0.04938158719967573, metrics={'train_runtime': 239.3524, 'train_samples_per_second': 65.051, 'train_steps_per_second': 8.147, 'total_flos': 2062627743283200.0, 'train_loss': 0.04938158719967573, 'epoch': 10.0})

In [31]:
from sklearn.metrics import classification_report

# Make prediction on evaluation dataset
y_pred = trainer.predict(tokenizer_dataset["eval"]).predictions
y_pred = np.argmax(y_pred, axis=-1)

# Get the true labels
y_true = tokenizer_dataset["eval"]["label"]
y_true = np.array(y_true)

# Print the classification report
print(classification_report(y_true, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.994     1.000     0.997       154
           1      0.980     0.967     0.974       153
           2      0.968     0.976     0.972       125
           3      0.992     0.975     0.983       120
           4      0.983     1.000     0.991       116

    accuracy                          0.984       668
   macro avg      0.983     0.984     0.983       668
weighted avg      0.984     0.984     0.984       668



In [33]:
save_directory = "/content/drive/MyDrive/data"

classifier.save_pretrained(save_directory)

tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/data/tokenizer_config.json',
 '/content/drive/MyDrive/data/special_tokens_map.json',
 '/content/drive/MyDrive/data/vocab.txt',
 '/content/drive/MyDrive/data/added_tokens.json',
 '/content/drive/MyDrive/data/tokenizer.json')

In [34]:
token_fine = AutoTokenizer.from_pretrained(save_directory)

In [35]:
model_fine = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [36]:
from transformers import pipeline

In [37]:
pipe = pipeline("text-classification", model= model_fine, tokenizer= token_fine)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [38]:
pipe("I love stock market to do my bussiness")

[{'label': 'LABEL_1', 'score': 0.9990910291671753}]

In [41]:
pipe("Bollywood star Aamir Khan recently opened up about his retirement plans and said that he hopes to work for more than a decade.")

[{'label': 'LABEL_4', 'score': 0.9995751976966858}]

In [None]:
label_mapping = {""}

In [1]:
from transformers import pipeline

In [3]:
from transformers import  AutoModelForSequenceClassification, AutoTokenizer

In [4]:
token_fine = AutoTokenizer.from_pretrained("/content/drive/MyDrive/data")

In [5]:
model_fine = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/data")

In [6]:
pipe = pipeline("text-classification", model= model_fine, tokenizer= token_fine)
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7adfa7045630>

In [21]:
pipe("Bollywood star Aamir Khan recently opened up about his retirement plans and said that he hopes to work for more than a decade.")

[{'label': 'LABEL_4', 'score': 0.9995751976966858}]

In [22]:
classify=pipe("Bollywood star Aamir Khan recently opened up about his retirement plans and said that he hopes to work for more than a decade.")[0]['label']

In [23]:
label_mapping = {'LABEL_0': "Sport", 'LABEL_1': "Business", 'LABEL_2': "Politics",
                 'LABEL_3': "Tech", 'LABEL_4': "Entertainment"}

In [24]:
print(label_mapping[classify])

Entertainment


In [8]:
news= "Wrestler Aman Sehrawat won India's fifth bronze medal at the Paris Olympics. Golfer Aditi Ashok finished in a tie for 29th place and vowed to return in 2028. Manu Bhaker and Sarabjot Singh also won bronze medals."

In [29]:
print(label_mapping[pipe(news)[0]['label']])

Sport


In [30]:
doc= "India is a Sovereign Socialist Secular Democratic Republic with a Parliamentary form of government which is federal in structure with unitary features."

In [31]:
print(label_mapping[pipe(doc)[0]['label']])

Politics


In [32]:
tech = "Nvidia has become the world's most valuable company due to its high-end processors, which are central to the race to dominate artificial intelligence (AI) technology."

In [33]:
print(label_mapping[pipe(tech)[0]['label']])

Tech


In [39]:
import re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

In [41]:
def clean_text(file):
    ## Reading the document.
    with open(file, encoding='utf8') as f:
        for text in f:
             text =text.strip()
             print(text)
    ## Applying text-processing on text.
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    print(text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [44]:
tech = clean_text("/content/tech.txt")

Editor’s note: This blog was updated on Aug. 9 to reflect changes to the availability of ‘Warhammer 40,000: Speed Freeks.’

This GFN Thursday marks 2,000 games in the GeForce NOW library, with five new games joining this week, alongside a demo for Square Enix’s Visions of Mana and a new reward for members playing Elder Scrolls Online.

From epic role-playing games (RPGs) to heart-pounding shooters, the GeForce NOW library offers a variety of adventures for members to dive into anytime, anywhere.

There’s more to come — the highly anticipated action RPG Black Myth: Wukong from Game Science will soon be available for members to stream when it comes to the cloud at launch on Tuesday, Aug. 20.

Plus, gamers looking to try GeForce NOW can lock in a one- or six-month Priority or Ultimate membership at half price with the limited-time summer sale.
plus gamers looking to try geforce now can lock in a one or sixmonth priority or ultimate membership at half price with the limitedtime summer sale

In [45]:
print(label_mapping[pipe(tech)[0]['label']])

Tech


In [48]:
entertainment = clean_text("/content/entertainment.txt")

Actor Akshay Kumar has had a series of flops off-late, with films like Mission Raniganj, Bade Miyan Chote Miyan and Sarfira failing to strike gold at the box office. While he is going through a rough patch, his spirit to entertain doesn’t seem to fade. Akshay, who is gearing up to once again showcase his comedy chops in Khel Khel Mein which releases on August 15, is also celebrated as an action star in Bollywood. During a recent interview, Akshay revealed how the cartoon Tom and Jerry has given him major action references.
actor akshay kumar has had a series of flops offlate with films like mission raniganj bade miyan chote miyan and sarfira failing to strike gold at the box office while he is going through a rough patch his spirit to entertain doesn’t seem to fade akshay who is gearing up to once again showcase his comedy chops in khel khel mein which releases on august  is also celebrated as an action star in bollywood during a recent interview akshay revealed how the cartoon tom and

In [49]:
print(label_mapping[pipe(entertainment)[0]['label']])

Entertainment


In [56]:
poli = clean_text("/content/poli.txt")

The Opposition party on Thursday protested as the Waqf (Amendment) Bill, 2024 was tabled in the Lok Sabha. As the Waqf (Amendment) Bill was tabled by Union Minority Affairs Minister Kiren Rijiju, Congress MP KC Venugopal, who had previously submitted a notice to oppose the Bill, accused the government of infringing on religious freedoms and undermining the federal system, triggering a strong uproar from the NDA MPs.

Congress MP KC Venugopal denounced the bill as "draconian" and an “attack on the Constitution,” adding, “this is a draconian law and a fundamental attack on the Constitution,” Venugopal said.
congress mp kc venugopal denounced the bill as draconian and an “attack on the constitution” adding “this is a draconian law and a fundamental attack on the constitution” venugopal said


In [57]:
print(label_mapping[pipe(poli)[0]['label']])

Politics


In [60]:
sport = clean_text("/content/sport.txt")

So, we're done with another Olympics. Some dreams realised. Some shattered. Some taking another step towards realization.

With Reetika Hooda's elimination from the women's 76kg wrestling, Indian participation came to an end in Paris. A fortnight that began so promisingly with Manu Bhaker's two medals early on had its troughs to deal with, and eventually ended up as the third-most successful campaign ever for India. Paris promised a lot more, but proved that there are no guarantees in sport at any level.

INDIA'S OLYMPIC MEDAL TALLY | INDIA AT PARIS OLYMPICS | LATEST OLYMPIC NEWS |

This is the best and worst of India's Paris 2024, as told by the numbers:

6
The good 6 is that India won six medals at Paris 2024. Comparisons with Tokyo and whether that could've been bettered will always be there, but every Olympic medal is a treasured possession for the athlete and the country. Six more were added to that collection over the last fortnight.
the good  is that india won six medals at pari

In [62]:
print(label_mapping[pipe(sport)[0]['label']])

Sport
