In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
# Read the question-classification CSV into a frame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/Question_Classification_Dataset.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Questions,Category0,Category1,Category2
0,0,How did serfdom develop in and then leave Russ...,DESCRIPTION,DESC,manner
1,1,What films featured the character Popeye Doyle ?,ENTITY,ENTY,cremat
2,2,How can I find a list of celebrities ' real na...,DESCRIPTION,DESC,manner
3,3,What fowl grabs the spotlight after the Chines...,ENTITY,ENTY,animal
4,4,What is the full form of .com ?,ABBREVIATION,ABBR,exp


In [None]:
# Distinct coarse category names, in order of first appearance.
pd.unique(df["Category0"])

array(['DESCRIPTION', 'ENTITY', 'ABBREVIATION', 'HUMAN', 'NUMERIC',
       'LOCATION'], dtype=object)

In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Questions', 'Category0', 'Category1', 'Category2'], dtype='object')

In [None]:
# Fit the encoder on the coarse category names, then store the resulting
# integer ids next to the original labels.
le = LabelEncoder().fit(df['Category0'])
df['encoded'] = le.transform(df['Category0'])


In [None]:
# Remove leading/trailing whitespace from every column label.
df.columns = [label.strip() for label in df.columns]


In [None]:
# Keep only the question text and its coarse category, then preview.
df_final = df.loc[:, ['Questions', 'Category0']]
df_final.head(n=5)

Unnamed: 0,Questions,Category0
0,How did serfdom develop in and then leave Russ...,DESCRIPTION
1,What films featured the character Popeye Doyle ?,ENTITY
2,How can I find a list of celebrities ' real na...,DESCRIPTION
3,What fowl grabs the spotlight after the Chines...,ENTITY
4,What is the full form of .com ?,ABBREVIATION


In [None]:
# Hold out 20% of the questions for evaluation.
# - random_state makes the partition identical on every run, so the reported
#   metrics are reproducible.
# - stratify keeps each category's share the same in both partitions; without
#   it the rare ABBREVIATION class can be under-represented in training.
# Features and labels are both taken from df_final so they share one source.
X_train, X_test, y_train, y_test = train_test_split(
    df_final["Questions"],
    df_final["Category0"],
    test_size=0.2,
    random_state=42,
    stratify=df_final["Category0"],
)


In [None]:
# Learn the TF-IDF vocabulary/weights on the training questions only, then
# project both partitions with that fitted vectorizer (the tuple is evaluated
# left to right, so fitting happens before the test transform).
vectorizer = TfidfVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Baseline: multinomial Naive Bayes on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Evaluate on the held-out questions. A category that is never predicted has
# undefined precision; zero_division=0 reports it as 0.0 explicitly instead of
# emitting one UndefinedMetricWarning per averaged metric.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

ABBREVIATION       0.00      0.00      0.00        19
 DESCRIPTION       0.72      0.69      0.71       228
      ENTITY       0.64      0.73      0.68       258
       HUMAN       0.67      0.87      0.75       230
    LOCATION       0.94      0.66      0.78       181
     NUMERIC       0.88      0.78      0.82       175

    accuracy                           0.73      1091
   macro avg       0.64      0.62      0.62      1091
weighted avg       0.74      0.73      0.73      1091



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Hugging Face Transformers**

In [None]:
pip install transformers datasets


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
from datasets import Dataset

# Build the Hugging Face dataset from `df`, which is the frame that carries the
# integer "encoded" labels produced by the LabelEncoder. (The previous
# `df_final_2` name is never defined in this notebook and raised NameError on a
# fresh kernel.)
dataset = Dataset.from_pandas(df[["Questions", "encoded"]])

# Hold out 20% for evaluation; the seed keeps the partition stable across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import AutoTokenizer

# The tokenizer must exist before it is used below; load the one that matches
# the "distilbert-base-uncased" checkpoint fine-tuned in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(example):
    """Tokenize a batch of questions and attach their integer labels.

    Args:
        example: A batch as passed by ``dataset.map(..., batched=True)``;
            its "Questions" and "encoded" columns are read.

    Returns:
        The tokenizer output with an extra "labels" entry copied from
        "encoded", so the label column carries the name "labels".
    """
    tokenized_inputs = tokenizer(example["Questions"], truncation=True, padding="max_length")
    tokenized_inputs["labels"] = example["encoded"]  # expose labels under "labels"
    return tokenized_inputs

tokenized = dataset.map(tokenize, batched=True)

from transformers import AutoModelForSequenceClassification

# Load DistilBERT for sequence classification, with one output label per
# category known to the label encoder.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=le.classes_.size,
)

# Mount Google Drive at /content/drive (Colab-only); the training output
# directory configured in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

from transformers import TrainingArguments, Trainer

# Evaluate, log and checkpoint once per epoch. Evaluation and save strategies
# are kept identical because load_best_model_at_end needs a checkpoint for
# every evaluation it compares.
# NOTE: the keyword is `eval_strategy`; the former `evaluation_strategy`
# spelling was deprecated and is rejected by current transformers releases.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/models/results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    load_best_model_at_end=True,
)

# Wire the model, the training configuration and both tokenized partitions
# into a Trainer, then run fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized["test"],
    train_dataset=tokenized["train"],
)

trainer.train()

In [None]:
def predict_question(question):
    """Classify a single question with the fine-tuned model.

    Args:
        question: The question text to classify (one string).

    Returns:
        The category name obtained by mapping the highest-scoring class id
        back through ``le.inverse_transform``.
    """
    import torch  # local import: torch is not imported at the top of the notebook

    # Inference mode: disable dropout so repeated calls give the same answer.
    model.eval()

    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True)
    # After training the model may live on the GPU while the tokenizer returns
    # CPU tensors; move the inputs to the model's device to avoid a mismatch.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # No gradients are needed for prediction; skipping them saves memory.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Arg-max over the class axis of the (single) row rather than over the
    # flattened tensor, so the result is always a valid class id.
    predicted_class = logits.argmax(dim=-1)[0].item()
    return le.inverse_transform([predicted_class])[0]

# Example: print the predicted category for one free-form question.
print(predict_question("What are the top-selling items last month?"))  # → one of the category names in le.classes_
