In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


**1. Import necessary libraries**

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


2. Load the dataset

In [3]:
df = pd.read_csv('/content/gdrive/MyDrive/Datasets/BBCNews.csv')
df.head()


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [17]:
df.columns

Index(['ArticleId', 'Text', 'Category', 'cleaned_text'], dtype='object')

In [18]:
df.drop('ArticleId',axis=1,inplace=True)

**3. Preprocessing of textual data**

In [6]:
# preprocessing function to remove stop words, punctuations and getting the root lemma
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in tokens]
    clean_text = ' '.join(tokens)
    return clean_text


In [7]:
# Apply preprocessing to clean text
df['cleaned_text'] = df['Text'].apply(clean_text)

**4. Feature Selection**

In [8]:
# TF-IDF vectorizer to convert text to numerical features
vectorizer = TfidfVectorizer()
# Convert the clean text data to TF-IDF features
X = vectorizer.fit_transform(df['cleaned_text'])
# Target categorical variables
y = df['Category']

**5. Split train and validation data**

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

**6. Model Developement**

In [10]:
# SVM classifier
svm_classifier = SVC()
# Fit the SVM classifier on train data
svm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_classifier.predict(X_test)


**7. Model Evaluation **

In [11]:
# Evaluate the model on test data
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy of the model is :", accuracy)
#classification report
print(classification_report(y_test, y_pred))

Test Accuracy of the model is : 0.9731543624161074
               precision    recall  f1-score   support

     business       0.93      0.99      0.96        68
entertainment       1.00      0.96      0.98        55
     politics       0.98      0.94      0.96        53
        sport       0.99      1.00      0.99        67
         tech       0.98      0.96      0.97        55

     accuracy                           0.97       298
    macro avg       0.98      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298



**8. User Input to get the classification label**

In [12]:
def classify_input(user_input):
    preprocessed_input = clean_text(user_input)
    tfidf = vectorizer.transform([preprocessed_input])
    predicted = svm_classifier.predict(tfidf)[0]
    return predicted

In [13]:
while True:
    user_input = input("Enter a sentence: ")
    predicted_label = classify_input(user_input)
    print("Predicted Label :", predicted_label)

Enter a sentence: insurance company took a toll of 2 crore
Predicted Label : business


KeyboardInterrupt: ignored