## Importing required packages

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.corpus import stopwords

In [2]:
# use this to download required packages to nltk library

import nltk
nltk.download('omw-1.4')
nltk.download('names')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Function to check whelther word is a alphabet

In [3]:
def is_letter_only(word):
    return word.isalpha()

In [4]:
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

## Function to clean text

In [5]:
def cleaned_text(docs):
    docs_cleaned = list()
    for doc in docs:
        doc = doc.lower()
        """"""
        doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split()
                            if is_letter_only(word) and word not in all_names and word not in stop_words)
        docs_cleaned.append(doc_cleaned)
    return docs_cleaned

## Declaring categories to classify and fetching dataset for training and testing

In [6]:
categories = ['comp.graphics','sci.med']

data_train = fetch_20newsgroups(subset='train', categories= categories, random_state=42)
data_test = fetch_20newsgroups(subset='test', categories=categories, random_state=42)

## Separating data and labels

In [7]:
cleaned_train = cleaned_text(data_train.data)
label_train = data_train.target

cleaned_test = cleaned_text(data_test.data)
label_test = data_test.target

# Training and Testing Model



### Converting textual features into vectors

In [8]:
tfidf_vectorizer= TfidfVectorizer(stop_words='english',max_features=None)

term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

## Using SVM

In [9]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', random_state=42)
svm.fit(term_docs_train, label_train)
accuracy = svm.score(term_docs_test, label_test)
print("The accuracy of binary classification is : {0:.1f}%".format(accuracy*100))

The accuracy of binary classification is : 95.4%


In [11]:
from sklearn.metrics import classification_report
prediction = svm.predict(term_docs_test)
report_svm = classification_report(label_test, prediction)
print(report_svm)

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       389
           1       0.95      0.96      0.95       396

    accuracy                           0.95       785
   macro avg       0.95      0.95      0.95       785
weighted avg       0.95      0.95      0.95       785



## Using Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(term_docs_train, label_train)
accuracy = lr.score(term_docs_test, label_test)
print("The accuracy of binary classification is : {0:.1f}%".format(accuracy*100))

The accuracy of binary classification is : 95.0%


In [13]:
from sklearn.metrics import classification_report
prediction = lr.predict(term_docs_test)
report_lr = classification_report(label_test, prediction)
print(report_lr)

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       389
           1       0.94      0.96      0.95       396

    accuracy                           0.95       785
   macro avg       0.95      0.95      0.95       785
weighted avg       0.95      0.95      0.95       785



## Using Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(term_docs_train, label_train)
accuracy = rfc.score(term_docs_test, label_test)
print("The accuracy of binary classification is : {0:.1f}%".format(accuracy*100))

The accuracy of binary classification is : 87.3%


In [18]:
from sklearn.metrics import classification_report
prediction = rfc.predict(term_docs_test)
report_rfc = classification_report(label_test, prediction)
print(report_rfc)

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       389
           1       0.96      0.78      0.86       396

    accuracy                           0.87       785
   macro avg       0.89      0.87      0.87       785
weighted avg       0.89      0.87      0.87       785



In [28]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

In [69]:
max_features = 500
sequence_length = 250
embedding_dim = 16

model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.Dense(128),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(32),
  layers.Dense(1)])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 16)          8016      
                                                                 
 dropout_2 (Dropout)         (None, None, 16)          0         
                                                                 
 dense_6 (Dense)             (None, None, 128)         2176      
                                                                 
 global_average_pooling1d_3  (None, 128)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 32)                4128      
                                                      

In [73]:
model.compile(loss=losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy())

In [None]:
print(term_docs_train[0])

In [75]:
model.fit(term_docs_test, label_test, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ec4df60ba30>

In [59]:
loss, accuracy = model.evaluate(term_docs_test, label_test)
print(loss, accuracy, sep="--")

TypeError: ignored