In [37]:
# Install the notebook's dependencies with the explicit %pip line magic, so the
# command does not depend on IPython automagic being enabled (a bare `pip ...`
# line stops working if automagic is off or a variable named `pip` exists).
# TODO(review): pin versions (pkg==x.y.z) once the versions used for the recorded
# outputs are known.
%pip install numpy scikit-learn xgboost




In [38]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

# Restrict 20 Newsgroups to four topics; the order of this list is only the
# request, the label names come back in `newsgroups_train.target_names`.
categories = [
    'sci.electronics',
    'sci.space',
    'comp.graphics',
    'sci.med',
]

# Use the dataset's predefined train/test subsets, shuffled with a fixed seed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Raw documents (X) and their integer class labels (y) for each subset.
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target


In [39]:
# TF-IDF featurizer that drops scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training documents only,
# producing the training feature matrix in the same pass.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# Project the test documents with the already-fitted vectorizer (no refit),
# so no information from the test subset leaks into the features.
X_test_tfidf = vectorizer.transform(raw_documents=X_test)


In [40]:
# Initialize the XGBoost classifier.
# - `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
#   parameter that recent releases no longer use (they only warn about it), and
#   `y_train` already holds the integer class labels returned by the dataset loader.
# - `eval_metric='mlogloss'` selects multiclass log loss as the evaluation metric.
# - `random_state=42` matches the seed used when loading the data, so that
#   retraining on a fresh kernel is repeatable.
xgb_clf = xgb.XGBClassifier(eval_metric='mlogloss', random_state=42)

# Train the classifier on the TF-IDF features of the training subset.
xgb_clf.fit(X_train_tfidf, y_train)


In [41]:
# Predict a class label for every document in the held-out test subset.
y_pred = xgb_clf.predict(X_test_tfidf)

# Print per-class precision / recall / F1 and the overall averages.
# Classes are shown by their integer label.
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.82      0.86      0.84       389
           1       0.79      0.83      0.81       393
           2       0.89      0.85      0.87       396
           3       0.94      0.90      0.92       394

    accuracy                           0.86      1572
   macro avg       0.86      0.86      0.86      1572
weighted avg       0.86      0.86      0.86      1572



In [42]:
# Hand-written example sentences, one aimed at each of the four categories.
# "electronics" was previously misspelled as "electronis"; the misspelled word
# was presumably missing from the fitted vocabulary and so added nothing to the
# feature vector for that sentence.
new_texts = [
    "The computer graphics industry is growing rapidly.",
    "ISRO does space missions.",
    "Medical research has made significant advancements in healthcare.",
    "Intel is a known figure in electronics Industry.",
]

# Featurize with the vectorizer fitted on the training data (transform only, no refit).
new_texts_tfidf = vectorizer.transform(new_texts)

# Predict an integer class label for each sentence.
predictions = xgb_clf.predict(new_texts_tfidf)

# Map each integer label back to its newsgroup name and show the result.
predicted_categories = [newsgroups_train.target_names[pred] for pred in predictions]
print(predicted_categories)


['comp.graphics', 'sci.space', 'sci.med', 'sci.electronics']


In [44]:
# Prompt user for input.
# NOTE: input() blocks until someone types a line, so this cell stalls an
# unattended "Restart & Run All"; it is meant to be run interactively.
user_input = input("Enter text for classification: ").strip()

if not user_input:
    # Blank input has no tokens to featurize, yet the model would still return
    # some class for it; report the situation instead of printing a
    # meaningless category.
    print("No text entered; nothing to classify.")
else:
    # Featurize the single document with the fitted vectorizer and classify it.
    user_input_tfidf = vectorizer.transform([user_input])
    prediction = xgb_clf.predict(user_input_tfidf)

    # Output the predicted category (integer label mapped to its newsgroup name).
    predicted_category = newsgroups_train.target_names[prediction[0]]
    print("Predicted category:", predicted_category)


Enter text for classification: I have some Computer hardware
Predicted category: sci.electronics
