In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'news-category-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F32526%2F4243451%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240917%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240917T080356Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8ddbab6c4ffbec4050026b721de81d16bb133fb187e33e2844218a9b507c33febf34773535c6ad2a7c1db3bc810896a16e853a0d39ae6157ca706361a7d5ac3d6f69f4bd5805ca3ee54144618c632f87612710670b2d1f518545534fa224da6b1e5a4a99f5d45a07fe493e2c471ffdfa984a8b06d7849658ba73a5ab962b3092b8cdbf7691ccc9902402b92829459473b8a7b756c353bf14ac27a019dd0f06395b5c618dfe5a2b19990a3489aaf1c1be3fcb47dafd4e91e511b3803d3c9fc2228007d90f974fb24b7b717a8440cf7867c22269caf85923795022b64171df34c583f1577ef571bbbb32bea090391beacc9c422345b7d994f75f4b3485d48f82b8'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading news-category-dataset, 27829769 bytes compressed
Downloaded and uncompressed: news-category-dataset
Data source import complete.


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_json('/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json', lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Combine 'headline' and 'short_description' into a single feature
df['text'] = df['headline'] + ' ' + df['short_description']

In [5]:
# Select features and target
X = df['text']
y = df['category']

In [6]:
# Convert text data to numerical data using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target labels
y_numeric = label_encoder.fit_transform(y)

In [8]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_numeric, test_size=0.2, random_state=42)

# SVM Classifier

In [9]:
# Initialize and train the Support Vector Machine classifier
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

In [10]:
# Make predictions
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the classifier
print("SVM Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Classifier
Accuracy: 0.6094115401135876
Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.28      0.31       293
           1       0.35      0.21      0.26       275
           2       0.51      0.37      0.43       889
           3       0.50      0.50      0.50      1216
           4       0.38      0.42      0.40       202
           5       0.52      0.42      0.47      1022
           6       0.52      0.59      0.55       713
           7       0.67      0.28      0.39       202
           8       0.80      0.72      0.75       664
           9       0.43      0.36      0.39       209
          10       0.58      0.76      0.66      3419
          11       0.66      0.27      0.38       313
          12       0.51      0.15      0.23       263
          13       0.59      0.78      0.67      1270
          14       0.43      0.19      0.27       270
          15       0.39      0.40      0.39       532
          16 

# Decision Tree Classifier

In [11]:
# Initialize and train the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred_dt = dt_classifier.predict(X_test)

# Evaluate the classifier
print("Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Decision Tree Classifier
Accuracy: 0.4270987448098124
Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.21      0.23       293
           1       0.14      0.08      0.10       275
           2       0.31      0.25      0.28       889
           3       0.30      0.29      0.29      1216
           4       0.26      0.25      0.25       202
           5       0.32      0.28      0.30      1022
           6       0.30      0.29      0.29       713
           7       0.29      0.24      0.27       202
           8       0.68      0.58      0.63       664
           9       0.22      0.18      0.20       209
          10       0.45      0.52      0.48      3419
          11       0.22      0.16      0.18       313
          12       0.11      0.06      0.08       263
          13       0.43      0.47      0.45      1270
          14       0.09      0.07      0.08       270
          15       0.23      0.20      0.22       532
   

# XGBoost

In [12]:
# Initialize and train the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_classifier.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [13]:
# Make predictions
y_pred_xgb = xgb_classifier.predict(X_test)

# Convert numeric predictions back to original labels
y_pred_xgb_labels = label_encoder.inverse_transform(y_pred_xgb)
y_test_labels = label_encoder.inverse_transform(y_test)

# Evaluate the classifier
print("XGBoost Classifier")
print("Accuracy:", accuracy_score(y_test_labels, y_pred_xgb_labels))
print("Classification Report:\n", classification_report(y_test_labels, y_pred_xgb_labels))

XGBoost Classifier
Accuracy: 0.5706342767145516
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.40      0.28      0.33       293
ARTS & CULTURE       0.37      0.23      0.29       275
  BLACK VOICES       0.49      0.36      0.42       889
      BUSINESS       0.51      0.43      0.47      1216
       COLLEGE       0.36      0.42      0.39       202
        COMEDY       0.59      0.37      0.46      1022
         CRIME       0.50      0.48      0.49       713
CULTURE & ARTS       0.54      0.32      0.40       202
       DIVORCE       0.79      0.72      0.75       664
     EDUCATION       0.40      0.35      0.37       209
 ENTERTAINMENT       0.56      0.66      0.61      3419
   ENVIRONMENT       0.53      0.28      0.37       313
         FIFTY       0.49      0.16      0.24       263
  FOOD & DRINK       0.59      0.65      0.62      1270
     GOOD NEWS       0.34      0.14      0.20       270
         GREEN       0.38      

# Random Forest Classifier

In [14]:
# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the classifier
print("Random Forest Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Classifier
Accuracy: 0.5547415644537775
Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.23      0.28       293
           1       0.45      0.04      0.07       275
           2       0.50      0.24      0.33       889
           3       0.46      0.39      0.42      1216
           4       0.42      0.32      0.36       202
           5       0.49      0.31      0.38      1022
           6       0.47      0.47      0.47       713
           7       0.81      0.19      0.31       202
           8       0.84      0.64      0.72       664
           9       0.47      0.21      0.29       209
          10       0.50      0.73      0.59      3419
          11       0.91      0.12      0.22       313
          12       0.33      0.03      0.06       263
          13       0.54      0.70      0.61      1270
          14       0.30      0.04      0.07       270
          15       0.40      0.22      0.29       532
   