In [None]:
# Import Library
import pandas as pd
import re
import joblib

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 1. Load Dataset JSON
file_path = '/content/drive/MyDrive/Data Set/News_Category_Dataset_v3.json'
data = pd.read_json(file_path, lines=True)

print(data.head())

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y

In [None]:
# Gabungkan kategori yang mirip
data['category'] = data['category'].replace({
    'STYLE': 'STYLE & BEAUTY',
    'ARTS': 'CULTURE & ARTS',
    'WORLDPOST': 'WORLD NEWS',
    'PARENTS': 'PARENTING',
    'HEALTHY LIVING': 'WELLNESS',
    'TASTE': 'FOOD & DRINK'
})


In [None]:
print(data['category'].unique())

['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS' 'COLLEGE' 'ARTS & CULTURE' 'GREEN' 'THE WORLDPOST' 'GOOD NEWS'
 'FIFTY' 'DIVORCE']


In [None]:
# Tentukan kategori yang ingin di-undersample
category_to_undersample = ['POLITICS', 'WELLNESS','ENTERTAINMENT','PARENTING','STYLE & BEAUTY','TRAVEL','FOOD & DRINK','QUEER VOICES','BUSINESS','WORLD NEWS','COMEDY','SPORTS','BLACK VOICES','HOME & LIVING']

# Tentukan jumlah data yang diinginkan untuk kategori mayoritas
# Misalnya kita ingin menyeimbangkan POLITICS dengan kategori dengan jumlah data paling sedikit
min_class_size = 3000

# Undersampling kategori yang ada dalam daftar category_to_undersample
# Menggunakan isin untuk memilih data yang sesuai dengan kategori yang diinginkan
undersampled_data = data[data['category'].isin(category_to_undersample)]

# Menentukan jumlah data yang ingin diambil dari kategori tersebut
undersampled_data = undersampled_data.groupby('category').apply(lambda x: x.sample(n=min_class_size, random_state=42)).reset_index(drop=True)

# Data lain yang tidak diubah
other_data = data[~data['category'].isin(category_to_undersample)]

# Gabungkan data yang sudah diundersample dengan kategori lainnya
data_balanced = pd.concat([undersampled_data, other_data])

# Lihat distribusi kategori setelah undersampling
print("\nDistribusi Kategori Setelah Undersampling:")
print(data_balanced['category'].value_counts())



Distribusi Kategori Setelah Undersampling:
category
THE WORLDPOST     3664
WEDDINGS          3653
WOMEN             3572
CRIME             3562
IMPACT            3484
DIVORCE           3426
TRAVEL            3000
BUSINESS          3000
WORLD NEWS        3000
WELLNESS          3000
BLACK VOICES      3000
STYLE & BEAUTY    3000
POLITICS          3000
COMEDY            3000
ENTERTAINMENT     3000
FOOD & DRINK      3000
SPORTS            3000
PARENTING         3000
HOME & LIVING     3000
QUEER VOICES      3000
MEDIA             2944
WEIRD NEWS        2777
GREEN             2622
CULTURE & ARTS    2583
RELIGION          2577
SCIENCE           2206
TECH              2104
MONEY             1756
ENVIRONMENT       1444
FIFTY             1401
GOOD NEWS         1398
U.S. NEWS         1377
ARTS & CULTURE    1339
COLLEGE           1144
LATINO VOICES     1130
EDUCATION         1014
Name: count, dtype: int64


  undersampled_data = undersampled_data.groupby('category').apply(lambda x: x.sample(n=min_class_size, random_state=42)).reset_index(drop=True)


In [None]:
# 2. Ambil Kolom yang Relevan
# Kolom 'category' sebagai target dan 'headline' + 'short_description' sebagai fitur
data['text'] = data['headline'].fillna('') + ' ' + data['short_description'].fillna('')

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 3. Prapemrosesan Teks

nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))  # Ubah ke bahasa yang sesuai
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

data['clean_text'] = data['text'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# 4. Split Data
X = data['clean_text']  # Fitur
y = data['category']  # Target

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
# Pisahkan data latih dan uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mengubah teks menjadi fitur menggunakan TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
model = LogisticRegression(class_weight='balanced', max_iter=500)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5828998234143082
Classification Report:
                precision    recall  f1-score   support

ARTS & CULTURE       0.21      0.40      0.27       275
  BLACK VOICES       0.41      0.45      0.43       889
      BUSINESS       0.45      0.45      0.45      1216
       COLLEGE       0.29      0.55      0.38       202
        COMEDY       0.46      0.49      0.47      1022
         CRIME       0.48      0.62      0.54       713
CULTURE & ARTS       0.34      0.50      0.40       495
       DIVORCE       0.71      0.74      0.73       664
     EDUCATION       0.28      0.56      0.37       209
 ENTERTAINMENT       0.73      0.53      0.61      3419
   ENVIRONMENT       0.29      0.42      0.34       313
         FIFTY       0.15      0.38      0.21       263
  FOOD & DRINK       0.73      0.78      0.75      1697
     GOOD NEWS       0.17      0.39      0.24       270
         GREEN       0.31      0.47      0.37       532
 HOME & LIVING       0.66      0.75      0.70      

In [None]:
from sklearn.metrics import f1_score

print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))

F1-Score: 0.6033445026020556


In [None]:
import pickle

with open('model_kategori_berita_pickle.pkl', 'wb') as model_file, open('vectorizer_kategori_berita_pickle.pkl', 'wb') as vectorizer_file:
    pickle.dump(model, model_file)
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
def clean_text(text):
    text = text.lower()  # Mengubah ke lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Menghapus karakter non-alfabet
    return text

def predict_category(input_text, model, vectorizer):
    # Bersihkan teks input
    cleaned_input = clean_text(input_text)

    # Transformasi ke fitur TF-IDF
    input_tfidf = vectorizer.transform([cleaned_input])

    # Lakukan prediksi
    predicted_category = model.predict(input_tfidf)

    return predicted_category[0]  # Mengembalikan kategori yang diprediksi

In [None]:
# Menerima input teks dari pengguna
input_text = input("Enter the article headline: ")

# Memprediksi kategori artikel
predicted_category = predict_category(input_text, model, vectorizer)

# Menampilkan hasil
print(f"Predicted Category: {predicted_category}")

Enter the article headline: health tips for a better lifestyle in the pandemic
Predicted Category: WELLNESS
