In [1]:
# Librarii

# Citire dataset
import pandas as pd
# Verificare path
import os

# import numpy as np
import tensorflow as tf

# Vectorizare text
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the dataset.
# Fail fast with a clear error when the file is missing: only printing a
# message would let execution continue into pd.read_json, which then fails
# with a less helpful error.
file_path = 'News_Category_Dataset_v3.json'
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Eroare: Fisierul {file_path} nu a fost gasit.")

# The file is read in JSON Lines mode (lines=True): one JSON record per line.
data = pd.read_json(file_path, lines=True)

In [None]:
# Sanity check of the load step: show the first three rows of the raw frame.
first_rows = data.head(3)
print(first_rows, "\n\n")


In [4]:
# Dataset preparation.
#
# The model input is a single 'text' column: the headline followed by the
# short description, separated by one space. Only 'text' and 'category' are
# kept, and rows containing a null in either of them are dropped.
data = (
    data
    .assign(text=data['headline'] + " " + data['short_description'])
    .loc[:, ['text', 'category']]
    .dropna()
)


In [None]:
# Quick look at the prepared dataset (first six rows).
print(data.head(6), "\n\n")

# Bar chart with the number of rows per category (categories on the y axis).
figure_settings = {'figure.figsize': (11.7, 8.27)}
sns.set_theme(style="darkgrid", rc=figure_settings)
ax = sns.countplot(data=data, y="category")

In [6]:
# Text vectorization: TF-IDF with English stop words removed and the
# vocabulary capped at 10,000 terms.
# NOTE(review): the vectorizer is fitted on the whole dataset before the
# train/test split, so its statistics also see the test rows. It is left
# unchanged here because refitting on the training rows only could change the
# feature columns, which presumably must match the ones the saved model
# ('Proiect_TIA.keras') was trained on - confirm before changing.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X = vectorizer.fit_transform(data['text'])

# Encode the category names as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['category'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the labels.
# The width is pinned to the number of classes known to the encoder. Without
# num_classes, to_categorical infers the width from the largest label present
# in each array, so the train and test matrices could end up with different
# shapes if the highest class id is missing from one of the splits.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

In [None]:
# Model definition: three fully connected layers. Only the first hidden layer
# carries an L2 kernel penalty.
# NOTE: a later cell rebinds `model` to the network loaded from
# 'Proiect_TIA.keras', so this freshly built model is only used when that
# load is skipped.
n_features = X_train.shape[1]
n_classes = y_train_categorical.shape[1]

hidden_regularized = Dense(128, activation='relu', kernel_regularizer=l2(0.01), input_shape=(n_features,))
hidden_plain = Dense(64, activation='relu')
class_probabilities = Dense(n_classes, activation='softmax')

model = Sequential([hidden_regularized, hidden_plain, class_probabilities])

# Compile with categorical cross-entropy, matching the one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy', 'precision', 'recall', 'auc'],
)

# Print the layer-by-layer summary.
model.summary()

In [8]:
# Callbacks that react when the validation loss stops improving, to limit
# overfitting.
monitored_quantity = 'val_loss'

# Halve the learning rate after 3 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor=monitored_quantity,
    factor=0.5,
    patience=3,
)

# Stop after 5 epochs without improvement and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor=monitored_quantity,
    patience=5,
    restore_best_weights=True,
)


In [9]:
# Load the previously trained model from disk. This rebinds `model`,
# replacing any model object built earlier in the session.
saved_model_path = 'Proiect_TIA.keras'
model = tf.keras.models.load_model(saved_model_path)

In [None]:
# Training (disabled by default).
#
# Training is skipped unless TRAIN_MODEL is set to True; the notebook loads an
# already trained model from 'Proiect_TIA.keras' instead. An explicit flag is
# used rather than wrapping the code in a triple-quoted string: such a string
# is not a comment, it is evaluated as an expression and echoed as the cell's
# output, and the code inside it is invisible to linters.
#
# `history` is only defined when training actually runs.
TRAIN_MODEL = False

if TRAIN_MODEL:
    history = model.fit(
        X_train,
        y_train_categorical,
        epochs=100,
        batch_size=80,
        validation_split=0.2,  # last 20% of the training rows used for validation
        callbacks=[early_stopping, reduce_lr]
    )

In [None]:
# Evaluate on the held-out test split.
# The result is unpacked into exactly five values (the loss plus four
# metrics), so the model in use is expected to report those five; the
# unpacking fails otherwise.
test_scores = model.evaluate(X_test, y_test_categorical)
loss, accuracy, precision, recall, auc = test_scores
print(f"Acuratete pe setul de testare: {accuracy:.2f}")

In [12]:
# score = model.evaluate(X_test, y_test_categorical, verbose=0)
# print("Test loss:", score[0])
# print("Test accuracy:", score[1])

In [None]:
# Try the model on a few hand-written sample texts.
text_test = [
    "The stock market is down due to the pandemic.",
    "A building caught on fire in the city center.",
    "The government unveils a new economic policy.",
    "A famous actor wins an award at the international film festival."
]

# Reuse the already fitted vectorizer (transform only, no refit).
text_vectorized = vectorizer.transform(text_test)

# One row of class scores per input text.
prediction = model.predict(text_vectorized)

# Take the highest-scoring class index of every row and map all of them back
# to category names in a single call.
predicted_labels = label_encoder.inverse_transform(prediction.argmax(axis=1))
for sample, label in zip(text_test, predicted_labels):
    print(f"Text: {sample}")
    print(f"Predictie categorie: {label}\n")


In [32]:

# Classify a single text typed in by the user.

# input() already returns a str, so no conversion is needed.
text_test = input()

if not text_test.strip():
    # Blank input contains no terms to score, so a predicted category for it
    # would not be meaningful; report it instead of classifying.
    print("Eroare: Nu a fost introdus niciun text.")
else:
    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_vectorized = vectorizer.transform([text_test])
    prediction = model.predict(text_vectorized)

    # Single input -> single row of class scores; pick its best class and map
    # the index back to the category name.
    predict_category = label_encoder.inverse_transform([prediction[0].argmax()])[0]
    print(f"Text: {text_test}")
    print(f"Predictie categorie: {predict_category}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
Text: Hot news all students are passing the examn 
Predictie categorie: POLITICS



In [None]:
# Training curves: accuracy, loss, precision and AUC, each compared with its
# validation counterpart, on a 2x2 grid.
#
# `history` is the object returned by model.fit. When no training was run in
# this session there is nothing to plot, so the cell is skipped instead of
# failing with a NameError. A guard is used rather than a triple-quoted
# string, which would be evaluated and echoed as the cell's output.
if 'history' in globals():
    # (metric key in history.history, training label, validation label, y-axis label)
    panels = [
        ('accuracy', 'Accuracy', 'Accuracy pe setul de validare', 'Accuracy'),
        ('loss', 'Loss', 'Loss pe setul de validare', 'Loss'),
        ('precision', 'Precision', 'Precizia pe setul de validare', 'Precizie'),
        ('auc', 'AUC', 'AUC pe setul de validare', 'AUC (Area Under Curve)'),
    ]

    plt.figure(figsize=(12, 8))

    for position, (metric, train_label, val_label, y_label) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(history.history[metric], label=train_label)
        # Validation series are stored under the same key prefixed with 'val_'.
        plt.plot(history.history[f'val_{metric}'], label=val_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [16]:
## Salvare model

# model.save('Proiect_TIA.keras')