In [None]:
pip install keras-tuner --upgrade

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from keras_tuner.tuners import BayesianOptimization
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer

In [None]:
# Read the tab-separated news dump, then replace missing 'text' values with
# empty strings.
df = pd.read_csv(
    "/kaggle/input/japanese-newspapers-20052021/japanese_news.csv",
    sep='\t',
)
df = df.assign(text=df['text'].fillna(''))




In [None]:
# Quick look at the loaded frame: first rows, dtype / non-null summary, column names.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
df.info()
print(df.columns)

# Share of rows per value of the 'source' column, in percent.
# `source_column` was referenced by the title/xlabel but never defined.
source_column = 'source'
percentages = df[source_column].value_counts(normalize=True) * 100

plt.figure(figsize=(10, 6))
percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage distribution for {}'.format(source_column))
plt.xlabel(source_column)
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Absolute number of rows per value of 'source'.
source_counts = df['source'].value_counts()

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn; colour by the
# category itself and hide the redundant legend, as the per-year plots do.
sns.barplot(
    x=source_counts.index,
    y=source_counts.values,
    hue=source_counts.index,
    palette="viridis",
    legend=False,
)
plt.title('Source distribution')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.show()


# Parse the 'date' column; values that cannot be parsed are coerced to NaT.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Show the rows whose date could not be parsed before they are discarded.
invalid_dates = df[df['date'].isnull()]

print("Rows with invalid dates:")
print(invalid_dates)

# Take an explicit copy after filtering: adding a column to the frame returned
# by dropna() can otherwise raise SettingWithCopyWarning.
df = df.dropna(subset=['date']).copy()

df['year'] = df['date'].dt.year

# One bar chart per year showing how many rows each source contributed.
for year, year_data in df.groupby('year'):
    source_counts = year_data['source'].value_counts()

    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x=source_counts.index, y=source_counts.values, hue=source_counts.index, palette="viridis", legend=False)
    plt.title(f'Source distribution - Year {year}')
    plt.xlabel('Source')
    plt.ylabel('Number of articles')
    plt.xticks(rotation=45)
    # Layout must be adjusted BEFORE the figure is rendered; calling it after
    # plt.show() has no effect on the displayed plot.
    plt.tight_layout()
    plt.show()
    # Release the figure so a long run of years does not accumulate open figures.
    plt.close(fig)

# Number of rows per year, in chronological order. The 'year' column is already
# derived from 'date' when the dates are parsed, so it is not recomputed here.
articles_per_year = df['year'].value_counts().sort_index()

plt.figure(figsize=(10,6))
articles_per_year.plot(kind='line', marker='o')
plt.title('Number of articles per year')
plt.xlabel('Year')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.grid(True)
# tight_layout() has to run before show(); afterwards it no longer affects the
# rendered figure.
plt.tight_layout()
plt.show()

# Number of rows per source, ordered from most to least frequent
# (value_counts() sorts descending by default).
articles_per_source = df['source'].value_counts()

plt.figure(figsize=(10,6))
articles_per_source.plot(kind='line', marker='o')
plt.title('Number of articles per source')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
# tight_layout() has to run before show(); afterwards it no longer affects the
# rendered figure.
plt.tight_layout()
plt.show()

# Year x source table of row counts; combinations that never occur become 0.
articles_per_year_and_source = df.groupby(['year', 'source']).size().unstack(fill_value=0)

# DataFrame.plot() opens its own figure unless an Axes is supplied, which would
# leave a separately created plt.figure(figsize=...) empty and its size unused.
# Create the Axes explicitly and draw into it.
fig, ax = plt.subplots(figsize=(18, 12))
articles_per_year_and_source.plot(kind='line', marker='o', ax=ax)
plt.title('Number of articles per year per source')
plt.xlabel('Year')
plt.ylabel('Number of articles')
# One tick per year present in the table.
plt.xticks(articles_per_year_and_source.index, rotation=45)
plt.grid(True)
plt.legend(title='Source', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()



In [None]:
# Work on a fixed-size random subset of the cleaned frame.
desired_sample_size = 51000
# DataFrame.sample() raises ValueError when n exceeds the number of rows, so
# cap the request at what is actually available after date cleaning.
df_sample = df.sample(n=min(desired_sample_size, len(df)), random_state=42)

# Features are the raw article text, the target is the 'source' column.
X = df_sample['text']
y = df_sample['source']

num_unique_sources = df_sample['source'].nunique()
print("Number of output classes 'source':", num_unique_sources)

# Encode the source names as integer class ids (label_encoder.classes_ keeps
# the id -> name mapping used later for reports).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 80/20 split, stratified on the encoded label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Fixed sequence length used for both tokenization and padding. Defined once,
# up front, instead of repeating the literal in every call.
max_len = 115

# NOTE(review): the cl-tohoku/bert-base-japanese checkpoint is published for
# BertJapaneseTokenizer (MeCab + WordPiece). Loading it through BertTokenizer
# presumably segments Japanese text differently — confirm this is intended.
tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

# padding="max_length" gives every batch the same width; padding=True would pad
# only to the longest sequence of each call, so train and test could differ.
X_train_tok = tokenizer(X_train.tolist(), padding="max_length", truncation=True, max_length=max_len, return_tensors="tf")
X_test_tok = tokenizer(X_test.tolist(), padding="max_length", truncation=True, max_length=max_len, return_tensors="tf")

# Convert the token-id tensors to arrays of width max_len (post-padded).
X_train_pad = pad_sequences(X_train_tok["input_ids"], maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_tok["input_ids"], maxlen=max_len, padding='post')


In [None]:

# "Balanced" per-class weights computed from the label distribution of y_train.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=train_classes, y=y_train)

# Position in the weight array -> weight.
class_weights_dict = {position: weight for position, weight in enumerate(class_weights)}
print("Class weights:", class_weights_dict)

# Same information as a table, one row per class id.
class_weights_df = pd.DataFrame({'Class': train_classes, 'Weight': class_weights})
print(class_weights_df)

In [None]:
# Balance the training classes by duplicating existing minority-class rows.
# SMOTE is not suitable here: it synthesises samples by interpolating between
# feature vectors, and the features are integer token ids, so the interpolated
# ids would point at unrelated vocabulary entries.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_pad, y_train)

In [None]:
def build_model(hp):
    """Build and compile one candidate text classifier for the tuner.

    The network is Embedding -> Conv1D -> bidirectional LSTM -> Dense ->
    Dropout -> softmax Dense, with layer sizes, dropout rates and the Adam
    learning rate drawn from ``hp``.

    Args:
        hp: hyperparameter object providing ``Int``, ``Float`` and ``Choice``.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Hyperparameters are requested in the same order as the layers that use them.
    embedding_dim = hp.Int('embedding_output_dim', min_value=100, max_value=250, step=50)
    conv_filters = hp.Int('conv1d_filters', min_value=64, max_value=256, step=64)
    lstm_units = hp.Int('lstm_units', min_value=32, max_value=128, step=32)
    lstm_dropout = hp.Float('lstm_dropout', min_value=0.1, max_value=0.5, step=0.1)
    lstm_recurrent_dropout = hp.Float('recurrent_dropout', min_value=0.1, max_value=0.5, step=0.1)
    dense_units = hp.Int('dense_units', min_value=32, max_value=128, step=32)
    dense_dropout = hp.Float('dropout', min_value=0.1, max_value=0.5, step=0.1)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    network = Sequential([
        # Vocabulary size comes from the tokenizer in the enclosing scope.
        Embedding(input_dim=len(tokenizer), output_dim=embedding_dim),
        Conv1D(conv_filters, 5, activation='relu'),
        Bidirectional(LSTM(lstm_units, dropout=lstm_dropout, recurrent_dropout=lstm_recurrent_dropout)),
        Dense(dense_units, activation='relu'),
        Dropout(dense_dropout),
        # One output unit per encoded class.
        Dense(len(label_encoder.classes_), activation='softmax'),
    ])

    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return network
# Bayesian hyperparameter search over the space declared in build_model,
# selecting on validation accuracy.
tuner = BayesianOptimization(
    build_model,
    objective='val_accuracy',
    max_trials=8,
    directory='my_dir',
    project_name='japanese_news'
)

# The training data passed here has already been rebalanced by resampling, so
# no class_weight is applied: weighting on top of resampling would correct for
# the imbalance twice, and the final fit below does not use class weights
# either — search and final training must optimise the same objective.
# NOTE(review): the test split is also used as validation data for the search,
# so the reported test accuracy is optimistic — consider a separate validation split.
tuner.search(X_train_resampled, y_train_resampled, epochs=5, validation_data=(X_test_pad, y_test))

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:")
print(best_hps.values)


# Rebuild a fresh model with the winning hyperparameters and train it longer.
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train_resampled, y_train_resampled, epochs=30, batch_size=32, validation_data=(X_test_pad, y_test))


loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)



In [None]:
# Evaluate the trained model on the test split.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)

# Predict on the test set. Sequential.predict_classes() was removed from Keras
# (TensorFlow >= 2.6); the predicted class is the argmax of the softmax output.
y_pred = np.argmax(model.predict(X_test_pad), axis=1)

# Per-class precision, recall and F1-score. `labels` pins the report to every
# encoded class so it always lines up with target_names.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))



In [None]:
# Confusion matrix of the test-set predictions, printed as raw counts first.
conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_mat)

# Heatmap of the same counts, with the original class names on both axes.
class_names = label_encoder.classes_
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()