In [None]:
pip install keras-tuner --upgrade


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras_tuner.tuners import Hyperband
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, Conv1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer

# Read the tab-separated news file and replace missing article bodies with empty strings
df = pd.read_csv(
    "/kaggle/input/japanese-newspapers-20052021/japanese_news.csv",
    sep='\t',
)
df = df.assign(text=df['text'].fillna(''))



In [None]:
# Quick structural overview of the loaded frame
print(df.head())
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(df.columns)

# Column whose class balance is plotted; it was referenced below without ever being defined
source_column = 'source'
percentages = df[source_column].value_counts(normalize=True) * 100

# Share of articles per source, in percent
plt.figure(figsize=(10, 6))
percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage distribution for {}'.format(source_column))
plt.xlabel(source_column)
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Absolute number of articles per source
source_counts = df['source'].value_counts()

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn >= 0.13; map hue to the
# same variable and hide the redundant legend, as the per-year plot already does.
sns.barplot(x=source_counts.index, y=source_counts.values, hue=source_counts.index, palette="viridis", legend=False)
plt.title('Source distribution')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.tight_layout()  # keep the rotated tick labels inside the figure
plt.show()


# Coerce the date column to datetime; values that cannot be parsed become NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Show the rows whose date could not be parsed before they are discarded
invalid_dates = df.loc[df['date'].isna()]
print("Rows with invalid dates:")
print(invalid_dates)

# Keep only the rows with a usable date and derive the year from it
df = df.dropna(subset=['date'])
df['year'] = df['date'].dt.year

# One bar chart per year showing how many articles each source contributed
for year, year_data in df.groupby('year'):
    source_counts = year_data['source'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.barplot(x=source_counts.index, y=source_counts.values, hue=source_counts.index, palette="viridis", legend=False)
    plt.title(f'Source distribution - Year {year}')
    plt.xlabel('Source')
    plt.ylabel('Number of articles')
    plt.xticks(rotation=45)
    # tight_layout must run before show(): afterwards it no longer affects the
    # rendered figure and would implicitly create a new, empty one.
    plt.tight_layout()
    plt.show()

# df['year'] has already been derived from the parsed date column earlier in this cell,
# so it is not recomputed here.
articles_per_year = df['year'].value_counts().sort_index()

plt.figure(figsize=(10,6))
articles_per_year.plot(kind='line', marker='o')
plt.title('Number of articles per year')
plt.xlabel('Year')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.grid(True)
# Adjust the layout before rendering; calling it after show() has no effect on the shown figure
plt.tight_layout()
plt.show()

# Number of articles per source, ordered from most to least frequent (value_counts order)
articles_per_source = df['source'].value_counts()

plt.figure(figsize=(10,6))
articles_per_source.plot(kind='line', marker='o')
plt.title('Number of articles per source')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
# Adjust the layout before rendering; calling it after show() has no effect on the shown figure
plt.tight_layout()
plt.show()

# Year x source table of article counts (missing combinations filled with 0)
articles_per_year_and_source = df.groupby(['year', 'source']).size().unstack(fill_value=0)

# DataFrame.plot opens its own figure unless an Axes is supplied, which left the
# 18x12 figure empty and drew the lines at the default size. Draw on an explicit Axes.
fig, ax = plt.subplots(figsize=(18, 12))
articles_per_year_and_source.plot(kind='line', marker='o', ax=ax)
ax.set_title('Number of articles per year per source')
ax.set_xlabel('Year')
ax.set_ylabel('Number of articles')
ax.set_xticks(articles_per_year_and_source.index)  # one tick per year present in the data
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
ax.legend(title='Source', bbox_to_anchor=(1, 1))
fig.tight_layout()
plt.show()



In [None]:
desired_sample_size = 250000
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling
# without replacement), so cap the request at the rows that are actually available.
sample_size = min(desired_sample_size, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Features are the raw article texts, the target is the publishing source
X = df_sample['text']
y = df_sample['source']

num_unique_sources = df_sample['source'].nunique()
print("Numero di valori unici nella colonna 'source':", num_unique_sources)

# Map source names to integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratified 80/20 split so both parts keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
 
# Fixed sequence length (in tokens) of every model input.
# Adjust according to the distribution of text lengths.
max_len = 115

# NOTE(review): this checkpoint's configuration reportedly names BertJapaneseTokenizer
# (MeCab + WordPiece) as its tokenizer class; loading it through BertTokenizer may
# tokenize Japanese text differently from what the vocabulary expects. Confirm.
tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

# Pad/truncate every text to exactly max_len so train and test always have the same
# width, and return NumPy arrays so pad_sequences does not iterate over TF tensors.
X_train_tok = tokenizer(X_train.tolist(), padding="max_length", truncation=True, max_length=max_len, return_tensors="np")
X_test_tok = tokenizer(X_test.tolist(), padding="max_length", truncation=True, max_length=max_len, return_tensors="np")

# The ids already have width max_len; pad_sequences is kept so the model inputs
# remain int32 arrays, as before.
X_train_pad = pad_sequences(X_train_tok["input_ids"], maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_tok["input_ids"], maxlen=max_len, padding='post')

In [None]:

def build_model(hp):
    """Build and compile the Embedding -> Conv1D -> BiLSTM text classifier.

    Args:
        hp: Hyperparameter container passed in by the KerasTuner tuner. It is
            currently unused: every layer size, dropout rate and the learning
            rate below is hard-coded, so the tuner has no search space.

    Returns:
        A compiled ``Sequential`` model with one softmax output per class in
        ``label_encoder.classes_``, trained with sparse categorical
        cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # One embedding row per entry of the tokenizer vocabulary
    model.add(Embedding(input_dim=len(tokenizer.vocab), output_dim=200))
    # Sequential.add() takes a layer instance; the convolution arguments were
    # previously passed bare, which raised a TypeError.
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.4)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))

    # Labels are integer class ids, hence the sparse loss
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
    return model
# Presumably the best hyperparameters reported by an earlier search run (they match
# the values hard-coded in build_model):
# {'embedding_output_dim': 200, 'conv1d_filters': 128, 'lstm_units': 128, 'lstm_dropout': 0.30000000000000004, 'recurrent_dropout': 0.4, 'dense_units': 64, 'dropout': 0.2, 'learning_rate': 0.001}

# Define the Hyperband tuner
tuner = Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=15,
    factor=3,
    directory='my_dir',
    project_name='japanese_news'
)

# Hyperband assigns the number of epochs of every trial itself (bounded by max_epochs),
# so an explicit `epochs` argument would be overridden and is not passed.
# Model selection is validated on a slice of the training data; scoring trials on the
# test set would leak it into the choice of model and bias the final evaluation.
tuner.search(X_train_pad, y_train, validation_split=0.1)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best Hyperparameters:")
print(best_hps.values)

# Rebuild the winning configuration and train it from scratch; the test set is passed
# as validation data only to monitor the curves, no decision is taken from it here.
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train_pad, y_train, epochs=30, batch_size=128, validation_data=(X_test_pad, y_test))




In [None]:
# Overall loss/accuracy on the held-out test set
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)

# Sequential.predict_classes() was removed in TensorFlow 2.6; take the arg-max over
# the softmax outputs to obtain the predicted class id of every sample instead.
y_pred = np.argmax(model.predict(X_test_pad), axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("Confusion Matrix:")
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

In [None]:
# Render the confusion matrix as an annotated heatmap labelled with the source names
fig, ax = plt.subplots(figsize=(10, 8))
class_names = label_encoder.classes_
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()