In [1]:
!pip install keras-tuner --upgrade

Note: you may need to restart the kernel to use updated packages.


In [2]:
import keras_tuner as kt
from tensorflow import keras
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from transformers import BertJapaneseTokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from kerastuner.tuners import RandomSearch,BayesianOptimization
from transformers import BertTokenizer
from kerastuner.engine.hyperparameters import HyperParameters
import matplotlib.pyplot as plt

2024-02-15 09:02:22.804407: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 09:02:22.804546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 09:02:22.971218: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
df = pd.read_csv("/kaggle/input/japanese-newspapers-20052021/japanese_news.csv", delimiter='\t')
df['text'] = df['text'].fillna('')

In [None]:
print(df.head())
print(df.info())
print(df.columns)

percentages = df['source'].value_counts(normalize=True) * 100

plt.figure(figsize=(10, 6))
percentages.plot(kind='bar', color='skyblue')
plt.title('Percentage distribution for Source')
plt.xlabel('Source')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

source_counts = df['source'].value_counts()

plt.figure(figsize=(10, 6))
sns.barplot(x=source_counts.index, y=source_counts.values, palette="viridis")
plt.title('Source distribution')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.show()


df['date'] = pd.to_datetime(df['date'], errors='coerce')

invalid_dates = df[df['date'].isnull()]

print("Rows with invalid dates:")
print(invalid_dates)

df = df.dropna(subset=['date'])

df['year'] = df['date'].dt.year

for year, year_data in df.groupby('year'):
    source_counts = year_data['source'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.barplot(x=source_counts.index, y=source_counts.values, hue=source_counts.index, palette="viridis", legend=False)
    plt.title(f'Source distribution - Year {year}')
    plt.xlabel('Source')
    plt.ylabel('Number of articles')
    plt.xticks(rotation=45)
    plt.show()
    plt.tight_layout()

df['year'] = pd.to_datetime(df['date']).dt.year
articles_per_year = df['year'].value_counts().sort_index()

plt.figure(figsize=(10,6))
articles_per_year.plot(kind='line', marker='o')
plt.title('Number of articles per year')
plt.xlabel('Year')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
plt.tight_layout()

articles_per_source = df['source'].value_counts()

plt.figure(figsize=(10,6))
articles_per_source.plot(kind='line', marker='o')
plt.title('Number of articles per source')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.show()
plt.tight_layout()

articles_per_year_and_source = df.groupby(['year', 'source']).size().unstack(fill_value=0)

plt.figure(figsize=(18, 12))
articles_per_year_and_source.plot(kind='line', marker='o')
plt.title('Number of articles per year per source')
plt.xlabel('Year')
plt.ylabel('Number of articles')
plt.xticks(articles_per_year_and_source.index, rotation=45)
plt.grid(True)
plt.legend(title='Source', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

In [3]:
desired_sample_size = 200000
df_sample = df.sample(n=desired_sample_size, random_state=42)

X = df_sample['text']
y = df_sample['source']

num_unique_sources = df_sample['source'].nunique()
print("Numero di valori unici nella colonna 'source':", num_unique_sources)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  from kerastuner.tuners import RandomSearch,BayesianOptimization
  df = pd.read_csv("/kaggle/input/japanese-newspapers-20052021/japanese_news.csv", delimiter='\t')


Numero di valori unici nella colonna 'source': 21


In [4]:
tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
X_train_tok = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=115, return_tensors="tf")
X_test_tok = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=115, return_tensors="tf")

max_len = 115 
X_train_pad = pad_sequences(X_train_tok["input_ids"], maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_tok["input_ids"], maxlen=max_len, padding='post')


tokenizer_config.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [5]:

class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y= y_train)

class_weights_dict = dict(enumerate(class_weights))

print("Pesi delle classi:", class_weights_dict)

class_weights_df = pd.DataFrame({'Classe': np.unique(y_train), 'Peso': class_weights})
print(class_weights_df)

Pesi delle classi: {0: 0.745066264330884, 1: 190.47619047619048, 2: 1.4003028154838484, 3: 10.091453800063071, 4: 12.469799703842257, 5: 1.7910314102133567, 6: 4.391381912995746, 7: 0.33507993750759163, 8: 0.587209835764749, 9: 0.5059464518923978, 10: 0.7024753475057735, 11: 43.53741496598639, 12: 0.6332846495758971, 13: 0.41315805103018377, 14: 0.5670199909985576, 15: 1.5741833923652107, 16: 82.81573498964804, 17: 217.68707482993196, 18: 0.5696484201157098, 19: 5.690102777481418, 20: 0.659372359934887}
    Classe        Peso
0        0    0.745066
1        1  190.476190
2        2    1.400303
3        3   10.091454
4        4   12.469800
5        5    1.791031
6        6    4.391382
7        7    0.335080
8        8    0.587210
9        9    0.505946
10      10    0.702475
11      11   43.537415
12      12    0.633285
13      13    0.413158
14      14    0.567020
15      15    1.574183
16      16   82.815735
17      17  217.687075
18      18    0.569648
19      19    5.690103
20      

In [6]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer), output_dim=200))
model.add(Conv1D(128, 5, activation='relu'))
model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.4)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])



history = model.fit(X_train_pad, y_train, epochs=30, batch_size=64, validation_data=(X_test_pad, y_test))



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", accuracy)

y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()