In [None]:
!pip install pandas numpy scikit-learn matplotlib seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Load the tab-separated news dataset and take a first look at it.
data = pd.read_csv("japanese_news.csv", delimiter='\t')
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()
print(data.columns)

# Share of articles contributed by each source, expressed in percent.
source_column = 'source'
percentages = data[source_column].value_counts(normalize=True).mul(100)

# Draw the bar chart on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
percentages.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Percentage distribution for {}'.format(source_column))
ax.set_xlabel(source_column)
ax.set_ylabel('Percentage')
plt.xticks(rotation=45)
# Horizontal guide lines only, so bar heights are easy to read off.
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Absolute number of articles per source.
source_counts = data['source'].value_counts()

plt.figure(figsize=(10, 6))
# `palette` is only meaningful together with `hue`; recent seaborn releases
# deprecate passing it alone. Mapping hue to the same variable as x (and
# hiding the redundant legend) matches the per-year plots further down.
sns.barplot(
    x=source_counts.index,
    y=source_counts.values,
    hue=source_counts.index,
    palette="viridis",
    legend=False,
)
plt.title('Source distribution')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
# Make room for the rotated tick labels before rendering.
plt.tight_layout()
plt.show()


# Parse the publication dates; values that cannot be parsed become NaT
# instead of raising.
data['date'] = pd.to_datetime(data['date'], errors='coerce')

# Show the rows whose date could not be parsed before discarding them.
invalid_dates = data[data['date'].isnull()]

print("Rows with invalid dates:")
print(invalid_dates)

# .copy() detaches the filtered frame, so adding the 'year' column below
# writes to an independent object (avoids pandas' SettingWithCopyWarning).
data = data.dropna(subset=['date']).copy()

data['year'] = data['date'].dt.year

# One bar chart of article counts per source for every year in the data.
for year, year_data in data.groupby('year'):
    source_counts = year_data['source'].value_counts()

    plt.figure(figsize=(10, 6))
    sns.barplot(x=source_counts.index, y=source_counts.values, hue=source_counts.index, palette="viridis", legend=False)
    plt.title(f'Source distribution - Year {year}')
    plt.xlabel('Source')
    plt.ylabel('Number of articles')
    plt.xticks(rotation=45)
    # tight_layout() has to run before show(): once the figure has been
    # rendered, adjusting the layout no longer changes what was displayed.
    plt.tight_layout()
    plt.show()

# Number of articles per calendar year, in chronological order.
data['year'] = pd.to_datetime(data['date']).dt.year
articles_per_year = data['year'].value_counts().sort_index()

plt.figure(figsize=(10,6))
articles_per_year.plot(kind='line', marker='o')
plt.title('Number of articles per year')
plt.xlabel('Year')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
plt.grid(True)
# Adjust the layout before rendering; calling tight_layout() after show()
# would not affect the figure that was displayed.
plt.tight_layout()
plt.show()

# Number of articles per source, most frequent source first
# (value_counts() sorts in descending order of count).
articles_per_source = data['source'].value_counts()

plt.figure(figsize=(10,6))
articles_per_source.plot(kind='line', marker='o')
plt.title('Number of articles per source')
plt.xlabel('Source')
plt.ylabel('Number of articles')
plt.xticks(rotation=45)
# Adjust the layout before rendering; calling tight_layout() after show()
# would not affect the figure that was displayed.
plt.tight_layout()
plt.show()

# Year x source table of article counts; combinations that never occur are 0.
articles_per_year_and_source = data.groupby(['year', 'source']).size().unstack(fill_value=0)

# DataFrame.plot() opens a figure of its own unless it is handed an Axes.
# Creating the axes explicitly and passing it in makes the requested figure
# size apply to the actual plot and avoids leaving an empty figure behind.
fig, ax = plt.subplots(figsize=(18, 12))
articles_per_year_and_source.plot(kind='line', marker='o', ax=ax)
ax.set_title('Number of articles per year per source')
ax.set_xlabel('Year')
ax.set_ylabel('Number of articles')
# One tick per year present in the data.
plt.xticks(articles_per_year_and_source.index, rotation=45)
ax.grid(True)
ax.legend(title='Source', bbox_to_anchor=(1, 1))
fig.tight_layout()
plt.show()



In [None]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['source'],
    test_size=0.2,
    random_state=42,
)

# Baseline model: raw token counts fed into a multinomial Naive Bayes
# classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Train on the training split.
text_clf.fit(X_train, y_train)

# Score the held-out split.
predictions = text_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:")
print(classification_report(y_test, predictions))

In [None]:
from sklearn.model_selection import GridSearchCV

# TF-IDF features followed by a multinomial Naive Bayes classifier.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Train the untuned pipeline.
text_clf.fit(X_train, y_train)

# Evaluate the untuned pipeline on the held-out split.
# zero_division=0 reports 0.0 for classes that never get predicted without
# emitting an UndefinedMetricWarning for each of them.
predictions = text_clf.predict(X_test)
print("Accuracy (con TF-IDF):", accuracy_score(y_test, predictions))
print("Classification Report (con TF-IDF):")
print(classification_report(y_test, predictions, zero_division=0))

# Hyper-parameter grid: unigrams vs. unigrams+bigrams, and three smoothing
# strengths for Naive Bayes.
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1, 0.5, 1.0],
}

# 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best configuration found on the held-out split.
best_model = grid_search.best_estimator_
best_predictions = best_model.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy (con TF-IDF e parametri ottimizzati):", accuracy_score(y_test, best_predictions))
print("Classification Report (con TF-IDF e parametri ottimizzati):")
print(classification_report(y_test, best_predictions, zero_division=0))

Accuracy (con TF-IDF): 0.6025548324897566
Classification Report (con TF-IDF):
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.91      0.71      0.80       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       0.00      0.00      0.00        11
      mainichi.jp       0.80      0.53      0.64       812
 nikkansports.com       0.45      0.94      0.60      1126
       nikkei.com       1.00      0.00      0.01       252
 shimotsuke.co.jp       0.00      0.00      0.00         6
    tomamin.co.jp       0.75      0.49      0.59      1136
    yomiuri.co.jp       0.00      0.00      0.00       160

         accuracy                           0.60      4149
        macro avg       0.43      0.30      0.29      4149
     weighted avg       0.68      0.60      0.58      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best Parameters: {'clf__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}
Accuracy (con TF-IDF e parametri ottimizzati): 0.6418414075680887
Classification Report (con TF-IDF e parametri ottimizzati):
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.83      0.75      0.79       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       0.00      0.00      0.00        11
      mainichi.jp       0.76      0.62      0.68       812
 nikkansports.com       0.49      0.91      0.63      1126
       nikkei.com       0.94      0.41      0.57       252
 shimotsuke.co.jp       0.00      0.00      0.00         6
    tomamin.co.jp       0.79      0.48      0.60      1136
    yomiuri.co.jp       0.86      0.04      0.07       160

         accuracy                           0.64      4149
        macro avg       0.52      0.36      0.37      4149
     weighted avg       0.72      0.64      0.63      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Random Forest on TF-IDF features.
from sklearn.ensemble import RandomForestClassifier
rf_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # A fixed seed makes the randomised forest (and therefore the reported
    # scores) reproducible across runs; 42 matches the train/test split seed.
    ('clf', RandomForestClassifier(random_state=42)),
])

rf_clf.fit(X_train, y_train)
rf_predictions = rf_clf.predict(X_test)
print("Accuracy (Random Forest):", accuracy_score(y_test, rf_predictions))
print("Classification Report (Random Forest):")
# zero_division=0 reports 0.0 for classes that never get predicted without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, rf_predictions, zero_division=0))

Accuracy (Random Forest): 0.4483007953723789
Classification Report (Random Forest):
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.25      0.93      0.39       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       0.25      0.09      0.13        11
      mainichi.jp       0.60      0.43      0.50       812
 nikkansports.com       0.82      0.48      0.61      1126
       nikkei.com       0.96      0.19      0.32       252
 shimotsuke.co.jp       0.17      0.17      0.17         6
    tomamin.co.jp       0.75      0.28      0.40      1136
    yomiuri.co.jp       0.50      0.02      0.04       160

         accuracy                           0.45      4149
        macro avg       0.48      0.29      0.28      4149
     weighted avg       0.66      0.45      0.46      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Support Vector Machine (SVC with its default settings) on TF-IDF features.
svm_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

svm_clf.fit(X_train, y_train)
svm_predictions = svm_clf.predict(X_test)
print("Accuracy (SVM):", accuracy_score(y_test, svm_predictions))
print("Classification Report (SVM):")
# zero_division=0 reports 0.0 for classes that never get predicted without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, svm_predictions, zero_division=0))




Accuracy (SVM): 0.534586647384912
Classification Report (SVM):
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.34      0.84      0.49       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       0.00      0.00      0.00        11
      mainichi.jp       0.71      0.45      0.55       812
 nikkansports.com       0.81      0.62      0.70      1126
       nikkei.com       1.00      0.06      0.11       252
 shimotsuke.co.jp       1.00      0.17      0.29         6
    tomamin.co.jp       0.51      0.53      0.52      1136
    yomiuri.co.jp       0.00      0.00      0.00       160

         accuracy                           0.53      4149
        macro avg       0.49      0.30      0.29      4149
     weighted avg       0.61      0.53      0.52      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:

# Split the dataset into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['source'],
    test_size=0.2,
    random_state=42,
)

# Turn the texts into TF-IDF features: the vocabulary and IDF weights are
# learned on the training texts only and then applied to the test texts.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier (fit() returns the estimator).
classifier = MultinomialNB().fit(X_train_vect, y_train)

# Predict the source of every held-out article.
y_pred = classifier.predict(X_test_vect)

In [None]:
# Evaluate the classifier on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# zero_division=0 reports 0.0 for classes that never get predicted without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.6025548324897566
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.91      0.71      0.80       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       0.00      0.00      0.00        11
      mainichi.jp       0.80      0.53      0.64       812
 nikkansports.com       0.45      0.94      0.60      1126
       nikkei.com       1.00      0.00      0.01       252
 shimotsuke.co.jp       0.00      0.00      0.00         6
    tomamin.co.jp       0.75      0.49      0.59      1136
    yomiuri.co.jp       0.00      0.00      0.00       160

         accuracy                           0.60      4149
        macro avg       0.43      0.30      0.29      4149
     weighted avg       0.68      0.60      0.58      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import torch
from sklearn.ensemble import RandomForestClassifier
from transformers import BertTokenizer, BertModel

# Split the dataset into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['source'], test_size=0.2, random_state=42)

# Load the pretrained Japanese BERT tokenizer and encoder.
# NOTE(review): transformers warns that this checkpoint declares
# 'BertJapaneseTokenizer' as its tokenizer class, so loading it through
# BertTokenizer "may result in unexpected tokenization". Switching to the
# matching tokenizer presumably needs extra morphological-analysis packages
# that this notebook does not install -- confirm and switch if available.
tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese')
model.eval()  # inference only: make sure dropout is disabled


def get_bert_embeddings(text):
    """Embed ``text`` as the mean of BERT's last-layer token vectors.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the mean over the token axis of the model's
        last hidden state, with singleton dimensions squeezed away.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        # truncation=True is a no-op unless a maximum length is known; the
        # tokenizer reports none for this checkpoint, so cap the input at
        # the number of positions the encoder supports.
        max_length=model.config.max_position_embeddings,
    )
    # No gradients are needed for feature extraction; skipping the autograd
    # graph saves memory and time on every call.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings


# Embed every article, one row per text.
X_train_embeddings = np.vstack([get_bert_embeddings(text) for text in X_train])
X_test_embeddings = np.vstack([get_bert_embeddings(text) for text in X_test])

# Train a Random Forest on the BERT embeddings. The fixed seed makes the
# randomised forest reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_embeddings, y_train)

# Evaluate on the held-out split.
predictions = clf.predict(X_test_embeddings)
print("Accuracy with BERT embeddings:", accuracy_score(y_test, predictions))
print("Classification Report with BERT embeddings:")
# zero_division=0 reports 0.0 for classes that never get predicted without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions, zero_division=0))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Accuracy with BERT embeddings: 0.7727163171848638
Classification Report with BERT embeddings:
                   precision    recall  f1-score   support

hokkaido-np.co.jp       0.95      0.71      0.81       643
   iwate-np.co.jp       0.00      0.00      0.00         3
    kobe-np.co.jp       1.00      0.18      0.31        11
      mainichi.jp       0.80      0.78      0.79       812
 nikkansports.com       0.82      0.87      0.84      1126
       nikkei.com       0.89      0.45      0.60       252
 shimotsuke.co.jp       0.00      0.00      0.00         6
    tomamin.co.jp       0.66      0.90      0.76      1136
    yomiuri.co.jp       1.00      0.04      0.07       160

         accuracy                           0.77      4149
        macro avg       0.68      0.44      0.46      4149
     weighted avg       0.80      0.77      0.76      4149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Features are the article texts; the target is the publishing source.
X = data['text']
y = data['source']

# Split into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Extract TF-IDF features with the vocabulary capped at 10,000 terms; the
# vectorizer is fitted on the training texts only.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a linear support vector classifier (fit() returns the estimator).
classifier = LinearSVC().fit(X_train_tfidf, y_train)

# Predict the held-out articles and report the overall accuracy.
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6490720655579658


In [None]:
# Plot the confusion matrix for the predictions in `y_pred`.
# `classifier` is the fitted model that produced `y_pred`; its classes_ gives
# the class names. Passing them as `labels` fixes the row/column order of the
# matrix, so position i on both axes corresponds to class_labels[i].
class_labels = classifier.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Matrice di Confusione')
plt.colorbar()
# Label the ticks with the class names (not with the matrix rows themselves).
tick_positions = range(len(class_labels))
plt.xticks(tick_positions, class_labels, rotation=90)
plt.yticks(tick_positions, class_labels)
plt.xlabel('Valore Predetto')
plt.ylabel('Valore Reale')
# Leave room for the long, rotated class names.
plt.tight_layout()
plt.show()