<a href="https://colab.research.google.com/github/Chris-Congleton/MSc-Thesis/blob/main/Analysis/2021_SourceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install -U gensim
!pip install shap
!pip install datasets

import json, os
import pandas as pd
import numpy as np
import nltk
import glob
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm
nltk.download('punkt')

# regular expression library
import re
import pprint

import gensim
from gensim.utils import simple_preprocess
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('dutch')

#Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction                      
import IPython

from gensim.parsing.preprocessing import preprocess_string
from gensim import utils
import gensim.models

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

import datasets

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading and cleaning data

In [3]:
path_NRC = 'gdrive/MyDrive/CS/Thesis/Testset_2021/NRC_2021.json'
path_Trouw = 'gdrive/MyDrive/CS/Thesis/Testset_2021/Trouw_2021.json'
path_HetParool = 'gdrive/MyDrive/CS/Thesis/Testset_2021/HetParool_2021.json'
path_Telegraaf = 'gdrive/MyDrive/CS/Thesis/Testset_2021/Telegraaf_2021.json'
path_Volkskrant = 'gdrive/MyDrive/CS/Thesis/Testset_2021/Volkskrant_2021.json'

df_NRC = pd.read_json(path_NRC)
df_Trouw = pd.read_json(path_Trouw)
df_HetParool = pd.read_json(path_HetParool)
df_Telegraaf = pd.read_json(path_Telegraaf)
df_Volkskrant = pd.read_json(path_Volkskrant)

df_NRC["Source"] = "NRC"
df_Trouw["Source"] = "Trouw"
df_HetParool["Source"] = "HetParool"
df_Telegraaf["Source"] = "Telegraaf"
df_Volkskrant["Source"] = "Volkskrant"

## Concat data

In [4]:
dfs = [df_NRC, df_Trouw, df_HetParool, df_Volkskrant]
df = pd.concat(dfs, ignore_index=True)

# Source Classifier

## Preprocessing

In [5]:
# Select only political articles
articles = df.drop(columns=['url', 'publisherID', 'category',], axis=1)
articles = articles[articles['cleantext'].notna()]
# Remove punctuation
articles['cleantext_processed'] = articles['cleantext'].map(lambda x: re.sub('[,\\.!?]', '', x))
# Save D66
articles['cleantext_processed'] = articles['cleantext_processed'].map(lambda x: re.sub('D66', 'dzeszes', x))
# Save BIJ1
articles['cleantext_processed'] = articles['cleantext_processed'].map(lambda x: re.sub('BIJ1', 'bijeeen', x))
# Save 50PLUS
articles['cleantext_processed'] = articles['cleantext_processed'].map(lambda x: re.sub('50PLUS', 'vijfplus', x))
# Save JA21
articles['cleantext_processed'] = articles['cleantext_processed'].map(lambda x: re.sub('JA21', 'jatweeeen', x))
# Convert the titles to lowercase
articles['cleantext_processed'] = articles['cleantext_processed'].map(lambda x: x.lower())

p_and_p =  ['vvd', 'dzeszes', 'pvv', 'cda', 'pvda', 'sp', 'gl', 'fvd', 'pvdd',
            'cu', 'sgp', 'volt', 'jatweeeen', 'bbb', 'bijeeen', 'vijfplus', 'rutte', 'kaag',
            'wilders', 'hoekstra', 'ploumen', 'marijnissen', 'klaver', 'baudet',
            'ouwehand', 'segers', 'staaij', 'dassen', 'eerdmans', 'plas',
            'simons', 'den haan']

pat = '|'.join(r"\b{}\b".format(x) for x in p_and_p)
articles = articles[articles['cleantext_processed'].str.contains(pat)]
articles = articles[articles['cleantext'].str.len()>5]


In [6]:
# Remove stopwords
stop_words.extend(['nbsp', 'rsquo', 'lsquo','euml', 'rdquo', 'quot', 'ldquo',
                    'we', 'wel', 'jaar', 'zegt', 'gaat','moeten', 'volgens',
                    'heel', 'maken', 'weer', 'gaan', 'twee', 'mensen', 'waar', 'alleen',
                    'komt', 'zoals', 'alle', 'wij', 'zei','komen','tussen', 'eerste','zien'
                    'eigen', 'leven','grote', 'nieuwe','mee','goed', 'aantal','laten'
                    ,'tweede', 'af', 'eerder', 'eigen', 'elkaar','afgelopen', 'week',
                    'zeggen', 'echt', 'dag', 'zien', 'steeds', 'laten', 'willen', 'waarin',
                   'duizend','voormalig', 'nu', 'we'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
            if word not in stop_words] for doc in texts]

articles['cleantext_processed'] = remove_stopwords(articles['cleantext_processed'])

articles['cleantext_processed'] = [' '.join(map(str, l)) for l in articles['cleantext_processed']]

In [7]:
Xy = articles.drop(columns=['timestamp', 'title'])
X = Xy['cleantext_processed']
y = Xy['Source']

# Train test split

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
X_train[:5]

22425    opluchting hoorbaar telefoon even milad gebeld...
59361    kamerleden voerden mooi inhoudelijk haast hoff...
94442    blijkt reconstructie basis documenten openbaar...
81153    aldus opgewekt geluimd begaf stembureau kijken...
84912    leegte vullen achtergelaten vermoord verlies p...
Name: cleantext_processed, dtype: object

# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(min_df=50)
X_train, X_test = tfidfconverter.fit_transform(X_train).toarray(), tfidfconverter.fit_transform(X_test).toarray()

# Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
classifier = DummyClassifier(strategy='uniform')
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
classifier = DummyClassifier(strategy='most_frequent')
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# Decision Tree Classifier

In [None]:
classifier = DecisionTreeClassifier(max_depth=20, random_state=17)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
features = list(tfidfconverter.get_feature_names_out())


In [None]:
from sklearn import tree
text_representation = tree.export_text(classifier, feature_names=features)
print(text_representation)

# SVM

In [11]:
from sklearn import svm

In [12]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

ValueError: ignored

# xgboost


In [None]:
import xgboost
model = xgboost.XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# KNeighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# Linear SVC

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# SGD

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
clf = SGDClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

# Word2Vec

In [None]:
corpus = list(articles["cleantext_processed"].values)
corp_length = len(corpus)

print("Frame shape: ", articles.shape)
print("Corpus length: ", len(corpus))

class Corpus():
  """An iterator that yields sentences (lists of str)."""

  def __init__(self, corpus):
    self.corpus = corpus

  def __iter__(self):
      for line in self.corpus:
          # assume there's one document per line, tokens separated by whitespace
          yield utils.simple_preprocess(line)

# Training full corpus 60 hours
sentences = Corpus(corpus)
model = gensim.models.Word2Vec(sentences=tqdm(sentences, total=corp_length))

# RobBert

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 11.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 43.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed pyyaml-6.0 sacremoses-0.0.49 tokenizers-

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained("pdelobelle/robbert-v2-dutch-base")

Downloading:   0%|          | 0.00/715k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/374k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/660 [00:00<?, ?B/s]

In [None]:
articles = df.drop(columns=['url', 'publisherID', 'category','timestamp', 'title'], axis=1)
articles = articles[articles['cleantext'].str.len()>5]
df_bert = articles

In [None]:
def preprocess_function(examples):
    return tokenizer(examples['cleantext'], truncation=True, padding=True, max_length=128)

In [None]:
bert_dataset = datasets.Dataset.from_pandas(df_bert)

In [None]:
bert_dataset = bert_dataset.train_test_split(test_size=0.2)

In [None]:
tokenized = bert_dataset.map(preprocess_function, batched=True)

  0%|          | 0/78 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [None]:
tokenized = tokenized.class_encode_column("Source")

Casting to class labels:   0%|          | 0/78 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/20 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
model = RobertaForSequenceClassification.from_pretrained("pdelobelle/robbert-v2-dutch-base", num_labels=4)

Downloading:   0%|          | 0.00/446M [00:00<?, ?B/s]

Some weights of the model checkpoint at pdelobelle/robbert-v2-dutch-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.out_proj.bias', 'clas

In [None]:
tokenized = tokenized.rename_column("Source", "label")

In [None]:
!pip install wandb

import wandb
wandb.login()

Collecting wandb
  Downloading wandb-0.12.13-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 12.3 MB/s 
[?25hCollecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 54.3 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.2-cp37-cp37m-manylinux1_x86_64.whl (36 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.8-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 46.8 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.7 MB/s 
Collecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%env WANDB_PROJECT=source_classification

env: WANDB_PROJECT=source_classification


In [None]:
from datasets import load_metric

In [None]:
def compute_metrics(eval_pred):
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    metric3 = load_metric("f1")
    metric4 = load_metric("accuracy")
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision = metric1.compute(predictions=predictions, references=labels, average="macro")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="macro")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="macro")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1":f1, "accuracy":accuracy}

In [None]:
training_args = TrainingArguments(

    output_dir="./results",

    learning_rate=1e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    num_train_epochs=3,

    weight_decay=0.01,

    report_to="wandb",

    evaluation_strategy = "steps",

    logging_steps = 100,                    # we will log every 100 steps
    eval_steps = 1000,                      # we will perform evaluation every 1000 steps
    save_steps = 5000,
    load_best_model_at_end = True,

)

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized['train'],

    eval_dataset=tokenized['test'],

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cleantext, __index_level_0__. If cleantext, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 77472
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 14526
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[34m[1mwandb[0m: Currently logged in as: [33mchriscongleton[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss


In [None]:
wandb.finish()