In [1]:
import numpy as np
import pandas as pd
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, get_scheduler
import torch
from tqdm import tqdm
import evaluate
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pyarrow as pa




# Импорт и предобработка

In [80]:
df = pd.read_csv('dataset/train.csv', index_col=['id'])
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [81]:
def ohe(value_in, value_comp):
    if value_in == value_comp:
        return 1
    else:
        return 0      

In [82]:
df['LABEL_0'] = df['label'].map(lambda x: ohe(x, 0))
df['LABEL_1'] = df['label'].map(lambda x: ohe(x, 1))
df.head()

Unnamed: 0_level_0,title,author,text,label,LABEL_0,LABEL_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,0,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,1,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,0,1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,0,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,0,1


In [83]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

df['cleaned_text'] = df['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(str(x)) if 
        not token.is_stop 
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)

In [84]:
df.to_csv('temp.csv')

# Выбор модели

## Fake-News-Bert-Detect

In [4]:
df = pd.read_csv('temp.csv', index_col=['id'])

In [86]:
df.drop(['text', 'title', 'author', 'label'], axis=1, inplace=True)
df.rename(columns={'cleaned_text':'text'}, inplace=True)
df.head()

Unnamed: 0_level_0,LABEL_0,LABEL_1,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,house dem aide comey letter jason chaffetz twe...
1,1,0,feeling life circle roundabout head straight l...
2,0,1,truth fire october tension intelligence analys...
3,0,1,videos civilians kill single airstrike identif...
4,0,1,print iranian woman sentence year prison iran ...


In [47]:
df['cleaned_text'] = df['cleaned_text'].str.split()[:450].str.join(' ')

In [49]:
MODEL = "jy46604790/Fake-News-Bert-Detect"
clf = pipeline("text-classification", model=MODEL, tokenizer=MODEL)



In [73]:
n_true = 0
n_false = 0
n_failed = 0

for n, (_, row) in enumerate(df.iterrows()):
    text = row[2]
    try:
        lbl = clf(text)[0]['label']
        if (lbl == 'LABEL_0' and df['LABEL_0'][n] == 1) or (lbl == 'LABEL_1' and df['LABEL_1'][n]==1):
            n_true += 1
        else:
            n_false += 1       
    except:
        n_failed += 1

In [74]:
n_true, n_false, n_failed

(121, 140, 20539)

Данная модель непригодна для поставленной задачи

## roberta-fake-news-classification

In [5]:
tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")



In [19]:
def predict_fake(title,text):
    input_str = "<title>" + title + "<content>" +  text + "<end>"
    input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    device =  'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    with torch.no_grad():
        output = model(input_ids["input_ids"].to(device), attention_mask=input_ids["attention_mask"].to(device))
    return dict(zip(["Fake","Real"], [x.item() for x in list(torch.nn.Softmax()(output.logits)[0])] ))

In [7]:
df = pd.read_csv('dataset/train.csv', index_col=['id'])

In [8]:
df.head()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
pred = predict_fake(df['title'][0], df['text'][0])
pred

  return self._call_impl(*args, **kwargs)


{'Fake': 0.9999282360076904, 'Real': 7.174160418799147e-05}

In [10]:
n_true = 0
n_false = 0
n_failed = 0

for _, row in tqdm(df.iterrows()):
    text = row[2]
    title = row[0]
    content = row[2]
    lbl = row[3]
    try:
        pred = predict_fake(title, content)
        if pred['Fake'] > pred['Real'] and lbl == 1:
            n_true += 1
        else:
            n_false += 1
    except:
        n_failed += 1
n_true, n_false, n_failed

  return self._call_impl(*args, **kwargs)
20800it [3:52:45,  1.49it/s]


(8621, 11582, 597)

In [11]:
print('Accuracy: ', n_true / (n_true + n_false + n_failed))

Accuracy:  0.41447115384615385


# Дообучение

### Trainer

In [3]:
df = pd.read_csv('dataset/train.csv', index_col=['id'])
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(['id'], axis=1, inplace=True)
df.head()

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
indexes = np.random.permutation(df.shape[0])
train_df = df.loc[indexes[:int(len(indexes)*0.8)], :]
test_df = df.loc[indexes[int(len(indexes)*0.8):], :]
print(train_df.shape, test_df.shape)

(14628, 4) (3657, 4)


In [2]:
tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
def tokenize_function(examples):
	input_str = "<title>" + examples['title'] + "<content>" +  examples['text'] + "<end>"
	input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
	return input_ids



In [12]:
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)

In [13]:
tokenized_train = train.map(tokenize_function)
tokenized_test = test.map(tokenize_function)

Map:   0%|          | 0/14628 [00:00<?, ? examples/s]

Map:   0%|          | 0/3657 [00:00<?, ? examples/s]

In [58]:
tokenized_train

Dataset({
    features: ['title', 'author', 'text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 14628
})

In [59]:
tokenized_train_rm = tokenized_train.remove_columns(['title', 'author', 'text'])
tokenized_test_rm = tokenized_train.remove_columns(['title', 'author', 'text'])
tokenized_train_rm.set_format("torch")
tokenized_test_rm.set_format("torch")

In [16]:
model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")



In [61]:
training_args = TrainingArguments(
	output_dir = 'test_trainer_log',
	evaluation_strategy = 'epoch',
	per_device_train_batch_size = 6,
	per_device_eval_batch_size = 6,
	num_train_epochs = 5,
	report_to='none')

In [62]:
metric = evaluate.load('f1')
def compute_metrics(eval_pred):
	logits, labels = eval_pred
	predictions = np.argmax(logits, axis=-1)
	return metric.compute(predictions=predictions, references=labels)

In [65]:
trainer = Trainer(
	model = model,
	args = training_args,
	train_dataset = tokenized_train_rm,
	eval_dataset = tokenized_test_rm,
	compute_metrics = compute_metrics)

In [66]:
trainer.train()

  0%|          | 0/12190 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

### Native torch

In [2]:
df = pd.read_csv('dataset/train.csv', index_col=['id'])
df.dropna(inplace=True)
df.reset_index(inplace=True)
df.drop(['id'], axis=1, inplace=True)
indexes = np.random.permutation(df.shape[0])
train_df = df.loc[indexes[:int(len(indexes)*0.8)], :]
test_df = df.loc[indexes[int(len(indexes)*0.8):], :]
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)

In [3]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.col_text = df['text'].to_numpy()
        self.col_label = df['label'].to_numpy()
        self.col_title = df['title'].to_numpy()

    def __getitem__(self, idx):
        text = self.col_text[idx]
        title = self.col_title[idx]
        label = self.col_label[idx]
        
        return title, text, label
    
    def __len__(self):
        return len(self.col_text)
    
    def getitems(self, indices):
        return [self[idx] for idx in indices]
    
train_data = MyDataset(pa.Table.from_pandas(train_df))
train_dataloader = torch.utils.data.DataLoader(train_data)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("hamzab/roberta-fake-news-classification")
model = AutoModelForSequenceClassification.from_pretrained("hamzab/roberta-fake-news-classification")
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)



In [5]:


def predict_logits(title,text):
    input_str = "<title>" + title + "<content>" +  text + "<end>"
    input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    device =  'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    output = model(input_ids["input_ids"].to(device), attention_mask=input_ids["attention_mask"].to(device))
    return torch.nn.Softmax()(output.logits)[0]

In [6]:
predict_logits('sdd', 'asdasdasdad asdasdsad dsfsf').argmax(-1)

  return self._call_impl(*args, **kwargs)


tensor(0)

In [7]:
def predict_fake(title,text):
    input_str = "<title>" + title + "<content>" +  text + "<end>"
    input_ids = tokenizer.encode_plus(input_str, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
    device =  'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    with torch.no_grad():
        output = model(input_ids["input_ids"].to(device), attention_mask=input_ids["attention_mask"].to(device))
    return dict(zip(["Fake","Real"], [x.item() for x in list(torch.nn.Softmax()(output.logits)[0])] ))

In [8]:
predict_fake('sdd', 'asdasdasdad asdasdsad dsfsf')

{'Fake': 0.999477207660675, 'Real': 0.0005227898363955319}

In [13]:
EPOCHS = 5
total_acc, total_count = 0, 0
log_interval = 1000

model.train()
for epoch in range(1, EPOCHS + 1):
    n_true = 0
    n_false = 0
    n_failed = 0
    for idx, (title, text, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        try:
            output = predict_fake(title=title[0], text=text[0])
            #print(output)
            if output['Fake'] > output['Real']:
                predicted_label = 1
            else:
                predicted_label = 0
            if predicted_label == label.item():
                n_true += 1
            else:
                n_false += 1
            loss = criterion(predicted_label, label)
            loss.backward()
            optimizer.step()
        except:
            predicted_label = 0
            if predicted_label == label.item():
                n_true += 1
            else:
                n_false += 1
        total_acc = n_true/(n_true + n_false + n_failed)
        if idx %  log_interval == 0:
            print(f'Epoch: {epoch}, total: {idx}, accuracy: {total_acc}')       

Epoch: 1, total: 0, accuracy: 0.5
Epoch: 1, total: 1000, accuracy: 0.48301698301698304
Epoch: 1, total: 2000, accuracy: 0.4847576211894053
Epoch: 1, total: 3000, accuracy: 0.4830056647784072
Epoch: 1, total: 4000, accuracy: 0.48412896775806047
Epoch: 1, total: 5000, accuracy: 0.4854029194161168
Epoch: 1, total: 6000, accuracy: 0.48591901349775035
Epoch: 1, total: 7000, accuracy: 0.48664476503356663
Epoch: 1, total: 8000, accuracy: 0.48662667166604173
Epoch: 1, total: 9000, accuracy: 0.4865570492167537
Epoch: 1, total: 10000, accuracy: 0.48635136486351366
Epoch: 1, total: 11000, accuracy: 0.48622852467957456
Epoch: 1, total: 12000, accuracy: 0.4860011665694525
Epoch: 1, total: 13000, accuracy: 0.486501038381663
Epoch: 1, total: 14000, accuracy: 0.48657238768659383
Epoch: 2, total: 0, accuracy: 0.5
Epoch: 2, total: 1000, accuracy: 0.482017982017982
Epoch: 2, total: 2000, accuracy: 0.4832583708145927
Epoch: 2, total: 3000, accuracy: 0.4810063312229257
Epoch: 2, total: 4000, accuracy: 0.48