<a href="https://colab.research.google.com/github/Akshayextreme/Fake_news_detection_hackathon/blob/master/Fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import BertForSequenceClassification, BertTokenizer

## Data & EDA

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
df = pd.read_csv('Train.csv')

In [5]:
df.head()

Unnamed: 0,Labels,Text,Text_Tag
0,1,Says the Annies List political group supports ...,abortion
1,2,When did the decline of coal start? It started...,"energy,history,job-accomplishments"
2,3,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy
3,1,Health care reform legislation is likely to ma...,health-care
4,2,The economic turnaround started at the end of ...,"economy,jobs"


In [6]:
df.isnull().sum()

Labels      0
Text        0
Text_Tag    2
dtype: int64

In [7]:
df.Text_Tag.fillna('Not Available', inplace=True)

In [8]:
df['Text_Tag'] = df.Text_Tag.str.replace(",", " ")

We try 3 approaches here for handling TEXT_TAG column
- Approach A : Neglect it
- Approach B : Consider it as a second sentence while giving input to BERT (For this change Dataset class)
- Approach C : Concanate it with TEXT column

In [None]:
# Approach C
# df['Text'] = df['Text'] + df['Text_Tag']

In [99]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [77]:
def leng_txt(x):
    in_ids = tokenizer.encode(x, add_special_tokens=True)
    return len(in_ids)

In [78]:
df['length'] = df.Text.apply(leng_txt)

Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors


In [79]:
df.length.quantile(0.95)

42.0

In [80]:
df.length.median()

22.0

In [81]:
MAX_LEN = 50
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
LEARNING_RATE = 1e-05

In [112]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.text = dataframe.Text
        # Approach B
        #self.text_tag = dataframe.Text_Tag
        self.targets = dataframe.Labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        text = " ".join(text.split())

        # Approach B
        #text_tag = str(self.text_tag[idx])

        inputs = self.tokenizer.encode_plus(
            text,
            # Approach B
            None,#text_tag,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[idx], dtype=torch.long)
        }

In [113]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=648)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
validation_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)

FULL Dataset: (10240, 4)
TRAIN Dataset: (8192, 4)
TEST Dataset: (2048, 4)


In [114]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

## DistilBert / BERT - Sequence Classifier

In [115]:
# DistilBert
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels = 6,
    output_attentions = False,
    output_hidden_states = False,
)

# Tell pytorch to run this model on the GPU.
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [116]:
# # BERT
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased",
#     num_labels = 6,
#     output_attentions = False,
#     output_hidden_states = False,
# )

# # Tell pytorch to run this model on the GPU.
# model.to(device)

In [117]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [118]:
from transformers import get_linear_schedule_with_warmup
EPOCHS = 30
total_steps = len(training_loader) * EPOCHS
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [119]:
def train(epoch):
    model.train()
    running_loss = 0.0
    running_corrects = 0

    for i, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        loss, outputs = model(ids, mask, labels=targets) # DistilBert
        # loss, outputs = model(ids, mask, token_type_ids, labels=targets) #Bert
        preds = torch.argmax(outputs, 1)

        optimizer.zero_grad()

        #Statistics
        running_loss += loss.item()
        running_corrects += torch.sum(preds == targets.data)
        
        optimizer.zero_grad()
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    epoch_loss = running_loss / training_set.__len__()
    epoch_acc = running_corrects.double() / training_set.__len__()
    print(f'Train -> Epoch: {epoch}, Loss: {epoch_loss}, Accuracy: {epoch_acc}')

In [120]:
def validation(epoch):
    model.eval()
    running_loss = 0.0
    log_loss = 0.0
    running_corrects = 0

    with torch.no_grad():
        for _, data in enumerate(validation_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            loss, outputs = model(ids, mask, labels=targets) #DistilBert
            # loss, outputs = model(ids, mask, token_type_ids, labels=targets) #Bert
            preds = torch.argmax(outputs, 1)

            running_loss += loss.item()
            running_corrects += torch.sum(preds == targets.data)
        
        epoch_loss = running_loss / validation_set.__len__()
        epoch_acc = running_corrects.double() / validation_set.__len__()

        print(f'Valid -> Epoch: {epoch}, Loss: {epoch_loss}, Accuracy: {epoch_acc}')
        print('\n ==================================================================== \n')
        return epoch_acc

In [None]:
import copy
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0

for epoch in range(EPOCHS):
    train(epoch)
    epoch_acc = validation(epoch)
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model_wts = copy.deepcopy(model.state_dict())

model.load_state_dict(best_model_wts)
print(f'Best Valid Acc: {best_acc}')

Train -> Epoch: 0, Loss: 0.02732998393184971, Accuracy: 0.2196044921875
Valid -> Epoch: 0, Loss: 0.05356467189267278, Accuracy: 0.25927734375


Train -> Epoch: 1, Loss: 0.026572294023935683, Accuracy: 0.2647705078125
Valid -> Epoch: 1, Loss: 0.052947554213460535, Accuracy: 0.26953125


Train -> Epoch: 2, Loss: 0.025991988455643877, Accuracy: 0.2901611328125
Valid -> Epoch: 2, Loss: 0.05258731753565371, Accuracy: 0.26513671875


Train -> Epoch: 3, Loss: 0.02520090561301913, Accuracy: 0.330322265625
Valid -> Epoch: 3, Loss: 0.0525475979084149, Accuracy: 0.26171875


Train -> Epoch: 4, Loss: 0.024354281689738855, Accuracy: 0.359130859375
Valid -> Epoch: 4, Loss: 0.053016281861346215, Accuracy: 0.267578125


Train -> Epoch: 5, Loss: 0.023395030584651977, Accuracy: 0.4012451171875
Valid -> Epoch: 5, Loss: 0.05351278459420428, Accuracy: 0.26416015625


Train -> Epoch: 6, Loss: 0.022352007712470368, Accuracy: 0.43896484375
Valid -> Epoch: 6, Loss: 0.0545324407867156, Accuracy: 0.255859375


T

### Test

In [92]:
df_test = pd.read_csv('Test.csv')
df_test['Labels'] = 0

In [93]:
df_test['Text_Tag'] = df_test.Text_Tag.str.replace(",", " ")

In [None]:
df_test['length'] = df_test.Text.apply(leng_txt)
df_test.length.quantile(0.95)

Token indices sequence length is longer than the specified maximum sequence length for this model (681 > 512). Running this sequence through the model will result in indexing errors


41.0

In [94]:
testing_set = CustomDataset(df_test, tokenizer, MAX_LEN)
TEST_BATCH_SIZE = 32
test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 4
                }
testing_loader = DataLoader(testing_set, **test_params)

In [95]:
proba = torch.nn.Softmax(dim=1)

def test():
    model.eval()
    prediction = []

    with torch.no_grad():
        for i, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            #targets = data['targets'].to(device, dtype = torch.long)

            #outputs = model(ids, mask, token_type_ids)
            outputs = model(ids, mask)
            outputs = proba(outputs[0])
            outputs = outputs.detach().cpu().numpy()
            prediction.append(outputs)
        return prediction

In [96]:
tmp = test()

In [97]:
submit = pd.DataFrame(np.vstack(tmp))

In [98]:
submit.to_csv('submit.csv', index=False)

## BERT as Feautre Extractor

In [None]:
from transformers import BertModel

In [None]:
train_size = 1
train_dataset=df.sample(frac=train_size,random_state=648)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)

FULL Dataset: (10240, 3)
TRAIN Dataset: (10240, 3)
TEST Dataset: (1267, 3)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 4
                }

train_dataloader = DataLoader(training_set, **train_params)

In [None]:
Bertmodel = BertModel.from_pretrained('bert-base-uncased')
Bertmodel.to(device)

In [None]:
Bertmodel.eval()
prediction = []

with torch.no_grad():
    for i, data in enumerate(train_dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

        # Only taking pooled output i.e. [CLS] token
        _, outputs = Bertmodel(ids, mask, token_type_ids)
        outputs = outputs.detach().cpu().numpy()
        prediction.append(outputs)

In [None]:
prediction[0].shape

(64, 768)

In [None]:
bert_features = np.vstack(prediction)

In [None]:
bert_features.shape

(1267, 768)

In [None]:
train_dataset.head()

Unnamed: 0,Labels,Text,Text_Tag
0,0,Congress decision to spend less around the wor...,congress federal-budget foreign-policy
1,0,As a result of Chris Sununus vote against a Pl...,abortion health-care voting-record
2,5,Says ending the direct payment farm subsidy pr...,agriculture federal-budget
3,3,Im also the only lieutenant governor to not ta...,state-budget
4,0,President Ronald Reagan sent troops into confl...,foreign-policy history military


### TF - IDF for TEXT_TAG column

In [None]:
corpus = train_dataset.Text_Tag.values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
tf_idf = vectorizer.fit_transform(corpus)
#print(vectorizer.get_feature_names())
print(tf_idf.shape)

(10240, 176)


In [None]:
tf_idf = tf_idf.toarray()

In [None]:
X = np.concatenate((bert_features, tf_idf), axis=1)
X.shape

(10240, 944)

In [None]:
y = train_dataset.Labels.values
y.shape

(10240,)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import np_utils

In [None]:
# one-hot encoding using keras' numpy-related utilities
n_classes = 6
print("Shape before one-hot encoding: ", y_train.shape)
y_train = np_utils.to_categorical(y_train, n_classes)
y_test = np_utils.to_categorical(y_test, n_classes)
print("Shape after one-hot encoding: ", y_train.shape)

Shape before one-hot encoding:  (8192,)
Shape after one-hot encoding:  (8192, 6)


In [None]:
model = Sequential()
model.add(Dense(1024, input_shape=(944,), activation='relu'))
model.add(Dropout(0.4))
# output layer
model.add(Dense(6, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 1024)              967680    
_________________________________________________________________
dropout_21 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_31 (Dense)             (None, 6)                 6150      
Total params: 973,830
Trainable params: 973,830
Non-trainable params: 0
_________________________________________________________________


In [None]:
import matplotlib.pyplot as plt


def plot_hist(hist):
    plt.plot(hist.history["accuracy"])
    plt.plot(hist.history["val_accuracy"])
    plt.title("model accuracy")
    plt.ylabel("accuracy")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
hist = model.fit(X_train, y_train,
                 batch_size=128,
                 epochs=500,
                 validation_data=(X_test, y_test),
                 )
plot_hist(hist)

### Test

In [None]:
df_test = pd.read_csv('Test.csv')
df_test['Labels'] = 0

In [None]:
df_test['Text_Tag'] = df_test.Text_Tag.str.replace(",", " ")

In [None]:
tf_idf_test = vectorizer.transform(df_test.Text_Tag.values)
print(tf_idf_test.shape)
tf_idf_test = tf_idf_test.toarray()

(1267, 176)


In [None]:
testing_set = CustomDataset(df_test, tokenizer, MAX_LEN)
print("TEST Dataset: {}".format(df_test.shape))

In [None]:
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 4
                }

test_dataloader = DataLoader(testing_set, **test_params)

In [None]:
Bertmodel.eval()
prediction = []

with torch.no_grad():
    for i, data in enumerate(test_dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

        # Only taking pooled output i.e. [CLS] token
        _, outputs = Bertmodel(ids, mask, token_type_ids)
        outputs = outputs.detach().cpu().numpy()
        prediction.append(outputs)

In [None]:
bert_features = np.vstack(prediction)

In [None]:
bert_features.shape

(1267, 768)

In [None]:
testset = np.concatenate((bert_features, tf_idf_test), axis=1)
testset.shape

(1267, 944)

In [None]:
pred_sub = model.predict(testset)
pred_sub.shape

(1267, 6)

In [None]:
pd.DataFrame(pred_sub).to_csv('submit.csv', index=False)