In [None]:
# Mount Google Drive at /gdrive, forcing a remount if it is already mounted.
# NOTE(review): the next cell mounts Drive again at /content/drive, which is the
# location main() reads from and writes to; confirm this mount is still needed.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
# Mount Google Drive at /content/drive; main() loads news_summary.csv from, and
# writes predictions.csv to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the `T5` package with pip.
# NOTE(review): no visible cell imports it (the model and tokenizer classes come
# from `transformers`); confirm it is actually required.
!pip install T5



In [None]:
# Quietly install Hugging Face Transformers and the Weights & Biases client.
# NOTE(review): the following cell re-installs transformers pinned to 4.1.1, so
# the unpinned transformers install here is superseded.
!pip install transformers -q
!pip install wandb -q

In [None]:
# Pin transformers to 4.1.1 (replaces whatever version an earlier cell installed).
! pip install transformers==4.1.1



In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration
import wandb

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Wed Dec  1 03:21:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    28W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from torch import cuda

# Prefer the GPU when the runtime exposes one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Log the Weights & Biases CLI in for this runtime; main() later calls
# wandb.init() and wandb.log() to record the run.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mkkon[0m (use `wandb login --relogin` to force relogin)


In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes (article, summary) rows for T5 fine-tuning.

    Each item is built from one row of ``dataframe``: the ``ctext`` column is
    encoded as the model input and the ``text`` column as the target summary.
    Both are padded/truncated to a fixed length so items can be batched.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        """Store the frame, tokenizer and the two fixed sequence lengths.

        Args:
            dataframe: pandas DataFrame with ``text`` and ``ctext`` columns.
            tokenizer: tokenizer exposing ``batch_encode_plus``.
            source_len: token length every encoded ``ctext`` is padded/truncated to.
            summ_len: token length every encoded ``text`` is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace in ``raw`` and tokenize it to exactly ``max_length`` ids."""
        cleaned = ' '.join(str(raw).split())
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # flag and make the truncation explicit instead of relying on the
        # tokenizer's implicit fallback (which emits a warning).
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded tensors for the row at position ``index``.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (the latter two hold the same target ids), each
            a 1-D ``torch.long`` tensor.
        """
        # Samplers hand out positions 0..len-1, so index positionally; label
        # lookup fails on frames whose index was not reset.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # Drop only the leading batch dimension added by batch_encode_plus; a
        # bare squeeze() would also collapse a length-1 sequence to a scalar.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one optimisation pass over ``loader``.

    For every batch the target ids are shifted by one position: all but the
    last token feed the decoder, all but the first are the labels, with pad
    positions in the labels replaced by -100. The loss is logged to wandb
    every 10 steps and printed every 500 steps.

    Args:
        epoch: epoch number, used only in the printed progress line.
        tokenizer: provides ``pad_token_id``.
        model: called with input ids, attention mask, decoder input ids and
            labels; the first element of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: optimizer stepped once per batch.
    """
    model.train()

    for step, batch in enumerate(loader):
        targets = batch['target_ids'].to(device, dtype=torch.long)
        decoder_inputs = targets[:, :-1].contiguous()

        # Labels are the targets shifted left by one, with padding masked out.
        shifted = targets[:, 1:]
        lm_labels = shifted.masked_fill(shifted == tokenizer.pad_token_id, -100)

        input_ids = batch['source_ids'].to(device, dtype=torch.long)
        attention = batch['source_mask'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention,
            decoder_input_ids=decoder_inputs,
            labels=lm_labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            wandb.log({"Training Loss": loss.item()})
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """Generate a summary for every batch in ``loader`` and decode it next to its reference.

    Args:
        epoch: accepted for symmetry with ``train``; not used in the body.
        tokenizer: provides ``decode``.
        model: put in eval mode and used through ``generate`` (beam search
            with 4 beams, at most 150 tokens).
        device: device the batch tensors are moved to.
        loader: iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings in loader order.
    """
    model.eval()
    predictions, actuals = [], []

    def _to_text(sequences):
        # Decode each id sequence, dropping special tokens.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in sequences
        ]

    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention,
                max_length=150,
                num_beams=4,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            batch_preds = _to_text(generated_ids)
            batch_refs = _to_text(reference_ids)

            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions += batch_preds
            actuals += batch_refs

    return predictions, actuals

In [None]:
def test(tokenizer, model, device, loader):
    """Summarize the module-level ``input_text`` with ``model`` and print the result.

    The text is read from the global ``input_text`` (set by the notebook's
    entry cell), tokenized, summarized once with beam search, printed under
    an ``output:`` header and returned.

    Args:
        tokenizer: callable tokenizer that also provides ``decode``.
        model: model exposing ``eval`` and ``generate``.
        device: device the encoded text is moved to.
        loader: kept for interface compatibility; its batches are not read.
            Previously the same summary was regenerated once per batch of this
            loader without using the batch, so the loop was pure repeated work.
            NOTE(review): taking the text from ``loader`` (or an explicit
            argument) instead of a global would be cleaner, but needs the
            caller changed at the same time.

    Returns:
        list of decoded summaries (one entry for the single input text).
    """
    # Generation should not depend on the caller having left the model in eval
    # mode; switch off training-only behaviour such as dropout here.
    model.eval()

    # main() prefixes every training article with 'summarize: ', so present
    # the inference text in the same format.
    prefix = 'summarize: '
    prompt = input_text if input_text.startswith(prefix) else prefix + input_text

    tokenized_text = tokenizer(prompt, truncation=True, padding=True, return_tensors='pt')
    ids = tokenized_text['input_ids'].to(device, dtype=torch.long)
    mask = tokenized_text['attention_mask'].to(device, dtype=torch.long)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=500,
            num_beams=4,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True
        )

    pred = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in generated_ids
    ]

    print("\noutput:\n" + pred[0])
    return pred

In [None]:
def main(input_text,
         data_path='/content/drive/MyDrive/news_summary.csv',
         predictions_path='/content/drive/MyDrive/predictions.csv'):
    """Fine-tune t5-base on the news-summary CSV, score the validation split, then summarize ``input_text``.

    Steps: start a wandb run and record the hyperparameters, seed torch and
    numpy, load the CSV and split it 80/20, fine-tune with ``train``, write
    the validation predictions to ``predictions_path`` via ``validate`` and
    finally call ``test`` with the loader built from ``input_text``.

    Args:
        input_text: text to summarize once fine-tuning has finished.
        data_path: CSV with a ``text`` column (used as the target summary) and
            a ``ctext`` column (used as the model input). Defaults to the
            location the notebook originally hard-coded.
        predictions_path: destination of the validation predictions CSV.
            Defaults to the location the notebook originally hard-coded.
    """
    # WandB – start a new run; `config` stores the hyperparameters with it.
    wandb.init(project="transformers_tutorials_summarization")

    config = wandb.config
    config.TRAIN_BATCH_SIZE = 2    # batch size for training
    config.VALID_BATCH_SIZE = 2    # batch size for validation
    config.TEST_BATCH_SIZE = 2     # batch size for the final summarization
    config.TRAIN_EPOCHS = 2        # number of fine-tuning epochs
    config.VAL_EPOCHS = 1          # number of validation passes
    config.LEARNING_RATE = 1e-4
    config.SEED = 42
    config.MAX_LEN = 512           # token length of every encoded article
    config.SUMMARY_LEN = 500       # token length of every encoded summary

    # Seed torch and numpy and ask cuDNN for deterministic kernels.
    torch.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    torch.backends.cudnn.deterministic = True

    # Tokenizer used to encode both articles and summaries.
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Load the data, keep only the two needed columns and prepend the
    # 'summarize: ' task prefix to every article. `.copy()` makes the column
    # assignment act on an independent frame rather than a slice.
    df = pd.read_csv(data_path, encoding='latin-1')
    df = df[['text', 'ctext']].copy()
    df['ctext'] = 'summarize: ' + df['ctext']
    print(df.head())

    # 80% of the rows are used for training, the rest for validation.
    train_size = 0.8
    train_dataset = df.sample(frac=train_size, random_state=config.SEED)
    val_dataset = df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    # Single-row frame holding the user's text, formatted like the training
    # articles (same task prefix); it has no reference summary.
    test_dataset = pd.DataFrame({'text': [''], 'ctext': [f'summarize: {input_text}']})

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    # This is the held-out validation split (it was previously printed as "TEST").
    print("VALIDATION Dataset: {}".format(val_dataset.shape))

    # Wrap the three frames in tokenizing datasets.
    training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
    test_set = CustomDataset(test_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

    # Only the training loader shuffles.
    train_params = {
        'batch_size': config.TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    val_params = {
        'batch_size': config.VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    test_params = {
        'batch_size': config.TEST_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)
    testing_loader = DataLoader(test_set, **test_params)

    # Load the pretrained t5-base model and move it to the selected device.
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    model = model.to(device)

    # Optimizer that updates the model weights during fine-tuning.
    optimizer = torch.optim.Adam(params=model.parameters(), lr=config.LEARNING_RATE)

    # Let wandb track the model.
    wandb.watch(model, log="all")

    # Training loop
    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(config.TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    # Validation loop: generate summaries for the validation split and save
    # predictions next to the reference summaries as a CSV.
    print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    for epoch in range(config.VAL_EPOCHS):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
        final_df.to_csv(predictions_path)
        print('Output Files generated for review')

    # Summarize the user's text. Pass the one-row test loader, not the
    # validation loader: test() iterated whatever loader it was given, so the
    # validation loader made it regenerate the same summary once per
    # validation batch.
    test(tokenizer, model, device, testing_loader)

# Notebook entry point: read the text to summarize from standard input, then
# run the whole fine-tune / validate / summarize pipeline on it.
if __name__ == '__main__':
    print("input:")
    # NOTE: `input_text` has to stay a module-level name; test() reads it as a
    # global rather than receiving it as an argument.
    input_text = input()
    main(input_text)

input:
So you have to run k-means algorithm again, and again, then until you find the optimal result, why we it requires initial means, and it matters what you pick for initial means, for example, this centers are initialized like this, these data points. Okay, and actually, using another similar to measure means the these 2D input space. So you can see this input image is broken in two places and each class form or process, actually, similar reasons. But if you don't use equally their distance, instead, you use another way to define a similarity between vectors, you will have different clustering result. So it forms a cluster 1 and this Center is close to these oranges, of course, and these data points, make a cluster 2 and in the same way, we make a cluster 3 and close to 40 K. So, this green point is more close to this data point rather than this data point. But let's say you have different initial means. And then next small step of iteration is change the cluster Center to the aver

[34m[1mwandb[0m: Currently logged in as: [33mkkon[0m (use `wandb login --relogin` to force relogin)


                                                text                                              ctext
0  The Administration of Union Territory Daman an...  summarize: The Daman and Diu administration on...
1  Malaika Arora slammed an Instagram user who tr...  summarize: From her special numbers to TV?appe...
2  The Indira Gandhi Institute of Medical Science...  summarize: The Indira Gandhi Institute of Medi...
3  Lashkar-e-Taiba's Kashmir commander Abu Dujana...  summarize: Lashkar-e-Taiba's Kashmir commander...
4  Hotels in Maharashtra will train their staff t...  summarize: Hotels in Mumbai and other Indian c...
FULL Dataset: (4514, 2)
TRAIN Dataset: (3611, 2)
TEST Dataset: (903, 2)


Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by prov

Initiating Fine-Tuning for the model on our dataset
Epoch: 0, Loss:  6.237545490264893




Epoch: 0, Loss:  2.215460777282715
Epoch: 0, Loss:  2.6175198554992676
Epoch: 0, Loss:  1.394105076789856
Epoch: 1, Loss:  2.008376359939575
Epoch: 1, Loss:  1.1268078088760376
Epoch: 1, Loss:  1.7109566926956177
Epoch: 1, Loss:  1.443698763847351
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe


  next_indices = next_tokens // vocab_size


Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Output Files generated for review

output:
this data points, make a cluster 1 and in the same way, make a cluster 3 and close to 40 K. So, the initial Center of five classes were here, but we have the new, assigns data point like this wall, like this. This means clustering but I'm sure this one is quite busy. The next small step of iteration is change the cluster Center to the average of its assigned, right point.


Almost as abruptly as she had vanished, Chinese tennis star Peng Shuai reappeared in public view over the weekend.

Since Friday evening, a steady stream of photos and videos purporting to show a smiling Peng going about her life in Beijing have surfaced on Twitter -- all posted by individuals working for Chinese government-controlled media and the state sport system, on a platform blocked in China.
The apparent propaganda push was followed Sunday by a video call between Peng and International Olympic Committee (IOC) president Thomas Bach, during which the three-time Olympian insisted she is "safe and well, living at her home in Beijing" and "would like to have her privacy respected," according to a statement from the IOC.
The flurry of "proof of life" videos came amid a firestorm of global concern for Peng, who disappeared from the public eye for more than two weeks after taking to social media to accuse former Vice Premier Zhang Gaoli of coercing her into sex at his home -- an explosive and politically sensitive allegation that triggered blanket censorship in China.
While Peng's public reappearances may allay some of the worst fears about her immediate safety and well-being, they have failed to quell broader concerns about her freedoms and growing calls for a full investigation into her sexual assault allegations.
"It was good to see Peng Shuai in recent videos, but they don't alleviate or address the WTA's concern about her well-being and ability to communicate without censorship or coercion," a spokesperson for the Women's Tennis Association (WTA) told CNN in a statement, following Peng's call with the IOC.

Human rights advocates who have long followed Beijing's silencing campaigns are also unconvinced.
"What we have here is essentially a state-controlled narrative: only the government and its affiliated media are generating and distributing the content about Peng's story," said Maya Wang, senior China researcher at Human Rights Watch (HRW).
"While it is possible that Peng is well, the history of the Chinese government disappearing people and then making videos of them to prove that they are unharmed when it is, in fact, the opposite, should make us worried about Peng's safety," she added.
The video clips appear to be specifically -- yet crudely -- crafted to show that Peng is "free" and living a "normal" life.
In footage released on Saturday, Peng was seen out to dinner with several people state media journalists have described as "her coach and friends." The clips made repeated, deliberate references to the dates, while Peng kept nodding to the man speaking next to her, not saying anything.
None of the videos made even the vaguest mention of Peng's sexual assault allegations against Zhang. Instead, they focused on her smiles and apparent good-spirits -- which state media propagandists were eager to highlight.

"Can any girl fake such sunny smile under pressure?" asked Hu Xijin, the editor-in-chief of state-run tabloid the Global Times, in a tweet Sunday, accompanying a clip of a smiley Peng signing larged-sized tennis balls for children at a junior tennis match in Beijing.
"Those who suspect Peng Shuai is under duress, how dark they must be inside. There must be many, many forced political performances in their countries," Hu wrote on Twitter.
The Global Times, like other government-controlled media outlets in China, has made no reference to Peng's apparent disappearance, nor her allegations against Zhang. Hu has also been careful on Twitter not to mention the reason why Peng is in the spotlight, referring to it only obliquely as "the thing people talked about."
Chinese authorities have not acknowledged Peng's allegations against Zhang, and there is no indication an investigation is underway. It remains unclear if Peng has reported her allegations to the police.
Speaking at a news conference Monday, Chinese Foreign Ministry spokesperson Zhao Lijian reiterated that Peng's accusation is not a diplomatic issue and declined to comment further.
Zhang has kept a low profile and faded from public life since his retirement in 2018, and there is no public information relating to his current whereabouts.
Skepticism as to Peng's well-being especially runs high among Chinese activists who have observed from a close range how the government has silenced and coerced their peers.
"The reality is, they have huge control over Peng Shuai -- to the extent that it's enough to make her cooperate and become an actor," alleged Lv Pin, a prominent Chinese feminist now based in New York.
"This has happened in plenty of cases in the past. Many 'criminals' who were forced to confess on television had to make their performances look real," she said, referring to a series of forced confessions aired on state television, such as from Chinese human rights lawyers and the Hong Kong booksellers.

Chinese authorities have so far elected not to place Peng on state television, perhaps aware that her presence -- even only on its English-language platforms -- would run counter to ongoing efforts to censor all discussions around her original allegations, and thus generate more questions within China than answers.
Instead, Peng appeared in a 30-minute video call with IOC officials, accompanied by -- and under the close watch of -- a Chinese sports official who formerly served as the Communist Party secretary of the Tennis Administration Center of the General Administration of Sport of China.

The interview has not been reported by Chinese state media. But on its website, the IOC posted a statement and a photo of the call. It didn't release the full video, nor explain the circumstances surrounding the virtual meeting, including how it was arranged.
And it appears that IOC officials have walked away from the meeting -- at least publicly -- concluding that Peng is OK.
"I was relieved to see that Peng Shuai was doing fine, which was our main concern," said Chair of the IOC Athletes' Commission Emma Terho, who joined the video call along with Li Lingwei, the Chinese sports official.
By drawing to quick conclusions about Peng's current state and avoiding any mention of her sexual assault allegations that ignited the whole controversy, analysts say the IOC is putting its own credibility on the line -- and potentially risks becoming complicit in Beijing's propaganda push.
"The IOC call hardly alleviates our concerns for Peng's well-being or safety," said Wang from the HRW.
"In fact, it begs the question of why the IOC appears to be participating in what is essentially a state-controlled narrative, as only the government and its affiliated media have been allowed to tell Peng's story."

So you have to run k-means algorithm again, and again, then until you find the optimal result, why we it requires initial means, and it matters what you pick for initial means, for example, this centers are initialized like this, these data points.
Okay, and actually, using another similar to measure means the these 2D input space.
So you can see this input image is broken in two places and each class form or process, actually, similar reasons.
But if you don't use equally their distance, instead, you use another way to define a similarity between vectors, you will have different clustering result.
So it forms a cluster 1 and this Center is close to these oranges, of course, and these data points, make a cluster 2 and in the same way, we make a cluster 3 and close to 40 K. So, this green point is more close to this data point rather than this data point.
But let's say you have different initial means.
And then next small step of iteration is change the cluster Center to the average of its assigned, right point.
So here is a picture example, each Square corresponds to each object, or data sample, or data point, and you can see the this yellow group are grouped together since they are more similar to each other.
Well, you can take another way if we just faced by these data points with two classes.
So again, like, in the previous example, If you have this kind of result on this result will not change order.
We'll have a better result if this cluster one is formed here because zero is from the here.
And the third limitation is the k-means cannot be able to properly cluster in some cases like this.
If our algorithm will not stop on this local minimum, you can change this on purpose, enter into here and change this blue blue circle to here.
So the initial Center of five classes were here, but we have the new, assigns data point like this wall, like this.
This K means clustering but I'm sure this one is quite busy.
So we are just going to cross through these data points into K classes and then we iterate, these small steps later.
We have only read the data and the final one was refers to learning, but I will talk about it later on in this course.
Is the task of grouping, of several of object, or we can say, or set of data samples data points, in a way that the same group are more similar to each other.
We will have different clustering results like this and in the previous slide, we said the similar data points are grouped together, but these similar is not a scientific term.
And again, let's see the image segmentation result using K means clustering.

