In [1]:
from datasets import load_dataset

ds = load_dataset("knkarthick/dialogsum")

README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [2]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
ds['train'][1]['dialogue']

"#Person1#: Hello Mrs. Parker, how have you been?\n#Person2#: Hello Dr. Peters. Just fine thank you. Ricky and I are here for his vaccines.\n#Person1#: Very well. Let's see, according to his vaccination record, Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due for Hepatitis A, Chickenpox and Measles shots.\n#Person2#: What about Rubella and Mumps?\n#Person1#: Well, I can only give him these for now, and after a couple of weeks I can administer the rest.\n#Person2#: OK, great. Doctor, I think I also may need a Tetanus booster. Last time I got it was maybe fifteen years ago!\n#Person1#: We will check our records and I'll have the nurse administer and the booster as well. Now, please hold Ricky's arm tight, this may sting a little."

In [4]:
ds['train'][1]['summary']

'Mrs Parker takes Ricky for his vaccines. Dr. Peters checks the record and then gives Ricky a vaccine.'

# WITHOUT FINE TUNING

In [5]:
# Use a pipeline as a high-level helper
from transformers import pipeline

text_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
article_1 = ds['train'][1]['dialogue']

text_summarizer(article_1, max_length = 30, min_length = 10, do_sample = False)

[{'summary_text': 'Ricky has received his Polio, Tetanus and Hepatitis B shots. He is 14 months old, so he is due'}]

# FINE - TUNING MODEL

In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Trainer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [8]:
# #tokenization

# def preprocess_function(batch):
#     source = batch['dialogue']
#     target = batch["summary"]
#     source_ids = tokenizer(source, truncation=True, padding="max_length", max_length=128)
#     target_ids = tokenizer(target, truncation=True, padding="max_length", max_length=128)

#     # Replace pad token id with -100 for labels to ignore padding in loss computation
#     labels = target_ids["input_ids"]
#     labels = [[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels]

#     return {
#         "input_ids": source_ids["input_ids"],
#         "attention_mask": source_ids["attention_mask"],
#         "labels": labels
#     }

# ds_source = ds.map(preprocess_function, batched=True)

In [9]:
#DEFINE TRAING ARGUMENTS
training_args = TrainingArguments(
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    fp16=True,
    output_dir = '/kaggle/working/',
    # per_device_train_batch_size = 8,
    num_train_epochs = 2,
    remove_unused_columns = False
)

In [10]:
# Tokenize the dataset and add labels
def preprocess_function(examples):
    inputs = tokenizer(examples['dialogue'], truncation=True, padding='max_length', max_length=512)
    inputs["labels"] = tokenizer(examples['summary'], truncation=True, padding='max_length', max_length=512)["input_ids"]
    return inputs

# Apply the preprocessing function
tokenized_ds = ds.map(preprocess_function, batched=True)

# Create the Trainer object again with the tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"]
)

# Try training again
trainer.train()

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


UsageError: api_key not configured (no-tty). call wandb.login(key=[your_api_key])

In [None]:
# print(trainer._signature_columns)

In [None]:
# #CREATE TRAINER OBJECT
# trainer = Trainer(
#     model = model,
#     args = training_args,
#     train_dataset = ds["train"],
#     eval_dataset = ds["test"]
# )

In [None]:
# print(ds["train"].column_names)
# print(ds["test"].column_names)

In [None]:
# trainer._signature_columns = ["dialogue", "summary"] 

In [None]:
# trainer.train()

In [None]:
#EVALUATE THE MODEL
eval_results = trainer.evaluate()
print(eval_results)

# SAVING THE MODEL

In [None]:
model.save_pretrained("/kaggle/working/runs")
tokenizer.save_pretrained("/kaggle/working/runs")

# SUMMARIZINFG THE CUSTOM DATA USING SAVED MODEL 

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

#LOAD THE TRAINED MODEL AND TOKENIZER
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/runs")
model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/runs")

#FUNCTION TO SUMMARIZE A BLOG POST
def summarize(blog_post):
    #TOKEINZE THE INPUT BLOG POST
    inputs = tokenizer(blog_post, max_length = 1024, truncation = True, return_tensors = "pt")
    
    #GENERATE THE SUMMARY
    summary_ids = model.generate(inputs["input_ids"], max_length = 150, min_length = 40, length_penalty = 2.0, num_beams = 4, early_stopping = True)
    
    #DECODE THE SUMMARY
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens = True)
    return summary

# EXAMPLE BLOG POST

In [None]:
blog_post = """
1) Brief Introduction

Hi Everyone! I am Himadri Pandya, a 4th-year undergraduate student of the Department of Mechanical Engineering. I am a boarder of Sarojini Naidu/Indira Gandhi Hall of residence. I sat for the CDC Internship drive last year and have joined Goldman Sachs as a Summer Analyst for the summer of 2023.

2) How did you get into GS? What was the selection procedure?

It’s fair to say that Goldman Sachs was my highest preferred company throughout my CDC preparation, after the shortlists and during the interviews. I focused my preparation around GS all throughout the summers and started solely preparing for the interviews once I was certain that I had performed well in the test and was expecting to get shortlisted for the interviews.

The selection procedure consisted of:

a. Test (for shortlisting): The test had 4 sections which were timed separately:

b. Coding Section: This section had two medium to hard coding questions to be attempted in C++

c. Probability, Statistics and Puzzles: This section had 25 questions on probability and statistics including a few easy puzzles

d. Essay Question: This section was for language fluency in which we had to write a short paragraph on a given topic

e. Computer Architecture: This section had around 20 questions on core CS topics including operating systems and algorithms

f. Personal Interviews: I had two rounds of technical interviews (some people had three rounds) on Zoom where I was asked logic-based puzzles and questions on probability and statistics. They asked me the logic I would use to implement certain coding questions but I wasn’t asked to explicitly code anything. It started off with easy questions and progressed towards more difficult questions. I was called to a meet after these two rounds to confirm the offer.

3) How to prepare for them?

I started off with CP after the end of my 4th semester with only some background from PDS and PDS Lab courses. I started off with basic concepts and STL and slowly progressed with other topics.
There are many resources available to study all of these topics and one can conveniently refer to gfg for theory and practice questions on InterviewBit. I’d say a decent grasp of DSA is essential to attempt the questions in the coding test. You can directly start attempting questions from different buckets on InterviewBit and learn different concepts as you encounter them.
I also picked up the Algorithmic Toolbox course on Coursera. This course really helped me strengthen a few core concepts like Backtracking and DP along with practice questions to give a complete understanding of them since these are generally difficult concepts to pick up. I didn’t participate in any coding contests during my preparation but it can be a good way to practice attempting questions in a test-like environment and develop logical thinking given a limited time.
As for the quant section, I practiced solving questions on Probability and puzzles from Brainstellar, Heard on the Street, and Fifty Challenging Problems in Probability, starting from the end of May. This helped me prepare my Quant section well. I solved all the puzzles on InterviewBit and a few puzzles on gfg. Understanding the process of approaching the questions from the given resources should give you enough hold on the concepts to think of an approach if you face a new question in the test or interviews. TED-Ed’s riddles videos are also a great and fun resource for these questions.
I did not do any separate preparation for the section on computer architecture since it was frankly an open-ended section for someone with a non-CS major and mostly attempted the questions based on intuition. You can prepare by taking some course materials from your CS friends but I don’t have any list of important topics or resources to share.
There is no preparation required for the essay section as it only evaluates your fluency and can be attempted within 5-10 minutes.
As for the interview, I’ll always encourage you to be ready with a good introduction that’s brief and gives an overview of your work, involvements and interests (a few words on why you want to apply for the role can be added). Your introduction is a very important component of the first impression you’ll create on the interviewer and as for most cases, first impressions go a long way. It is essential to communicate with your interviewer throughout the interview. Tell them what you’re thinking, your approach and your thought process since they’re interested more in your analysis and approach than the final answer. Communicating will also help you reach a solution when you’re stuck somewhere since interviewers hint you towards the solution if they think your approach is incorrect. Be alert, be confident and most importantly be yourself because that’s when you’re your best possible version. As for the questions, your preparation for the test will be sufficient to guide you through the technical interview.
4) What difficulties did you face while preparing for this Company/ Profile? How did you overcome this problem?

The difficulties that I faced were primarily centered around the time that I was able to give to the preparation because of my extra-curricular involvements in KGP and the fact that I started my preparation a bit later than others. You shouldn’t face any of these problems for this particular role if you’ve been consistent and disciplined throughout your preparation

Other than that, I’d suggest you contact seniors who have prepared for similar roles that you are interested in and get insights from them. Being connected will give you a more channeled approach and will also be a constant mental support when you need it.

5) According to you, who should ideally apply for this job?

The selection procedure revolves around problem solving abilities so folks with a natural knack for coding and logic-based puzzles have a slight edge in the entire process. However, one can conveniently prepare for all the sections by planning out their preparation well and following the deadlines they set for themselves. The internship majorly involves coding so if you’re someone who hates programming then this possibly isn’t the best option out there.

Regardless, it’s a sought-after day 1 role and a great option for those opting for tech profiles.

6) Any specific advice you want to give to the junta sitting for internships this year.

In general for tech roles, coding is very important and should ideally take up at least 80% of your prep time. It is very important to plan how much time you will be devoting to each question during the tests. Try to gauge if you will be able to solve the question immediately after reading it or if you’ll have to go for a partial solution and thus a partial score.

Your performance in the test is an essential data point both before and during your interview and has a role on how they perceive you throughout given the limited information they are given. So it is preferable to try to perform exceedingly well in the tests to have a good pre-impression on the interviewer. In fact, it is practically improbable to interview for more than two companies on a given day so your focus should not be to get multiple shortlists but to get preferable shortlists in the companies you’re genuinely interested in. It is better to perform “very well” in a few tests and disastrously in the others and getting 2 shortlists than performing “decently” in multiple companies and getting multiple inconvertible shortlists.

As for the interviews, be very clear about your preferences in case you have multiple shortlists. By the end of the tests you should be able to get a fair idea of whether or not you have a chance of getting shortlisted for a given company. Once you’re ready with your preferences your sole focus should be to prepare for interviews and contact as many seniors as possible to understand what type of questions are asked and be certain about evaluations metrics for various companies. For example, I was shortlisted to interview for LEK but was fairly certain about getting a shortlist at GS and since I was clear about my preferences I devoted all my time to prepare for GS while skipping any and all case prep.

Other than that, once you are certain about a shortlist in a few of your highly preferred companies you can consider skipping a few tests to prepare for the interviews of your preferred companies. It can be somewhat categorized as a gamble but if done correctly it can give you a very sharp edge during interview preparation since you’ll have to draw a line for preferences at some point anyway. Better sooner, than later.

7) What are some of the major points you think would be valid to mention in your CV while targeting this profile?(any specific suggestions you would like to make?)

I don’t think CV holds a high weightage for this profile since I wasn’t questioned on my past experiences and projects in any of the interview rounds except for a brief discussion of my PoR and FT project in the second round after my introduction. The focus should be on performing well in the tests and being active during the interview. However, here’s my two cents on CV building in general:
Structure your CV depending upon the profile you’re targeting. For example, a tech CV should focus more on relevant projects and internships rather than extracurriculars and exceptional achievements
Try to summarize every heading in two to three bullet points focussing more on results than methods. For example, if you’re mentioning an ML project, focus more on the accuracy you achieved and the amount of data that you used to train your model than the model itself
Format your CV well and try to stay consistent with the number of points you mention under each heading
Try to customize your CV for different profiles (say, Data, SDE and Quant) and leverage the fact that ERP allows you to submit 3 CVs
Of course, don’t forget to get it reviewed by seniors to make the best out of your work.
"""

#GET THE SUMMARY
summary = summarize(blog_post)
print("Summary:", summary)