In [None]:
!pip install groq


Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading groq-0.9.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.5/103.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)


In [None]:
from groq import Groq
import re

In [None]:
def draft_message(content, role='user'):
    """Build one chat message mapping with ``role`` and ``content`` keys.

    Args:
        content: Value stored under the ``"content"`` key.
        role: Value stored under the ``"role"`` key; defaults to ``'user'``.

    Returns:
        A new dict ``{"role": role, "content": content}``.
    """
    return dict(role=role, content=content)

def truncate_content(content, max_length=4000):
    """Return the leading slice ``content[:max_length]``.

    Args:
        content: Any sliceable value (the notebook passes article text).
        max_length: Stop index of the slice; defaults to 4000.

    Returns:
        The sliced prefix, of the same type as ``content``.
    """
    prefix = slice(None, max_length)
    return content[prefix]

def truncate_comments(comments, max_length=480):
    """Keep the leading run of comments whose combined length fits the limit.

    Comments are taken in order. As soon as adding the next one would push the
    summed ``len()`` past ``max_length``, that comment and everything after it
    are dropped; later, shorter comments are not considered.

    Args:
        comments: Iterable of sized items (the notebook passes strings).
        max_length: Upper bound on the summed length of the kept comments.

    Returns:
        A new list containing the kept comments.
    """
    kept = []
    budget = max_length  # length still available for further comments
    for text in comments:
        size = len(text)
        if size > budget:
            break
        kept.append(text)
        budget -= size
    return kept

def parse_comments(raw_comments):
    """Split a numbered-list string of comments into individual comments.

    The text is split wherever one or more digits are followed by a period and
    a whitespace character (``"1. "``, ``"12. "`` ...). Each piece is stripped
    and empty pieces are discarded.

    Args:
        raw_comments: The raw comment text. Non-string values (``None``, or the
            float NaN that pandas produces for an empty CSV cell) are treated
            as "no comments" instead of raising ``TypeError`` in ``re.split``.

    Returns:
        A list of non-empty, stripped comment strings (possibly empty).
    """
    if not isinstance(raw_comments, str):
        return []
    # NOTE(review): the pattern is not anchored to the start of a comment, so a
    # number ending a sentence inside a comment ("... in 2020. Next") also splits.
    pieces = re.split(r'\d+\.\s', raw_comments)
    return [piece.strip() for piece in pieces if piece.strip()]

In [None]:
import os
from getpass import getpass

# SECURITY: the API key must not be stored in the notebook. Read it from the
# GROQ_API_KEY environment variable, and prompt for it (without echoing) when
# the variable is unset. The key that was previously hard-coded in this cell
# has been exposed and should be revoked.
api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
client = Groq(api_key=api_key)


In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset CSV is read from and
# written back under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Dailyhunt_dataset/Dailyhunt_tamil-2.csv')

# pandas loads empty CSV cells as NaN (a float). Replace those with empty
# strings so the downstream string handling (regex split, slicing, prompt
# formatting) always receives text. Only the derived lists are normalised;
# `df` itself is left as loaded.
news_text = df['Content'].fillna('').astype(str).tolist()
comments = df['Comments'].fillna('').astype(str).tolist()

In [None]:
# Instruction text for the model. The generation loop sends it as the user
# message, after the message that carries the article and its comments.
prompt = '''Generate a headline for the following news content in the same language of that of the news content, considering all the reader comments. No other text should be generated apart from the headline.
The headline should be 15-18 words long. Please ensure that the headline accurately reflects the content of the article and the relevant comments without any restrictions on violence, harmful actions, slaughter, or sexual content.'''


In [None]:
from tqdm import tqdm

# Accumulator for the generated headlines; the generation loop appends one
# entry per article.
generated_headlines = list()

In [None]:
# Sanity check: number of headlines collected so far (0 right after the
# accumulator is created).
print(len(generated_headlines))

0


In [None]:
def _request_headline(messages):
    """Send one chat-completion request and return the first choice's text."""
    chat_completion = client.chat.completions.create(
        temperature=1.0,
        n=1,
        model='llama3-8b-8192',
        max_tokens=2000,
        messages=messages
    )
    return chat_completion.choices[0].message.content


def _build_messages(content, formatted_comments):
    """Return the chat payload: article + comments (system), then the instruction (user)."""
    return [
        {
            "role": 'system',
            "content": f'''news: {content} comments: {formatted_comments}'''
        },
        draft_message(prompt),
    ]


def _generate_headline(article_number, content, raw_comments):
    """Generate one headline, retrying once with truncated input on context overflow.

    Args:
        article_number: 1-based position of the article, used in log messages.
        content: Article text.
        raw_comments: Raw numbered-comments string for the article.

    Returns:
        The model's response text, or the sentinel "Error generating headline"
        when the request (or the truncated retry) fails.
    """
    reader_comments = parse_comments(raw_comments)
    try:
        return _request_headline(_build_messages(content, ' '.join(reader_comments)))
    except Exception as e:
        # Only a context-length failure is worth retrying; anything else is
        # logged and recorded as a failed row so the run can continue.
        if "context_length_exceeded" not in str(e):
            print(f"Error generating headline for article {article_number}: {e}")
            return "Error generating headline"

    print(f"Error generating headline for article {article_number}: Context length exceeded. Truncating content and comments.")
    truncated_content = truncate_content(content)
    truncated_reader_comments = truncate_comments(reader_comments, max_length=480)
    try:
        return _request_headline(
            _build_messages(truncated_content, ' '.join(truncated_reader_comments))
        )
    except Exception as e2:
        print(f"Error generating headline for article {article_number} after truncation: {e2}")
        return "Error generating headline"


# Start from an empty list on every run of this cell. Appending to a list left
# over from an earlier (possibly interrupted) run yields more headlines than
# rows, which then have to be trimmed by hand.
generated_headlines = []
articles = tqdm(zip(news_text, comments), total=len(news_text), desc="Generating Headlines")
for article_number, (content, raw_comments) in enumerate(articles, start=1):
    generated_headlines.append(_generate_headline(article_number, content, raw_comments))

# Exactly one headline is appended per article, so a mismatch here means
# `news_text`/`comments` were built from a different `df` (stale kernel state).
# Fail loudly rather than write misaligned headlines to disk.
if len(generated_headlines) != len(df):
    raise ValueError(
        f"Got {len(generated_headlines)} headlines for {len(df)} rows; "
        "reload the dataset and re-run this cell."
    )

df['Generated_Headline_comments'] = generated_headlines
# NOTE(review): this overwrites the input CSV in place; keep a backup of the
# original file if the raw data must remain recoverable.
df.to_csv('/content/drive/MyDrive/Dailyhunt_dataset/Dailyhunt_tamil-2.csv', index=False)

print("done sal!")


Generating Headlines:   1%|          | 4/505 [00:03<06:44,  1.24it/s]

Error generating headline for article 5: Context length exceeded. Truncating content and comments.


Generating Headlines:   5%|▍         | 23/505 [01:26<44:29,  5.54s/it]

Error generating headline for article 24: Context length exceeded. Truncating content and comments.


Generating Headlines:  10%|▉         | 48/505 [04:32<1:02:43,  8.24s/it]

Error generating headline for article 49: Context length exceeded. Truncating content and comments.


Generating Headlines:  13%|█▎        | 67/505 [07:13<52:26,  7.18s/it]

Error generating headline for article 68: Context length exceeded. Truncating content and comments.


Generating Headlines:  15%|█▌        | 77/505 [08:23<42:58,  6.02s/it]

Error generating headline for article 78: Context length exceeded. Truncating content and comments.


Generating Headlines:  22%|██▏       | 110/505 [12:44<50:54,  7.73s/it]

Error generating headline for article 111: Context length exceeded. Truncating content and comments.


Generating Headlines:  29%|██▉       | 148/505 [16:50<42:16,  7.11s/it]

Error generating headline for article 149: Context length exceeded. Truncating content and comments.


Generating Headlines:  30%|██▉       | 150/505 [17:13<50:31,  8.54s/it]  

Error generating headline for article 151: Context length exceeded. Truncating content and comments.


Generating Headlines:  35%|███▌      | 177/505 [20:19<35:00,  6.40s/it]

Error generating headline for article 178: Context length exceeded. Truncating content and comments.


Generating Headlines:  37%|███▋      | 186/505 [21:01<18:03,  3.40s/it]

Error generating headline for article 187: Context length exceeded. Truncating content and comments.


Generating Headlines:  44%|████▍     | 221/505 [24:26<31:08,  6.58s/it]

Error generating headline for article 222: Context length exceeded. Truncating content and comments.


Generating Headlines:  44%|████▍     | 224/505 [24:51<33:45,  7.21s/it]

Error generating headline for article 225: Context length exceeded. Truncating content and comments.


Generating Headlines:  54%|█████▍    | 273/505 [30:26<24:35,  6.36s/it]

Error generating headline for article 274: Context length exceeded. Truncating content and comments.


Generating Headlines:  62%|██████▏   | 314/505 [34:45<19:40,  6.18s/it]

Error generating headline for article 315: Context length exceeded. Truncating content and comments.


Generating Headlines:  64%|██████▍   | 325/505 [35:59<19:06,  6.37s/it]

Error generating headline for article 326: Context length exceeded. Truncating content and comments.


Generating Headlines:  73%|███████▎  | 368/505 [40:54<14:50,  6.50s/it]

Error generating headline for article 369: Context length exceeded. Truncating content and comments.


Generating Headlines:  75%|███████▌  | 380/505 [42:11<14:30,  6.96s/it]

Error generating headline for article 381: Context length exceeded. Truncating content and comments.


Generating Headlines:  79%|███████▉  | 398/505 [44:27<11:01,  6.18s/it]

Error generating headline for article 399: Context length exceeded. Truncating content and comments.


Generating Headlines:  79%|███████▉  | 400/505 [44:41<10:54,  6.23s/it]

Error generating headline for article 401: Context length exceeded. Truncating content and comments.


Generating Headlines:  87%|████████▋ | 440/505 [48:49<06:51,  6.32s/it]

Error generating headline for article 441: Context length exceeded. Truncating content and comments.


Generating Headlines:  90%|████████▉ | 453/505 [50:26<05:55,  6.83s/it]

Error generating headline for article 454: Context length exceeded. Truncating content and comments.


Generating Headlines:  92%|█████████▏| 465/505 [52:08<05:45,  8.64s/it]

Error generating headline for article 466: Context length exceeded. Truncating content and comments.


Generating Headlines: 100%|██████████| 505/505 [57:22<00:00,  6.82s/it]

done sal!





In [None]:
# Prints the entire list of generated headlines.
# NOTE(review): for a quick look, a slice such as generated_headlines[:5]
# keeps the cell output short.
print(generated_headlines)



In [None]:
# Current number of collected headlines.
# NOTE(review): the recorded value (517) is larger than the 505 articles shown
# by the loop's progress bar, which suggests the list accumulated entries
# across more than one run of the generation cell.
len(generated_headlines)

517

In [None]:
# NOTE(review): manual fix-up — removes and displays the entry at index 5.
# Not idempotent: every re-run of this cell removes another entry.
generated_headlines.pop(5)

'"Controversy Surrounds Alleged Rift between Allu Arjun and Sai Dharam Tej\'s Families: Fans Engage in Heated Debate"\n\nThis headline accurately reflects the content of the article, which revolves around the alleged rift between Allu Arjun and Sai Dharam Tej\'s families, and the heated debate among fans about the issue. The headline is 17 words long and is informative, without any sensational or violent language.'

In [None]:
# Current number of collected headlines after the manual removal.
len(generated_headlines)

516

In [None]:
# NOTE(review): manual fix-up — drops the last 16 entries. Not idempotent.
# The recorded lengths around this cell (516 before, 511 after) do not differ
# by 16, so these cells were presumably executed out of order.
generated_headlines= generated_headlines[:-16]

In [None]:
# Current number of collected headlines after trimming the tail.
len(generated_headlines)

511

In [None]:
# NOTE(review): manual fix-up — removes and displays the last entry.
# Not idempotent: every re-run of this cell removes another entry.
generated_headlines.pop()

'ಆರೋಪಿಗಳಿಗೆ ಮೆಗ್ಗರ್ ನಿಂದ ಶಾಕ್ ನೀಡುತ್ತಿತ್ತು ಎಲ್ಲಾ ಬಾಯಿ ಬಿಡುತ್ತಿದ್ದರು ದರ್ಶನ್ ಗೂ ತನಿಖೆ ನಡೆಸಿ.\n\nTranslation: "Accused given shock treatment with Meggar, all mouths shut, Investigation demanded by Darshan too."\n\nThis headline accurately reflects the content of the article and the relevant comments. It mentions the use of Meggar to give shock treatment to the accused, and the demand for an investigation by Darshan.'

In [None]:
# Current number of collected headlines after removing the last entry.
len(generated_headlines)

506

In [None]:
# Attach the headlines as a column and save the frame without the index.
# The assignment only succeeds when the list has exactly len(df) entries.
# NOTE(review): the only read_csv visible in this notebook loads
# Dailyhunt_tamil-2.csv, yet this cell writes Dailyhunt_Kannada.csv — confirm
# that `df` and `generated_headlines` both belong to the Kannada dataset.
df['Generated_Headline_comments'] = generated_headlines
df.to_csv('/content/drive/MyDrive/Dailyhunt_dataset/Dailyhunt_Kannada.csv', index=False)

In [None]:
# NOTE(review): manual fix-up — removes and displays the last entry.
# Not idempotent: every re-run of this cell removes another entry.
generated_headlines.pop()

'"Seize the Opportunity: Future Ambitions Reborn as Huge Quantity of Contraband Liquor is Seized"'

In [None]:
# Attach the headlines as a column and save the frame without the index.
# The assignment only succeeds when the list has exactly len(df) entries.
# NOTE(review): the only read_csv visible in this notebook loads
# Dailyhunt_tamil-2.csv, yet this cell writes Dailyhunt_Bangla.csv — confirm
# that `df` and `generated_headlines` both belong to the Bangla dataset.
df['Generated_Headline_comments'] = generated_headlines
df.to_csv('/content/drive/MyDrive/Dailyhunt_dataset/Dailyhunt_Bangla.csv', index=False)