<a href="https://colab.research.google.com/github/AkshataKurane/Shark-Tank/blob/main/MultipleAudioFiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/openai/whisper.git
!pip install ffmpeg
!pip install pydub
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
!pip install -qq ipython==7.34.0
!pip install SpeechRecognition
!pip install pocketsphinx

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-po2e5mij
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-po2e5mij
  Resolved https://github.com/openai/whisper.git to commit 25639fc17ddc013d56c594bfbf7644f2185fad84
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20240930)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [4]:
from pyannote.audio import Pipeline
from pydub import AudioSegment
import pandas as pd
import whisper
import tempfile
import os

def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    audio = AudioSegment.from_mp3(mp3_file_path)
    audio.export(wav_file_path, format="wav")

def rttm_to_dataframe(rttm_file_path):
    columns = ["Type", "File ID", "Channel", "Start Time", "Duration", "Orthography", "Confidence", "Speaker", 'x', 'y']
    with open(rttm_file_path, 'r') as rttm_file:
        lines = rttm_file.readlines()
        data = [line.strip().split() for line in lines]
    df = pd.DataFrame(data, columns=columns)
    df = df.drop(["Type", "File ID", "Channel", "Orthography", "Confidence", 'x', 'y'], axis=1)
    return df

def extract_text_from_audio_segment(audio_segment):
    model = whisper.load_model("base")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_path = temp_file.name
        audio_segment.export(temp_path, format="wav")
        result = model.transcribe(temp_path)
        os.remove(temp_path)
    return result['text']

def get_audio_segment(audio_file_path, start_time, end_time):
    audio = AudioSegment.from_wav(audio_file_path)
    start_ms = int(start_time * 1000)
    end_ms = int(end_time * 1000)
    return audio[start_ms:end_ms]

def process_audio_file(mp3_file_path):
    # Convert MP3 to WAV
    wav_file_path = mp3_file_path.replace('.mp3', '.wav')
    convert_mp3_to_wav(mp3_file_path, wav_file_path)

    # Speaker diarization
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token="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")
    diarization = pipeline(wav_file_path, num_speakers=2)

    rttm_file_path = wav_file_path.replace('.wav', '.rttm')
    with open(rttm_file_path, "w") as rttm:
        diarization.write_rttm(rttm)

    # Process RTTM to DataFrame
    df = rttm_to_dataframe(rttm_file_path)
    df = df.astype({'Start Time': 'float', 'Duration': 'float'})
    df['Utterance'] = None
    df['End Time'] = df['Start Time'] + df['Duration']

    # Transcribe audio segments
    for ind in df.index:
        start_time = df.loc[ind, 'Start Time']
        end_time = df.loc[ind, 'End Time']
        try:
            audio_segment = get_audio_segment(wav_file_path, start_time, end_time)
            transcription = extract_text_from_audio_segment(audio_segment)
            df.loc[ind, 'Utterance'] = transcription
        except Exception as e:
            print(f"Error processing index {ind}: {e}")
            df.loc[ind, 'Utterance'] = 'Error'
    print(df)

    # Save DataFrame to CSV
    global output_csv_path
    output_csv_path = mp3_file_path.replace('.mp3', '.csv')
    df.to_csv(output_csv_path, index=False)
    #print(f"Processed {mp3_file_path}. Results saved to {output_csv_path}")

    file_format = input("Your output is saved in .csv format. Do you want in any other format? (e.g. csv, text, excel) : ")
    if file_format == "text":
      output_csv_path = mp3_file_path.replace('.mp3', '.txt')
      df.to_csv(output_csv_path, index=False)
    elif file_format == "excel":
      output_csv_path = mp3_file_path.replace('.mp3', '.xlsx')
      df.to_excel(output_csv_path, index=False)
    else:
      print("\nInvalid file format")
    print(f"\nProcessed {mp3_file_path}. Results saved to {output_csv_path}")


# Define the path to the folder containing your .mp3 files
folder_path = '/content/SharkTank'

# List all .mp3 files in the folder
mp3_files = [f for f in os.listdir(folder_path) if f.endswith('.mp3')]
mp3_file_paths = [os.path.join(folder_path, f) for f in mp3_files]

for index, mp3_file_path in enumerate(mp3_file_paths):
    process_audio_file(mp3_file_path)
    print(f"{mp3_file_path} is processed successfully!!!!")




config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)
100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 99.1MiB/s]
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)


   Start Time  Duration     Speaker  \
0       0.031    13.298  SPEAKER_01   
1      13.328     3.324  SPEAKER_00   
2      16.653     3.679  SPEAKER_01   
3      19.471     8.387  SPEAKER_00   
4      28.482    63.990  SPEAKER_00   

                                           Utterance  End Time  
0   One thing which I've taken away for the wealt...    13.329  
1   People are running out of money because of ba...    16.652  
2   No, they're in fact, they will live so long. ...    20.332  
3   What do you think Adani and Mukesham money do...    27.858  
4   Now there's countless content pieces on YouTu...    92.472  
Your output is saved in .csv format. Do you want in any other format? (e.g. csv, text, excel) : text

Processed /content/SharkTank/videoplaybackshort.mp3. Results saved to /content/SharkTank/videoplaybackshort.txt
/content/SharkTank/videoplaybackshort.mp3 is processed successfully!!!!


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.


  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)


   Start Time  Duration     Speaker  \
0       0.031    29.059  SPEAKER_00   
1      29.090    15.981  SPEAKER_01   
2      33.882     0.203  SPEAKER_00   
3      40.565     0.641  SPEAKER_00   
4      45.070    23.912  SPEAKER_00   

                                           Utterance  End Time  
0   There's a large cap, mid cap, small cap, many...    29.090  
1   So let's start with the 50 lakh rupee a month...    45.071  
2                                      world brother    34.085  
3                                               KEEN    41.206  
4   Yeah. So, you know, somebody was 50 lakh rupe...    68.982  
Your output is saved in .csv format. Do you want in any other format? (e.g. csv, text, excel) : excel

Processed /content/SharkTank/videoplaybacknew.mp3. Results saved to /content/SharkTank/videoplaybacknew.xlsx
/content/SharkTank/videoplaybacknew.mp3 is processed successfully!!!!


In [6]:
from pyannote.audio import Pipeline
from pydub import AudioSegment
import pandas as pd
import whisper
import tempfile
import os

def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    audio = AudioSegment.from_mp3(mp3_file_path)
    audio.export(wav_file_path, format="wav")

def rttm_to_dataframe(rttm_file_path):
    columns = ["Type", "File ID", "Channel", "Start Time", "Duration", "Orthography", "Confidence", "Speaker", 'x', 'y']
    with open(rttm_file_path, 'r') as rttm_file:
        lines = rttm_file.readlines()
        data = [line.strip().split() for line in lines]
    df = pd.DataFrame(data, columns=columns)
    df = df.drop(["Type", "File ID", "Channel", "Orthography", "Confidence", 'x', 'y'], axis=1)
    return df

def extract_text_from_audio_segment(audio_segment):
    model = whisper.load_model("base")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_path = temp_file.name
        audio_segment.export(temp_path, format="wav")
        result = model.transcribe(temp_path)
        os.remove(temp_path)
    return result['text']

def get_audio_segment(audio_file_path, start_time, end_time):
    audio = AudioSegment.from_wav(audio_file_path)
    start_ms = int(start_time * 1000)
    end_ms = int(end_time * 1000)
    return audio[start_ms:end_ms]

def process_audio_file(mp3_file_path):
    # Convert MP3 to WAV
    wav_file_path = mp3_file_path.replace('.mp3', '.wav')
    convert_mp3_to_wav(mp3_file_path, wav_file_path)

    # Speaker diarization
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1", use_auth_token="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")
    diarization = pipeline(wav_file_path, num_speakers=2)

    rttm_file_path = wav_file_path.replace('.wav', '.rttm')
    with open(rttm_file_path, "w") as rttm:
        diarization.write_rttm(rttm)

    # Process RTTM to DataFrame
    df = rttm_to_dataframe(rttm_file_path)
    df = df.astype({'Start Time': 'float', 'Duration': 'float'})
    df['Utterance'] = None
    df['End Time'] = df['Start Time'] + df['Duration']

    # Transcribe audio segments
    for ind in df.index:
        start_time = df.loc[ind, 'Start Time']
        end_time = df.loc[ind, 'End Time']
        try:
            audio_segment = get_audio_segment(wav_file_path, start_time, end_time)
            transcription = extract_text_from_audio_segment(audio_segment)
            df.loc[ind, 'Utterance'] = transcription
        except Exception as e:
            print(f"Error processing index {ind}: {e}")
            df.loc[ind, 'Utterance'] = 'Error'
    print(df)

    # Save DataFrame to CSV
    global output_csv_path
    output_csv_path = mp3_file_path.replace('.mp3', '.csv')
    df.to_csv(output_csv_path, index=False)
    #print(f"Processed {mp3_file_path}. Results saved to {output_csv_path}")

    file_format = input("\nYour output is saved in .csv format. Do you want in any other format? (e.g. csv, text, excel) : ")
    if file_format == "text":
      output_csv_path = mp3_file_path.replace('.mp3', '.txt')
      df.to_csv(output_csv_path, index=False)
    elif file_format == "excel":
      output_csv_path = mp3_file_path.replace('.mp3', '.xlsx')
      df.to_excel(output_csv_path, index=False)
    else:
      print("\nInvalid file format")
    print(f"\nProcessed {mp3_file_path}. Results saved to {output_csv_path}\n")


# Define the path to the folder containing your .mp3 files
folder_path = '/content/SharkTank'

# List all .mp3 files in the folder
mp3_files = [f for f in os.listdir(folder_path) if f.endswith('.mp3')]
mp3_file_paths = [os.path.join(folder_path, f) for f in mp3_files]

for index, mp3_file_path in enumerate(mp3_file_paths):
    check = mp3_file_path.replace('.mp3', '.csv')
    if os.path.exists(check):
        print(f"{mp3_file_path} is already processed!!!!")
    else:
      print(f"\nProcessing {mp3_file_path}...")
      process_audio_file(mp3_file_path)
      print(f"{mp3_file_path} is processed successfully!!!!\n\n")





Processing /content/SharkTank/ABPodcast.mp3...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.


  state_dict = torch.load(path, map_location=device)
  stats = torch.load(path, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)
  checkpoint = torch.load(fp, map_location=device)


   Start Time  Duration     Speaker  \
0       0.031     0.506  SPEAKER_00   
1       0.537    24.587  SPEAKER_01   
2      25.867    15.609  SPEAKER_00   
3      40.244     3.291  SPEAKER_01   
4      42.235     0.236  SPEAKER_00   
5      43.535    62.944  SPEAKER_00   

                                           Utterance  End Time  
0                                to serve the clock.     0.537  
1   So if I was sitting there, I guess prior to t...    25.124  
2   Yeah, sure. The target profile would be, firs...    41.476  
3   So you have to be a nice person too, right? Y...    43.535  
4                                                       42.471  
5   Yeah, that too. So I think your client's prof...   106.479  

Your output is saved in .csv format. Do you want in any other format? (e.g. csv, text, excel) : text

Processed /content/SharkTank/ABPodcast.mp3. Results saved to /content/SharkTank/ABPodcast.txt

/content/SharkTank/ABPodcast.mp3 is processed successfully!!!!


/content

# Summary

In [16]:
!pip install reportlab # Install the necessary package

Collecting reportlab
  Downloading reportlab-4.2.4-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.4-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m30.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.4


In [18]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline
import re
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Frame

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
summarization_model = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

def extract_text_from_csv(csv_file_path):
    df = pd.read_csv(csv_file_path)
    text = ""
    for col in df.columns:
        text += " ".join(df[col].astype(str).tolist()) + " "
    return text

def save_summary_to_pdf(summary, pdf_path):
    doc = SimpleDocTemplate(pdf_path, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []
    story.append(Paragraph("Summary of the Content:", styles['Title']))
    story.append(Paragraph(summary, styles['BodyText']))
    doc.build(story)

def summarize_text(text, max_length=150):
    return summarization_model(text, max_length=max_length, min_length=30, do_sample=False)[0]['summary_text']

# Define the path to the folder containing your .mp3 files
folder_path = '/content/SharkTank'

# List all .mp3 files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
csv_file_paths = [os.path.join(folder_path, f) for f in csv_files]
global csv_file_path

for index, csv_file_path in enumerate(csv_file_paths):
    check = csv_file_path.replace('.csv', '.pdf')
    if os.path.exists(check):
        print(f"{csv_file_path} is already processed!!!!\n")
    else:
      print(f"\nSummarzing {csv_file_path}...")
      document_text = extract_text_from_csv(csv_file_path)
      document_sentences = document_text.split('.')
      document_embeddings = embedding_model.encode(document_sentences)
      dimension = document_embeddings.shape[1]
      index = faiss.IndexFlatL2(dimension)
      index.add(np.array(document_embeddings))
      summary = summarize_text(document_text)
      pdf_path = csv_file_path.replace('.csv', '.pdf')
      save_summary_to_pdf(summary, pdf_path)
      print("Summary of the Content:")
      print(summary)
      print(f"{csv_file_path} is summarized successfully!!!!\n\n")




/content/SharkTank/ABPodcast.csv is already processed!!!!

/content/SharkTank/videoplaybackshort.csv is already processed!!!!

/content/SharkTank/videoplaybacknew.csv is already processed!!!!



# Sentiment

In [20]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10


In [44]:
import fitz  # PyMuPDF
from textblob import TextBlob
import nltk
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import os

# Ensure necessary downloads
nltk.download('punkt')

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiments = {"positive": [], "negative": [], "neutral": []}
    for sentence in blob.sentences:
        sentiment = sentence.sentiment.polarity
        if sentiment > 0:
            sentiments["positive"].append((sentence, sentiment))
        elif sentiment < 0:
            sentiments["negative"].append((sentence, sentiment))
        else:
            sentiments["neutral"].append((sentence, sentiment))
    return sentiments

def overall_sentiment(sentiments):
    total_sentences = len(sentiments["positive"]) + len(sentiments["negative"]) + len(sentiments["neutral"])
    overall_score = (
        sum([score for _, score in sentiments["positive"]]) +
        sum([score for _, score in sentiments["negative"]]) +
        sum([score for _, score in sentiments["neutral"]])
    )
    return overall_score / total_sentences if total_sentences != 0 else 0

def save_sentiment_to_pdf(sentiments, overall_score, sentiment_label, output_pdf_path):
    # Create a new PDF or append to existing PDF
    if os.path.exists(output_pdf_path):
        # Append to existing PDF
        doc = fitz.open(output_pdf_path)
        page = doc.new_page()
        page.insert_text((50, 50), f"Overall Sentiment Score: {overall_score}\nSentiment Label: {sentiment_label}")
        page.insert_text((50, 100), "Positive Sentences:")
        for sentence, score in sentiments["positive"]:
            page.insert_text((50, 120 + sentiments["positive"].index((sentence, score)) * 20), str(sentence))

        page.insert_text((50, 180 + len(sentiments["positive"]) * 20), "Negative Sentences:")
        for sentence, score in sentiments["negative"]:
            page.insert_text((50, 200 + len(sentiments["positive"]) * 20 + sentiments["negative"].index((sentence, score)) * 20), str(sentence))

        page.insert_text((50, 260 + len(sentiments["positive"]) * 20 + len(sentiments["negative"]) * 20), "Neutral Sentences:")
        for sentence, score in sentiments["neutral"]:
            page.insert_text((50, 280 + len(sentiments["positive"]) * 20 + len(sentiments["negative"]) * 20 + sentiments["neutral"].index((sentence, score)) * 20), str(sentence))

        doc.save(output_pdf_path)
        doc.close()
    else:
        # Create a new PDF
        c = canvas.Canvas(output_pdf_path, pagesize=letter)
        c.drawString(100, 750, f"Overall Sentiment Score: {overall_score}")
        c.drawString(100, 730, f"Sentiment Label: {sentiment_label}")

        c.drawString(100, 700, "Positive Sentences:")
        y_position = 680
        for sentence, score in sentiments["positive"]:
            c.drawString(100, y_position, str(sentence))
            y_position -= 20

        c.drawString(100, y_position, "Negative Sentences:")
        y_position -= 20
        for sentence, score in sentiments["negative"]:
            c.drawString(100, y_position, str(sentence))
            y_position -= 20

        c.drawString(100, y_position, "Neutral Sentences:")
        y_position -= 20
        for sentence, score in sentiments["neutral"]:
            c.drawString(100, y_position, str(sentence))
            y_position -= 20

        c.save()

def sentiment_label(overall_score):
    if overall_score > 0.05:
        return "Positive"
    elif overall_score < -0.05:
        return "Negative"
    else:
        return "Neutral"

folder_path = '/content/SharkTank'
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
pdf_file_paths = [os.path.join(folder_path, f) for f in pdf_files]
global pdf_file_path, output_pdf_path
print(pdf_file_paths)

for index, pdf_file_path in enumerate(pdf_file_paths):
    #check = pdf_file_path.replace('.pdf', '.pdf')
    output_pdf_path = pdf_file_path.replace('.pdf', 'Sentiment.pdf')
    if os.path.exists(pdf_file_path.endswith('Sentiment.pdf')):
        print(f"{pdf_file_path} is already processed!!!!\n")
    else:
      print(f"\nAnalyzing {pdf_file_path}...")
      # Main processing
      pdf_text = extract_text_from_pdf(pdf_file_path)
      sentiments = analyze_sentiment(pdf_text)
      overall_score = overall_sentiment(sentiments)
      save_sentiment_to_pdf(sentiments, overall_score, sentiment_label(overall_score), output_pdf_path)
      print(f"Sentiment analysis results saved to {output_pdf_path}")
      print(f"{pdf_file_path} is summarized successfully!!!!\n\n")


['/content/SharkTank/videoplaybackshortSentiment.pdf', '/content/SharkTank/ABPodcastSentiment.pdf', '/content/SharkTank/ABPodcast.pdf', '/content/SharkTank/videoplaybacknew.pdf', '/content/SharkTank/videoplaybackshort.pdf']
/content/SharkTank/videoplaybackshortSentiment.pdf is already processed!!!!

/content/SharkTank/ABPodcastSentiment.pdf is already processed!!!!

/content/SharkTank/ABPodcast.pdf is already processed!!!!

/content/SharkTank/videoplaybacknew.pdf is already processed!!!!

/content/SharkTank/videoplaybackshort.pdf is already processed!!!!



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
