In [1]:
import os
import re
import pandas as pd

def convert_srt_to_csv(srt_file_path, output_csv_path):
    """Extract the dialogue text of an SRT subtitle file into a one-column CSV.

    Every subtitle cue becomes one row of the ``Text`` column. A cue that
    spans several lines is joined into a single line with spaces. Cue index
    lines and timing lines are discarded; all other lines are kept verbatim
    (after stripping surrounding whitespace), including lines that consist
    only of digits.

    Args:
        srt_file_path: Path of the UTF-8 encoded ``.srt`` file to read.
        output_csv_path: Path of the CSV file to write (overwritten if it
            already exists).
    """
    # Timing line, e.g. "00:00:01,000 --> 00:00:02,500". A '.' is accepted as
    # the millisecond separator as well, and anything after the second
    # timestamp (position coordinates) is dropped together with the line.
    timing_line = re.compile(
        r'\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}'
    )

    # 'utf-8-sig' decodes plain UTF-8 and silently drops a leading BOM, which
    # would otherwise end up as '\ufeff' in the first subtitle row.
    with open(srt_file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    subtitle_lines = []

    # Cues are separated by blank lines; the separator may contain whitespace.
    for block in re.split(r'\n\s*\n', content):
        text_lines = []

        for line in block.split('\n'):
            line = line.strip()
            if not line:
                continue

            if timing_line.match(line):
                # The cue index is the digit-only line directly above the
                # timing line. Only that line is removed, so numeric dialogue
                # further down in the cue survives.
                if text_lines and text_lines[-1].isdigit():
                    text_lines.pop()
                continue

            text_lines.append(line)

        # Combine multi-line subtitles into a single line
        if text_lines:
            subtitle_lines.append(' '.join(text_lines))

    # Create a DataFrame from the extracted text and save it as CSV
    df = pd.DataFrame(subtitle_lines, columns=['Text'])
    df.to_csv(output_csv_path, index=False)


def batch_process_srt_files(input_folder, output_folder):
    """Convert every ``.srt`` file in ``input_folder`` into a CSV in ``output_folder``.

    The output file keeps the subtitle's base name and swaps only the trailing
    ``.srt`` extension for ``.csv``. Directories and files with other
    extensions are ignored. One progress line is printed per converted file.

    Args:
        input_folder: Folder that is scanned (non-recursively) for subtitles.
        output_folder: Folder that receives the CSV files; created if missing.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # sorted() gives a stable processing order; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(input_folder)):
        # Case-insensitive so that "MOVIE.SRT" is picked up as well.
        if not file_name.lower().endswith('.srt'):
            continue

        input_srt_file = os.path.join(input_folder, file_name)
        # A directory whose name happens to end in ".srt" is not a subtitle.
        if not os.path.isfile(input_srt_file):
            continue

        # Replace only the trailing extension; str.replace would also rewrite
        # a ".srt" that occurs in the middle of the file name.
        output_csv_file = os.path.join(output_folder, file_name[:-len('.srt')] + '.csv')

        # Convert the .srt file to .csv
        convert_srt_to_csv(input_srt_file, output_csv_file)
        print(f'Converted {file_name} to {output_csv_file}')


# Example usage: turn every subtitle file in the source folder into a CSV
output_folder = '/Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/'
input_folder = '/Users/cedricroetheli/Desktop/Bsc/comparison/sub/'
batch_process_srt_files(input_folder=input_folder, output_folder=output_folder)

print('All .srt files have been processed successfully!')


Converted Inside.Out.2.2024.BluRay.x264.AAC5.1-[YTS.MX].en.srt to /Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/Inside.Out.2.2024.BluRay.x264.AAC5.1-[YTS.MX].en.csv
Converted Godzilla.x.Kong.The.New.Empire.2024.REPACK2.720p.WEBRip.800MB.x264-GalaxyRG.srt to /Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/Godzilla.x.Kong.The.New.Empire.2024.REPACK2.720p.WEBRip.800MB.x264-GalaxyRG.csv
Converted Wicked.2024.WEBRip.x264.AAC5.1-[YTS.MX].en[sdh].srt to /Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/Wicked.2024.WEBRip.x264.AAC5.1-[YTS.MX].en[sdh].csv
Converted Twisters.2024.720p.WEB-DL.DDP5.1.Atmos.H.264-FLUX.srt to /Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/Twisters.2024.720p.WEB-DL.DDP5.1.Atmos.H.264-FLUX.csv
Converted Deadpool.Wolverine.2024.REPACK.BluRay.x264.AAC5.1-[YTS.MX].en.srt to /Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/Deadpool.Wolverine.2024.REPACK.BluRay.x264.AAC5.1-[YTS.MX].en.csv
All .srt files have been processed successfully!

In [None]:
import os
import re
import pandas as pd
from transformers import pipeline

def convert_srt_to_csv(srt_file_path, output_csv_path):
    """Extract the dialogue text of an SRT subtitle file into a one-column CSV.

    Every subtitle cue becomes one row of the ``Text`` column. A cue that
    spans several lines is joined into a single line with spaces. Cue index
    lines and timing lines are discarded; all other lines are kept verbatim
    (after stripping surrounding whitespace), including lines that consist
    only of digits.

    Args:
        srt_file_path: Path of the UTF-8 encoded ``.srt`` file to read.
        output_csv_path: Path of the CSV file to write (overwritten if it
            already exists).
    """
    # Timing line, e.g. "00:00:01,000 --> 00:00:02,500". A '.' is accepted as
    # the millisecond separator as well, and anything after the second
    # timestamp (position coordinates) is dropped together with the line.
    timing_line = re.compile(
        r'\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]\d{3}'
    )

    # 'utf-8-sig' decodes plain UTF-8 and silently drops a leading BOM, which
    # would otherwise end up as '\ufeff' in the first subtitle row.
    with open(srt_file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    subtitle_lines = []

    # Cues are separated by blank lines; the separator may contain whitespace.
    for block in re.split(r'\n\s*\n', content):
        text_lines = []

        for line in block.split('\n'):
            line = line.strip()
            if not line:
                continue

            if timing_line.match(line):
                # The cue index is the digit-only line directly above the
                # timing line. Only that line is removed, so numeric dialogue
                # further down in the cue survives.
                if text_lines and text_lines[-1].isdigit():
                    text_lines.pop()
                continue

            text_lines.append(line)

        # Combine multi-line subtitles into a single line
        if text_lines:
            subtitle_lines.append(' '.join(text_lines))

    # Create a DataFrame from the extracted text and save it as CSV
    df = pd.DataFrame(subtitle_lines, columns=['Text'])
    df.to_csv(output_csv_path, index=False)


def batch_process_srt_files(input_folder, output_folder):
    """Convert every ``.srt`` file in ``input_folder`` into a CSV in ``output_folder``.

    The output file keeps the subtitle's base name and swaps only the trailing
    ``.srt`` extension for ``.csv``. Directories and files with other
    extensions are ignored. One progress line is printed per converted file.

    Args:
        input_folder: Folder that is scanned (non-recursively) for subtitles.
        output_folder: Folder that receives the CSV files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # sorted() gives a stable processing order; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(input_folder)):
        # Case-insensitive so that "MOVIE.SRT" is picked up as well.
        if not file_name.lower().endswith('.srt'):
            continue

        input_srt_file = os.path.join(input_folder, file_name)
        # A directory whose name happens to end in ".srt" is not a subtitle.
        if not os.path.isfile(input_srt_file):
            continue

        # Replace only the trailing extension; str.replace would also rewrite
        # a ".srt" that occurs in the middle of the file name.
        output_csv_file = os.path.join(output_folder, file_name[:-len('.srt')] + '.csv')

        convert_srt_to_csv(input_srt_file, output_csv_file)
        print(f'Converted {file_name} to {output_csv_file}')


def analyze_all_movies(input_folder, output_csv_path=None, pipe=None):
    """Classify every subtitle line of every movie CSV and write a summary CSV.

    Each ``.csv`` file in ``input_folder`` that has a ``Text`` column is
    treated as one movie. Every row of ``Text`` is classified as ``LEFT``,
    ``CENTER`` or ``RIGHT``; the per-movie label counts, mean confidence per
    label and an overall leaning score are collected into one summary row.

    The overall leaning score is the mean over all lines of
    ``score * direction`` with direction -1 (LEFT), 0 (CENTER), +1 (RIGHT).

    Args:
        input_folder: Folder containing the per-movie CSV files.
        output_csv_path: Where to write the summary. Defaults to
            ``summary_results.csv`` inside ``input_folder``.
        pipe: Optional classifier callable. It is called as
            ``pipe(list_of_str, truncation=True)`` and must return one
            ``{'label': ..., 'score': ...}`` dict per input text. Defaults to
            the ``bucketresearch/politicalBiasBERT`` text-classification
            pipeline.

    Raises:
        KeyError: If the classifier returns a label other than
            ``LEFT``/``CENTER``/``RIGHT``.
    """
    if output_csv_path is None:
        output_csv_path = os.path.join(input_folder, 'summary_results.csv')
    output_abspath = os.path.abspath(output_csv_path)

    if pipe is None:
        # Initialize the model pipeline
        pipe = pipeline("text-classification", model="bucketresearch/politicalBiasBERT")

    label_to_score = {"LEFT": -1, "CENTER": 0, "RIGHT": 1}
    summary_columns = [
        "Movie", "Total Lines",
        "LEFT Count", "CENTER Count", "RIGHT Count",
        "LEFT Avg Score", "CENTER Avg Score", "RIGHT Avg Score",
        "Overall Political Leaning Score",
    ]

    # Kept separate from the per-file predictions: reusing one name for both
    # replaced this list with a Series and lost the collected summaries.
    summaries = []

    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(input_folder, file_name)
        # The summary is written next to the movie CSVs by default; it must
        # not be read back as a movie when the analysis is run again.
        if os.path.abspath(file_path) == output_abspath:
            continue

        df = pd.read_csv(file_path)
        if 'Text' not in df.columns:
            print(f'Skipping {file_name}: no "Text" column')
            continue

        # Strip only the trailing extension from the movie name.
        movie_name = file_name[:-len('.csv')]

        # Rows that pandas parsed as NaN or as numbers cannot be tokenized.
        df = df.dropna(subset=['Text']).reset_index(drop=True)
        texts = df['Text'].astype(str).tolist()

        # One batched call per movie; truncation guards against a line that
        # exceeds the model's maximum input length.
        predictions = pipe(texts, truncation=True) if texts else []

        df['label'] = pd.Series([p['label'] for p in predictions], dtype=object)
        df['score'] = pd.Series([p['score'] for p in predictions], dtype=float)

        unknown_labels = set(df['label']) - set(label_to_score)
        if unknown_labels:
            raise KeyError(f'Unexpected classifier label(s): {sorted(unknown_labels)}')

        df['weighted_score'] = df['score'] * df['label'].map(label_to_score).astype(float)

        total_lines = len(df)
        # An empty subtitle file has no leaning rather than a division by zero.
        if total_lines:
            overall_political_leaning_score = float(df['weighted_score'].sum()) / total_lines
        else:
            overall_political_leaning_score = 0.0

        label_counts = df['label'].value_counts().to_dict()
        average_scores = df.groupby('label')['score'].mean().to_dict()

        summaries.append({
            "Movie": movie_name,
            "Total Lines": total_lines,
            "LEFT Count": label_counts.get("LEFT", 0),
            "CENTER Count": label_counts.get("CENTER", 0),
            "RIGHT Count": label_counts.get("RIGHT", 0),
            "LEFT Avg Score": average_scores.get("LEFT", 0),
            "CENTER Avg Score": average_scores.get("CENTER", 0),
            "RIGHT Avg Score": average_scores.get("RIGHT", 0),
            "Overall Political Leaning Score": overall_political_leaning_score
        })

    # Explicit columns keep the header row even when no movie was found.
    results_df = pd.DataFrame(summaries, columns=summary_columns)
    results_df.to_csv(output_csv_path, index=False)
    print('All movies have been analyzed successfully!')


# Example usage: score every converted movie CSV in the processed folder
input_folder = '/Users/cedricroetheli/Desktop/Bsc/comparison/processed_movies/'
analyze_all_movies(input_folder=input_folder)


In [None]:
# SECURITY: a Hugging Face access token was pasted into this cell in plain
# text. It has been removed; revoke that token in the Hugging Face account
# settings, because it remains exposed in the notebook's history.
# Supply the replacement through the environment instead of the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN")