In [1]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline
import torch
import re
import numpy as np

# nltk.download('punkt')
# nltk.download('stopwords')





In [2]:
def list_csv_files(directory_path):
    """Return the paths of all ``.csv`` entries in a directory, sorted by name.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(directory_path, entry)`` strings, one for each
        directory entry whose name ends with ``'.csv'`` (case-sensitive),
        ordered by entry name.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes every
    # downstream result (e.g. the row order of a summary file) reproducible.
    return [
        os.path.join(directory_path, entry)
        for entry in sorted(os.listdir(directory_path))
        if entry.endswith('.csv')
    ]

def remove_ext_int_lines(text):
    """Drop every line that begins with ``EXT`` or ``INT``.

    The text is split on ``'\\n'``. A line is discarded when, after stripping
    surrounding whitespace, it starts with ``'EXT'`` or ``'INT'``. This is a
    plain prefix test, so a line such as ``'INTERNAL memo'`` is dropped too,
    while a line that merely contains ``EXT``/``INT`` further in is kept.

    Args:
        text: The text to filter.

    Returns:
        The surviving lines, unmodified, joined back together with ``'\\n'``.
    """
    heading_prefixes = ('EXT', 'INT')
    kept = []
    for raw_line in text.split('\n'):
        if raw_line.strip().startswith(heading_prefixes):
            continue
        kept.append(raw_line)
    return '\n'.join(kept)


def get_features_df(file):
    """Summarise the sentiment scores stored in one CSV file.

    The CSV must provide a ``label`` and a ``score`` column. Scores on rows
    labelled ``'NEGATIVE'`` are negated; every other row keeps its score
    unchanged. The summary statistics are computed on these signed scores.

    Args:
        file: Path to the CSV file. Its base name must have the form
            ``<name>_<digits>.csv``.

    Returns:
        A tuple ``(name, id_number, average, mean_squared_deviation,
        num_turns, scenes_count)`` where

        * ``name`` / ``id_number`` are the two parts of the file name
          (``id_number`` stays a string, so leading zeros are preserved),
        * ``average`` is the mean of the signed scores,
        * ``mean_squared_deviation`` is the mean of the squared deviations
          from that average (i.e. divided by N, not N - 1),
        * ``num_turns`` counts adjacent row pairs whose signed scores have
          opposite signs,
        * ``scenes_count`` is the number of rows in the file.

    Raises:
        ValueError: If the file's base name does not match
            ``<name>_<digits>.csv``.
    """
    df_1 = pd.read_csv(file)

    # Vectorised sign flip instead of a Python-level call per row.
    is_negative = df_1['label'] == 'NEGATIVE'
    df_1['score_new'] = np.where(is_negative, -df_1['score'], df_1['score'])

    # The greedy first group means the *last* "_<digits>" before ".csv" is
    # taken as the ID, so names containing underscores still parse.
    file_name = os.path.basename(file)
    match = re.match(r"^(.*)_(\d+)\.csv$", file_name)
    if match is None:
        raise ValueError(
            f"String format is incorrect: expected '<name>_<digits>.csv', got {file_name!r}"
        )
    name, id_number = match.group(1), match.group(2)

    signed_scores = df_1['score_new']
    average = signed_scores.mean()
    mean_squared_deviation = ((signed_scores - average) ** 2).mean()

    # The product of neighbouring signs is -1 exactly where the sign flips.
    # A score of exactly 0 yields a product of 0 and is not counted as a turn.
    score_values = signed_scores.values
    sign_products = np.sign(score_values[:-1]) * np.sign(score_values[1:])
    num_turns = int(np.sum(sign_products == -1))

    scenes_count = len(signed_scores)

    return name, id_number, average, mean_squared_deviation, num_turns, scenes_count

# Example usage:
#file_path = '/Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy/data/screenplay_data/data/scene_separated_analysed/3 Idiots_1187043.csv'  # Replace with your actual file path
#features = get_features_df(file_path)
#print(features)



In [8]:
#Processes all CSV files in the given directory, computes statistics, and writes them to a new CSV file.
def statistic_sentiment_all_csv(dir, dir_to_csv_file):
    """Compute sentiment statistics for every CSV in a directory and save them.

    Each file returned by ``list_csv_files(dir)`` is passed to
    ``get_features_df``; the resulting tuples are written, one row per file,
    to ``sentiment_summ.csv`` inside ``dir_to_csv_file``.

    Args:
        dir: Path to the directory containing the per-movie CSV files.
        dir_to_csv_file: Path to the output directory; it is created
            (including parents) if it does not exist yet.

    Returns:
        None. Any exception is caught and reported with ``print`` instead of
        being propagated; in that case the summary may be missing or incomplete.
    """
    # Imported here so the function works even when the notebook's import
    # cell does not provide ``csv``. Without it, ``csv.writer`` raises a
    # NameError that the handler below would silently turn into a message.
    import csv

    header = [
        'Name',
        'Id',
        'Sentiment_score_average',
        'Sentiment_score_mean_squared_deviation',
        'Sentiment_num_turns',
        'Scenes_count',
    ]

    try:
        rows = []
        for file in list_csv_files(dir):
            print(f"Processing file: {file}")
            name, id_number, average, mean_squared_deviation, num_turns, scenes_count = get_features_df(file)
            rows.append([name, id_number, average, mean_squared_deviation, num_turns, scenes_count])

        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(dir_to_csv_file, exist_ok=True)

        csv_sum = os.path.join(dir_to_csv_file, 'sentiment_summ.csv')
        # Explicit UTF-8: names may contain non-ASCII characters, and the
        # platform default encoding is not UTF-8 everywhere, whereas
        # pandas.read_csv reads UTF-8 by default.
        with open(csv_sum, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(header)
            csv_writer.writerows(rows)

        print("Statistics saved to", csv_sum)

    except Exception as e:
        print(f"An error occurred: {str(e)}")

           

# Example usage:
# Input directory holding the per-movie sentiment CSV files.
# NOTE(review): the module-level name `dir` shadows Python's builtin dir()
# for the rest of the notebook session.
dir = 'data/screenplay_data/data/scene_separated_sentiment_3'#------------replace with your path
# Output directory that will receive 'sentiment_summ.csv'.
# NOTE(review): a later cell reads the summary from
# 'data/screenplay_data/data/scene_sentiment/', which is not this directory -
# confirm which of the two paths is intended.
dir_to_csv_file = 'data/movie_scripts_screenplay_data/data/scene_sentiment'#------------replace with your path
# Runs the batch job; errors are printed by the function rather than raised.
statistic_sentiment_all_csv(dir,dir_to_csv_file)

Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\10 Cloverfield Lane_1179933.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\10 Things I Hate About You_0147800.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\101 Days of 101 Dalmatians_0249328.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\12 Angry Men_0118528.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\12 Monkeys_0114746.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\12 Years a Slave_2024544.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\127 Hours_1542344.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\13 13 13_2991516.csv
Processing file: data/movie_scripts/screenplay_data/data/scene_separated_sentiment_3\1408_

In [6]:
# Load the summary CSV and display it as the cell output.
# NOTE(review): this path differs from the output directory passed to
# statistic_sentiment_all_csv earlier in the notebook
# ('data/movie_scripts_screenplay_data/data/scene_sentiment') - confirm the
# two locations are meant to be the same, otherwise this read fails.
df = pd.read_csv('data/screenplay_data/data/scene_sentiment/sentiment_summ.csv')
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/movie_scripts/screenplay_data/data/scene_sentimen_new2/sentiment_summ.csv'