In [14]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, pipeline
import torch
import re
import numpy as np



In [2]:
import os
import re
import csv
import pandas as pd
import numpy as np

def list_csv_files(directory_path):
    # Lists all CSV files in the given directory.
    csv_files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.csv')]
    return csv_files

def remove_ext_int_lines(text):
    # Remove all lines that include EXT or INT
    lines = text.split('\n')
    cleaned_lines = [line for line in lines if not line.strip().startswith(('EXT', 'INT'))]
    cleaned_text = '\n'.join(cleaned_lines)
    return cleaned_text

def get_features_df(file):
    # Step 1: Read CSV file into df_1
    df_1 = pd.read_csv(file)
    
    # Step 2: Extract name and ID number from file name
    pattern = r"^(.*)_(\d+)\.csv$"  # Assuming ID number is digits only
    match = re.match(pattern, os.path.basename(file))
    
    if match:
        name = match.group(1)
        id_number = match.group(2)
    else:
        raise ValueError("String format is incorrect")
    
    # Step 3: Get average, mean_squared_deviation, num of turns, and mean squared deviation from sentiment analysis
    average = df_1['Compound'].mean()
    mean_squared_deviation = ((df_1['Compound'] - average) ** 2).mean()
    compound_values = df_1['Compound'].values
    sign_changes = np.sign(compound_values[:-1]) * np.sign(compound_values[1:])
    num_turns = int(np.sum(sign_changes == -1))
    scenes_count = len(df_1['Compound'])

    return name, id_number, average, mean_squared_deviation, num_turns, scenes_count

# # Example usage:
# file_path = '/Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/2 Broke Girls_1845307.csv'  # Replace with your actual file path
# features = get_features_df(file_path)
# print(features)

# Processes all CSV files in the given directory, computes statistics, and writes them to a new CSV file.
def statistic_sentiment_all_csv(dir, dir_to_csv_file):
    # dir: Path to the directory containing CSV files.
    # dir_to_csv_file: Path to the output directory for CSV file.
 
    try:
        csv_files = list_csv_files(dir)
        
        Name = []
        Id = []
        Sentiment_score_average = []
        Sentiment_score_mean_squared_deviation = []
        Sentiment_num_turns = []
        Scenes_count = []

        for file in csv_files:
            print(f"Processing file: {file}")
            name, id_number, average, mean_squared_deviation, num_turns, scenes_count = get_features_df(file)
            Name.append(name)
            Id.append(id_number)
            Sentiment_score_average.append(average)
            Sentiment_score_mean_squared_deviation.append(mean_squared_deviation)
            Sentiment_num_turns.append(num_turns)
            Scenes_count.append(scenes_count)

        if not os.path.exists(dir_to_csv_file):
            os.makedirs(dir_to_csv_file)
        
        csv_sum = os.path.join(dir_to_csv_file, 'sentiment_summ.csv')
        with open(csv_sum, 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['Name', 'Id', 'Sentiment_score_average', 'Sentiment_score_mean_squared_deviation', 'Sentiment_num_turns', 'Scenes_count'])
            for i in range(len(Name)):
                csv_writer.writerow([Name[i], Id[i], Sentiment_score_average[i], Sentiment_score_mean_squared_deviation[i], Sentiment_num_turns[i], Scenes_count[i]])

        print("Statistics saved to", csv_sum)

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Example usage:
dir = '/data/screenplay_data/data/scene_separated_analysed_vader'  # Replace with your path
dir_to_csv_file = '/data/screenplay_data/data/scene_sentiment_vader'  # Replace with your path
statistic_sentiment_all_csv(dir, dir_to_csv_file)

           


('2 Broke Girls', '1845307', -0.09243020134228187, 0.4740227039872078, 55, 149)
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/The Changeling_0080516.csv
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/Batman v Superman Dawn of Justice_2975590.csv
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/Panther_0114084.csv
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/Slash_4729990.csv
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenplay_data/data/scene_separated_analysed_vader_new2/Blue Velvet_0090756.csv
Processing file: /Users/xiaozhouye/Desktop/neuefische/Capstone-project_copy2/data/screenpl

In [7]:
import pandas as pd
df = pd.read_csv('/data/screenplay_data/data/scene_sentiment_vader/sentiment_summ.csv')
print(df['Id'].nunique())
df.head()

2856


Unnamed: 0,Name,Id,Sentiment_score_average,Sentiment_score_mean_squared_deviation,Sentiment_num_turns,Scenes_count
0,The Changeling,80516,0.04313,0.596833,20,54
1,Batman v Superman Dawn of Justice,2975590,0.001235,0.529702,44,135
2,Panther,114084,-0.224685,0.466195,48,151
3,Slash,4729990,-0.188082,0.544121,21,56
4,Blue Velvet,90756,0.126827,0.406182,57,231
