In [32]:
import pandas as pd
import os

def process_file(file_path, max_columns=20):
    """Extract the unique Netflix-related article titles found in one export file.

    The file is parsed as comma-separated text with no header row. Only the
    first ``max_columns`` comma-separated fields of each line are inspected
    (fewer when the first line itself has fewer fields). For every cell that
    contains ``"https://"``, the text from that marker onward is taken as a
    URL; its last ``/``-separated segment, with hyphens replaced by spaces,
    becomes the title. A title is kept only when it contains "netflix"
    (case-insensitive) and is stored stripped and lower-cased.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        max_columns: Upper bound on the number of leading comma-separated
            fields read from each line.

    Returns:
        A DataFrame with a single ``title`` column holding the unique titles in
        order of first appearance. An empty DataFrame is returned when the
        first line is blank or when any error occurs; in both cases a message
        is printed instead of raising.
    """
    try:
        # Peek at the first line only to learn how many fields a line has.
        with open(file_path, 'r', encoding='utf-8') as file:
            first_line = file.readline()

        if not first_line.strip():
            print(f"Skipped empty file: {file_path}")
            return pd.DataFrame()

        actual_columns = len(first_line.split(','))
        columns_to_read = list(range(min(max_columns, actual_columns)))

        # header=None is the fix: the export files have no header row, so the
        # first line is a record like any other. With the default header
        # inference that record (and any URL on it) was silently dropped.
        data = pd.read_csv(
            file_path,
            usecols=columns_to_read,
            dtype=str,
            engine='python',
            header=None,
        )

        titles = []
        # Row-major walk over every cell: same visiting order as iterating the
        # rows and then the cells of each row, without building a Series per row.
        for cell in data.to_numpy().ravel():
            if not isinstance(cell, str):
                continue  # missing values are not strings even with dtype=str

            url_start = cell.find("https://")
            if url_start == -1:
                continue

            url = cell[url_start:]
            # The last path segment is used as a slug-style headline.
            title = url.strip().split('/')[-1].replace('-', ' ')
            if title and "netflix" in title.lower():
                titles.append(title.strip().lower())

        return pd.DataFrame(titles, columns=["title"]).drop_duplicates()
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop a batch run.
        print(f"Error processing file {file_path}: {e}")
        return pd.DataFrame()


def process_all_files(input_dir, output_dir, max_columns=20):
    """Run ``process_file`` over every ``.CSV`` file of a directory.

    ``output_dir`` is created when missing. Each input file that yields a
    non-empty result is written to ``output_dir`` as ``processed_<name>`` with
    a header row and every field quoted; files yielding nothing are only
    reported on stdout.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.CSV`` files.
        output_dir: Directory that receives the ``processed_*`` files.
        max_columns: Forwarded unchanged to ``process_file``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in os.listdir(input_dir):
        # The suffix match is case-sensitive: only upper-case ".CSV" qualifies.
        if not entry.endswith('.CSV'):
            continue

        source_path = os.path.join(input_dir, entry)
        target_path = os.path.join(output_dir, f"processed_{entry}")

        titles = process_file(source_path, max_columns=max_columns)
        if titles.empty:
            print(f"Skipped or failed: {entry}")
            continue

        # quoting=1 is csv.QUOTE_ALL: every field is wrapped in quotechar.
        titles.to_csv(target_path, index=False, header=True, quotechar='"', quoting=1)
        print(f"Processed and saved: {entry}")


# Folder scanned for raw .CSV exports, and folder that receives the processed_* files.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
input_dir = r'C:\Users\zhao\Desktop\gdelt_extracted'
output_dir = r'C:\Users\zhao\Desktop\gdelt_cleaned'

# Run the cleaning step, inspecting at most the first 10 comma-separated fields per line.
process_all_files(input_dir, output_dir, max_columns=10)


Processed and saved: 20200101.export.CSV
Processed and saved: 20200102.export.CSV
Processed and saved: 20200103.export.CSV
Processed and saved: 20200104.export.CSV
Processed and saved: 20200105.export.CSV
Processed and saved: 20200106.export.CSV
Processed and saved: 20200107.export.CSV
Processed and saved: 20200108.export.CSV
Processed and saved: 20200109.export.CSV
Processed and saved: 20200110.export.CSV
Processed and saved: 20200111.export.CSV
Processed and saved: 20200112.export.CSV
Processed and saved: 20200113.export.CSV
Processed and saved: 20200114.export.CSV
Processed and saved: 20200115.export.CSV
Processed and saved: 20200116.export.CSV
Processed and saved: 20200117.export.CSV
Processed and saved: 20200118.export.CSV
Processed and saved: 20200119.export.CSV
Processed and saved: 20200120.export.CSV
Processed and saved: 20200121.export.CSV
Processed and saved: 20200122.export.CSV
Processed and saved: 20200123.export.CSV
Processed and saved: 20200124.export.CSV
Processed and sa

In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def perform_sentiment_analysis(data):
    """Score every title and summarise the scores of the whole frame.

    A ``sentiment`` column holding the analyzer's ``compound`` score of each
    ``title`` is added to ``data`` in place.

    Args:
        data: DataFrame with a ``title`` column of text to score.

    Returns:
        A 4-tuple ``(avg_sentiment, positive_ratio, neutral_ratio,
        negative_ratio)``: the mean score followed by the fractions of rows
        whose score is above, equal to and below zero.
    """
    scorer = SentimentIntensityAnalyzer()

    def compound_score(text):
        # Only the 'compound' entry of the returned score dict is used.
        return scorer.polarity_scores(text)['compound']

    data['sentiment'] = data['title'].apply(compound_score)
    scores = data['sentiment']

    # The mean of a boolean mask is the share of rows where it is True.
    return (
        scores.mean(),
        (scores > 0).mean(),
        (scores == 0).mean(),
        (scores < 0).mean(),
    )

def process_all_cleaned_files(input_dir, output_file):
    """Build a per-file sentiment summary from the cleaned title files.

    Every ``.CSV`` file in ``input_dir`` is expected to be named
    ``<prefix>_<YYYYMMDD>...``; the eight characters after the first
    underscore are parsed as the date of that file. Each non-empty file is
    scored with ``perform_sentiment_analysis`` and contributes one row to the
    summary, which is sorted by date and written to ``output_file``.

    Files whose name carries no parseable date are skipped with a message
    instead of aborting the whole run. When no file produces a result, a
    summary containing only the header row is written.

    Args:
        input_dir: Directory containing the cleaned CSV files.
        output_file: Path of the summary CSV to write.
    """
    summary_columns = [
        "date",
        "avg_sentiment",
        "positive_ratio",
        "neutral_ratio",
        "negative_ratio",
    ]
    results = []

    # Sorted so the progress log is deterministic across platforms.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith('.CSV'):
            continue

        try:
            date = pd.to_datetime(file_name.split('_')[1][:8], format='%Y%m%d')
        except (IndexError, ValueError):
            # IndexError: no underscore in the name; ValueError: not a date.
            print(f"Skipped (no date in file name): {file_name}")
            continue

        data = pd.read_csv(os.path.join(input_dir, file_name))
        if data.empty:
            print(f"No data in: {file_name}")
            continue

        avg_sentiment, positive_ratio, neutral_ratio, negative_ratio = perform_sentiment_analysis(data)
        results.append({
            "date": date,
            "avg_sentiment": avg_sentiment,
            "positive_ratio": positive_ratio,
            "neutral_ratio": neutral_ratio,
            "negative_ratio": negative_ratio
        })
        print(f"Processed sentiment for: {file_name}")

    # Passing the column list keeps the "date" column present even when
    # results is empty; without it sort_values raises KeyError.
    summary_df = pd.DataFrame(results, columns=summary_columns).sort_values(by="date")
    summary_df.to_csv(output_file, index=False)
    print(f"Daily sentiment summary saved to {output_file}")


In [34]:
# Run the sentiment step only when this code executes as the main module.
if __name__ == "__main__":
    # Folder of cleaned title files to score, and the summary CSV to write.
    # NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
    cleaned_dir = r'C:\Users\zhao\Desktop\gdelt_cleaned'
    output_file = r'C:\Users\zhao\Desktop\daily_sentiment_summary.csv'

    process_all_cleaned_files(cleaned_dir, output_file)


Processed sentiment for: processed_20200101.export.CSV
Processed sentiment for: processed_20200102.export.CSV
Processed sentiment for: processed_20200103.export.CSV
Processed sentiment for: processed_20200104.export.CSV
Processed sentiment for: processed_20200105.export.CSV
Processed sentiment for: processed_20200106.export.CSV
Processed sentiment for: processed_20200107.export.CSV
Processed sentiment for: processed_20200108.export.CSV
Processed sentiment for: processed_20200109.export.CSV
Processed sentiment for: processed_20200110.export.CSV
Processed sentiment for: processed_20200111.export.CSV
Processed sentiment for: processed_20200112.export.CSV
Processed sentiment for: processed_20200113.export.CSV
Processed sentiment for: processed_20200114.export.CSV
Processed sentiment for: processed_20200115.export.CSV
Processed sentiment for: processed_20200116.export.CSV
Processed sentiment for: processed_20200117.export.CSV
Processed sentiment for: processed_20200118.export.CSV
Processed 