In [1]:
#Importing libraries
import os
import spacy
import csv
import nltk
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from nltk.corpus import PlaintextCorpusReader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load the small English spaCy pipeline once for the whole notebook.
# `nlp` is called on each article's text further down to obtain the tokens
# and sentence spans that the summarisation step scores.
nlp = spacy.load('en_core_web_sm')

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA


# Initialize VADER. Each article is scored on its raw text exactly as read
# from disk; no preprocessing happens before polarity_scores is called.
sia = SIA()

# Map each category label (used in the CSV and in the combined file name) to
# the root folder that is searched recursively for that category's .txt files.
base_categories = {
    'Mainstream': r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Mainstream",
    'Lifestyle': r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Lifestyle",
    'Technology': r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Technology"
}

# Output folder: receives the per-file sentiment CSV and one combined text
# file per category.
output_dir = r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Merged"
os.makedirs(output_dir, exist_ok=True)
csv_output_path = os.path.join(output_dir, "Sentiment_Results.csv")

# Open CSV file for writing (newline='' leaves line endings to the csv module)
with open(csv_output_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Category', 'File Path', 'Negative', 'Neutral', 'Positive', 'Compound'])

    # Loop through categories
    for category, path in base_categories.items():
        # os.walk yields nothing for a missing folder, which would otherwise
        # silently produce an empty combined file. Report it and move on.
        if not os.path.isdir(path):
            print(f"Skipping missing category folder: {path}")
            continue

        # Collect the pieces in a list and join once at the end instead of
        # repeatedly re-allocating a growing string.
        combined_parts = []

        for root, dirs, files in os.walk(path):
            # Sorting in place fixes the traversal order, so the CSV rows and
            # the combined text come out the same on every run.
            dirs.sort()
            for file in sorted(files):
                # Case-insensitive match, so '.TXT' files are not dropped here
                # while being picked up by the summarisation step.
                if not file.lower().endswith('.txt'):
                    continue

                file_path = os.path.join(root, file)

                # Only the read needs the file handle; close it before scoring.
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Append to combined text (one trailing newline per article)
                combined_parts.append(f"{text}\n")

                # Sentiment analysis: one CSV row per article
                sentiment = sia.polarity_scores(text)
                writer.writerow([
                    category,
                    file_path,
                    sentiment['neg'],
                    sentiment['neu'],
                    sentiment['pos'],
                    sentiment['compound']
                ])

        # Write combined text file per category
        combined_text = "".join(combined_parts)
        combined_txt_path = os.path.join(output_dir, f"{category}_combined.txt")
        with open(combined_txt_path, 'w', encoding='utf-8') as out_file:
            out_file.write(combined_text)


In [4]:
#combined_text

In [5]:
# Initialize VADER (used after this step to score the generated summaries)
analyzer = SentimentIntensityAnalyzer()

# Category root folders. Each is expected to contain one sub-folder per
# source, and each source folder holds the .txt articles directly.
categories = [
    r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Mainstream",
    r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Lifestyle",
    r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Technology"
]

# Create the summary corpus directory (mirrors the <category>/<source> layout)
summary_corpus = r"E:\Study\TUD\Computational Knowledge Analysis\Term Paper\Corpus\Summaries"
os.makedirs(summary_corpus, exist_ok=True)

# Stop words list
stopwords = list(STOP_WORDS)
# Set copies for constant-time membership tests inside the token loop.
_stopword_set = frozenset(stopwords)
_punctuation_chars = frozenset(punctuation)

# Store summaries and sentiments
all_summaries = []      # (summary file path, summary text) pairs
sentiment_scores = []   # filled by the scoring step that follows

# Generate summaries
for category in categories:
    if not os.path.isdir(category):
        continue  # skip invalid paths

    category_name = os.path.basename(category)
    # Sorted so that summaries are produced in the same order on every run.
    sources = sorted(
        s for s in os.listdir(category) if os.path.isdir(os.path.join(category, s))
    )

    for source in sources:
        source_path = os.path.join(category, source)
        summary_output_path = os.path.join(summary_corpus, category_name, source)
        os.makedirs(summary_output_path, exist_ok=True)

        # Get all .txt files (case-insensitive extension match)
        file_names = sorted(f for f in os.listdir(source_path) if f.lower().endswith('.txt'))

        for file_name in file_names:
            full_path = os.path.join(source_path, file_name)

            # Only the read needs the file handle; close it before the NLP pass.
            with open(full_path, 'r', encoding='utf-8') as f:
                text = f.read().replace('\n', ' ')
            doc = nlp(text)

            # Word frequency calculation
            word_frequencies = {}
            for token in doc:
                lowered = token.text.lower()
                # Skip whitespace tokens (replaced newlines can leave runs of
                # spaces) and anything spaCy flags as punctuation, which also
                # covers Unicode quotes and dashes.
                if token.is_space or token.is_punct:
                    continue
                # Skip tokens made up solely of ASCII punctuation/symbol
                # characters. The previous substring test against the
                # punctuation string missed tokens such as '...'.
                if all(ch in _punctuation_chars for ch in lowered):
                    continue
                if lowered in _stopword_set:
                    continue
                word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1

            if not word_frequencies:
                continue  # Skip empty/invalid text

            # Normalise counts to (0, 1] relative to the most frequent word.
            max_freq = max(word_frequencies.values())
            for word in word_frequencies:
                word_frequencies[word] /= max_freq

            # Sentence scoring: sum of the normalised frequencies of its words
            sentence_scores = {}
            for sent in doc.sents:
                for word in sent:
                    key = word.text.lower()
                    if key in word_frequencies:
                        sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[key]

            # Top five sentences, highest score first (not document order).
            summary_sentences = nlargest(5, sentence_scores, key=sentence_scores.get)
            summary_text = ' '.join([sent.text for sent in summary_sentences])

            # Save summary. splitext replaces only the final extension and
            # works for any letter case ('.TXT' passes the filter above).
            stem, _ext = os.path.splitext(file_name)
            summary_filename = f"{stem}_summary.txt"
            summary_path = os.path.join(summary_output_path, summary_filename)
            with open(summary_path, 'w', encoding='utf-8') as sf:
                sf.write(summary_text)

            all_summaries.append((summary_path, summary_text))

# Sentiment analysis on summaries: one VADER score dict per generated summary,
# stored alongside the path of the summary file it belongs to.
for summary_path, summary_text in all_summaries:
    score = analyzer.polarity_scores(summary_text)
    sentiment_scores.append((summary_path, score))

# Export to CSV. The file lives in the summaries folder; the path is derived
# from summary_corpus so the two cannot drift apart if the folder is changed.
csv_output = os.path.join(summary_corpus, "Sum_senti_scores.csv")
with open(csv_output, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Summary File', 'Negative', 'Neutral', 'Positive', 'Compound'])
    # One row per summary, columns in the same order as the header above.
    for path, score in sentiment_scores:
        writer.writerow([path, score['neg'], score['neu'], score['pos'], score['compound']])
        


In [6]:
# all_summaries

In [7]:
# sentiment_scores