In [1]:
import os
import re
import nltk
from nltk.corpus import stopwords

### Cleaning all the scraped texts present in the .txt files using the 'StopWords' folder and saving the cleaned texts in the original .txt files. 

In [2]:
import os

# Function to load stop words from the StopWords folder
def load_stopwords(stopwords_dir):
    stop_words = set()
    # Loop through all files in the StopWords directory
    for filename in os.listdir(stopwords_dir):
        if filename.endswith(".txt"):  # Assuming stop words are in .txt files
            filepath = os.path.join(stopwords_dir, filename)
            try:
                # Attempt to open the file with 'utf-8' encoding, fallback to 'latin-1' if error occurs
                with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                    words = file.read().splitlines()
                    stop_words.update(words)
            except UnicodeDecodeError as e:
                print(f"Error reading {filename}: {e}")
    return stop_words

# Function to clean text by removing stop words
def clean_text(text, stop_words):
    # Tokenize text into words (this can be improved using nltk)
    words = text.split()  # Simple tokenization, splitting by whitespace
    
    # Remove stop words
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    
    # Rejoin the cleaned words back into text
    cleaned_text = ' '.join(cleaned_words)
    return cleaned_text

# Load all stop words from the folder
stopwords_dir = 'StopWords'  # Replace with the actual path to the StopWords folder
stop_words = load_stopwords(stopwords_dir)

# Directory containing the article .txt files
articles_dir = 'scraped_articles'  # Replace with the folder where you saved your articles

# Loop through all .txt files in the articles directory
for filename in os.listdir(articles_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(articles_dir, filename)
        
        # Read the content of the article
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            article_text = file.read()
        
        # Perform cleaning by removing stop words
        cleaned_text = clean_text(article_text, stop_words)
        
        # Overwrite the original .txt file with the cleaned text
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)
        
        print(f"Successfully cleaned and saved {filename}")



Successfully cleaned and saved bctech2011.txt
Successfully cleaned and saved bctech2012.txt
Successfully cleaned and saved bctech2013.txt
Successfully cleaned and saved bctech2014.txt
Successfully cleaned and saved bctech2015.txt
Successfully cleaned and saved bctech2016.txt
Successfully cleaned and saved bctech2017.txt
Successfully cleaned and saved bctech2018.txt
Successfully cleaned and saved bctech2019.txt
Successfully cleaned and saved bctech2020.txt
Successfully cleaned and saved bctech2021.txt
Successfully cleaned and saved bctech2022.txt
Successfully cleaned and saved bctech2023.txt
Successfully cleaned and saved bctech2024.txt
Successfully cleaned and saved bctech2025.txt
Successfully cleaned and saved bctech2026.txt
Successfully cleaned and saved bctech2027.txt
Successfully cleaned and saved bctech2028.txt
Successfully cleaned and saved bctech2029.txt
Successfully cleaned and saved bctech2030.txt
Successfully cleaned and saved bctech2031.txt
Successfully cleaned and saved bct

### Creating a dictionary of postive and neagative words from the 'MasterDictionary' folder

In [3]:
import os

# Function to load stop words from the StopWords folder
def load_stopwords(stopwords_dir):
    stop_words = set()
    for filename in os.listdir(stopwords_dir):
        if filename.endswith(".txt"):
            filepath = os.path.join(stopwords_dir, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                words = file.read().splitlines()
                stop_words.update(words)
    return stop_words

# Function to load positive and negative words from the Master_Dictionary
def load_master_dictionary(master_dict_dir, stop_words):
    positive_words = set()
    negative_words = set()

    # Load positive words
    positive_file = os.path.join(master_dict_dir, 'positive-words.txt')
    with open(positive_file, 'r', encoding='utf-8', errors='ignore') as file:
        words = file.read().splitlines()
        # Add words to positive dictionary if not in stop words
        positive_words.update([word for word in words if word.lower() not in stop_words])

    # Load negative words
    negative_file = os.path.join(master_dict_dir, 'negative-words.txt')
    with open(negative_file, 'r', encoding='utf-8', errors='ignore') as file:
        words = file.read().splitlines()
        # Add words to negative dictionary if not in stop words
        negative_words.update([word for word in words if word.lower() not in stop_words])

    return positive_words, negative_words

# Load all stop words from the StopWords folder
stopwords_dir = 'StopWords'  # Replace with the actual path to the StopWords folder
stop_words = load_stopwords(stopwords_dir)

# Load the Master Dictionary and filter out stop words
master_dict_dir = 'Master_Dictionary'  # Replace with the actual path to the Master_Dictionary folder
positive_words, negative_words = load_master_dictionary(master_dict_dir, stop_words)

# Display the number of words loaded
print(f"Positive words loaded: {len(positive_words)}")
print(f"Negative words loaded: {len(negative_words)}")

# You can now use positive_words and negative_words in your sentiment analysis


Positive words loaded: 1988
Negative words loaded: 4779


### Converting the text into a list of tokens and using these tokens to calculate 4 variables namely 'Positive Score', 'Negative Score', 'Polarity Score', and 'Subjectivity Score'

In [11]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
import os
from nltk.tokenize import word_tokenize

# Function to calculate derived variables
def calculate_scores(text, positive_words, negative_words):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Initialize scores
    positive_score = 0
    negative_score = 0

    # Calculate positive and negative scores
    for word in tokens:
        lower_word = word.lower()  # Normalize to lowercase
        if lower_word in positive_words:
            positive_score += 1
        elif lower_word in negative_words:
            negative_score += 1

    # Calculate Polarity Score
    if positive_score + negative_score > 0:
        polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    else:
        polarity_score = 0  # Default to 0 if no positive or negative words

    # Calculate Subjectivity Score
    total_words = len(tokens)
    if total_words > 0:
        subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    else:
        subjectivity_score = 0  # Default to 0 if no words

    return positive_score, negative_score, polarity_score, subjectivity_score

# Directory containing the article .txt files
articles_dir = 'scraped_articles'  # Replace with the folder where you saved your articles

# Load the Master Dictionary (ensure this part is in your script before this section)
# ...

# Loop through all .txt files in the articles directory and calculate scores
for filename in os.listdir(articles_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(articles_dir, filename)

        # Read the cleaned content of the article
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            cleaned_text = file.read()

        # Calculate scores
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(
            cleaned_text, positive_words, negative_words)

        # Output the scores
        print(f"Scores for {filename}:")
        print(f"Positive Score: {positive_score}")
        print(f"Negative Score: {negative_score}")
        print(f"Polarity Score: {polarity_score}")
        print(f"Subjectivity Score: {subjectivity_score}")
        print("\n")


Scores for bctech2011.txt:
Positive Score: 40
Negative Score: 11
Polarity Score: 0.5686274398308345
Subjectivity Score: 0.06367041190553008


Scores for bctech2012.txt:
Positive Score: 8
Negative Score: 3
Polarity Score: 0.4545454132231443
Subjectivity Score: 0.038327525998858795


Scores for bctech2013.txt:
Positive Score: 9
Negative Score: 3
Polarity Score: 0.4999999583333368
Subjectivity Score: 0.04225352097798056


Scores for bctech2014.txt:
Positive Score: 8
Negative Score: 3
Polarity Score: 0.4545454132231443
Subjectivity Score: 0.03806228360532082


Scores for bctech2015.txt:
Positive Score: 9
Negative Score: 3
Polarity Score: 0.4999999583333368
Subjectivity Score: 0.04166666652199074


Scores for bctech2016.txt:
Positive Score: 8
Negative Score: 3
Polarity Score: 0.4545454132231443
Subjectivity Score: 0.03873239422981551


Scores for bctech2017.txt:
Positive Score: 8
Negative Score: 3
Polarity Score: 0.4545454132231443
Subjectivity Score: 0.038327525998858795


Scores for bctec

### Analysis of Readabitity, Complex Word count, Word Count, and Avg. no. of words per sentence

In [23]:
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Ensure you have the necessary resources downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to calculate readability metrics
def readability_analysis(text):
    # Tokenize text into sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Load stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Remove stop words and punctuation from words
    cleaned_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]

    # Count total sentences and cleaned words
    total_sentences = len(sentences)
    total_words = len(cleaned_words)

    # Calculate Average Sentence Length
    if total_sentences > 0:
        average_sentence_length = total_words / total_sentences
    else:
        average_sentence_length = 0

    # Count complex words (more than 2 syllables)
    complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)

    # Calculate Percentage of Complex Words
    if total_words > 0:
        percentage_complex_words = complex_word_count / total_words
    else:
        percentage_complex_words = 0

    # Calculate Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    # Calculate Average Number of Words Per Sentence
    avg_words_per_sentence = total_words / total_sentences if total_sentences > 0 else 0

    return average_sentence_length, percentage_complex_words, fog_index, complex_word_count, total_words, avg_words_per_sentence

# Function to count syllables in a word
def syllable_count(word):
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    is_prev_vowel = False

    for char in word:
        if char in vowels:
            if not is_prev_vowel:
                count += 1
                is_prev_vowel = True
        else:
            is_prev_vowel = False

    # Remove silent 'e' at the end
    if word.endswith("e"):
        count -= 1
    # Handle exceptions for "ed" and "es"
    if word.endswith("ed") or word.endswith("es"):
        count -= 1

    # Ensure count is at least 1
    return max(count, 1)

# Directory containing the article .txt files
articles_dir = 'scraped_articles'  # Replace with the folder where you saved your articles

# Loop through all .txt files in the articles directory and analyze each
for filename in os.listdir(articles_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(articles_dir, filename)

        # Read the content of the article
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            article_text = file.read()

        # Analyze the text
        average_sentence_length, percentage_complex_words, fog_index, complex_word_count, word_count, avg_words_per_sentence = readability_analysis(article_text)

        # Output the results for each file
        print(f"Text Analysis Results for {filename}:")
        print(f"Average Sentence Length: {average_sentence_length:.2f}")
        print(f"Percentage of Complex Words: {percentage_complex_words * 100:.2f}%")
        print(f"Fog Index: {fog_index:.2f}")
        print(f"Complex Word Count: {complex_word_count}")
        print(f"Word Count (cleaned): {word_count}")
        print(f"Average Number of Words Per Sentence: {avg_words_per_sentence:.2f}")
        print("\n" + "-" * 40 + "\n")  # Separator for better readability



[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Analysis Results for bctech2011.txt:
Average Sentence Length: 26.48
Percentage of Complex Words: 43.84%
Fog Index: 10.77
Complex Word Count: 267
Word Count (cleaned): 609
Average Number of Words Per Sentence: 26.48

----------------------------------------

Text Analysis Results for bctech2012.txt:
Average Sentence Length: 34.67
Percentage of Complex Words: 37.02%
Fog Index: 14.01
Complex Word Count: 77
Word Count (cleaned): 208
Average Number of Words Per Sentence: 34.67

----------------------------------------

Text Analysis Results for bctech2013.txt:
Average Sentence Length: 34.33
Percentage of Complex Words: 39.32%
Fog Index: 13.89
Complex Word Count: 81
Word Count (cleaned): 206
Average Number of Words Per Sentence: 34.33

----------------------------------------

Text Analysis Results for bctech2014.txt:
Average Sentence Length: 34.67
Percentage of Complex Words: 38.46%
Fog Index: 14.02
Complex Word Count: 80
Word Count (cleaned): 208
Average Number of Words Per Sentence: 

### Calculating other variables like Total Syllables, Pronoun Count, and Average Word Length

In [24]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure you have the necessary resources downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to count syllables in a word
def syllable_count(word):
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    is_prev_vowel = False

    for char in word:
        if char in vowels:
            if not is_prev_vowel:
                count += 1
                is_prev_vowel = True
        else:
            is_prev_vowel = False

    # Remove silent 'e' at the end
    if word.endswith("e"):
        count -= 1
    # Handle exceptions for "ed" and "es"
    if word.endswith("ed") or word.endswith("es"):
        count -= 1

    # Ensure count is at least 1
    return max(count, 1)

# Function to count personal pronouns
def count_pronouns(text):
    # Define a regex pattern for the personal pronouns
    pronouns_pattern = r'\b(I|we|my|ours|us)\b'
    # Find all matches in the text
    matches = re.findall(pronouns_pattern, text, flags=re.IGNORECASE)
    return len(matches)

# Function to calculate average word length
def average_word_length(words):
    if not words:
        return 0
    # Calculate total characters in all words
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

# Function to analyze text
def analyze_text(text):
    # Tokenize text into words
    words = word_tokenize(text)
    
    # Load English stop words
    stop_words = set(stopwords.words('english'))

    # Remove stop words and punctuation
    cleaned_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]
    
    # Count syllables for each cleaned word
    syllable_counts = [syllable_count(word) for word in cleaned_words]

    # Count personal pronouns
    pronoun_count = count_pronouns(text)

    # Calculate average word length
    avg_word_length = average_word_length(cleaned_words)

    # Summary
    total_syllables = sum(syllable_counts)

    return {
        "Total Syllables": total_syllables,
        "Pronoun Count": pronoun_count,
        "Average Word Length": avg_word_length
    }

# Directory containing the article .txt files
articles_dir = 'scraped_articles'  # Replace with the folder where you saved your articles

# Loop through all .txt files in the articles directory and analyze each
for filename in os.listdir(articles_dir):
    if filename.endswith(".txt"):
        filepath = os.path.join(articles_dir, filename)

        # Read the content of the article
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            article_text = file.read()

        # Analyze the text
        analysis_result = analyze_text(article_text)

        # Output the results for each file
        print(f"Text Analysis Results for {filename}:")
        print(f"Total Syllables: {analysis_result['Total Syllables']}")
        print(f"Pronoun Count: {analysis_result['Pronoun Count']}")
        print(f"Average Word Length: {analysis_result['Average Word Length']:.2f}")
        print("\n" + "-" * 40 + "\n")  # Separator for better readability




[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Analysis Results for bctech2011.txt:
Total Syllables: 1518
Pronoun Count: 1
Average Word Length: 7.59

----------------------------------------

Text Analysis Results for bctech2012.txt:
Total Syllables: 497
Pronoun Count: 1
Average Word Length: 7.38

----------------------------------------

Text Analysis Results for bctech2013.txt:
Total Syllables: 496
Pronoun Count: 1
Average Word Length: 7.41

----------------------------------------

Text Analysis Results for bctech2014.txt:
Total Syllables: 502
Pronoun Count: 1
Average Word Length: 7.41

----------------------------------------

Text Analysis Results for bctech2015.txt:
Total Syllables: 504
Pronoun Count: 1
Average Word Length: 7.46

----------------------------------------

Text Analysis Results for bctech2016.txt:
Total Syllables: 496
Pronoun Count: 1
Average Word Length: 7.44

----------------------------------------

Text Analysis Results for bctech2017.txt:
Total Syllables: 496
Pronoun Count: 1
Average Word Length: 7.33

## Output Data Structure.xlsx

In [26]:
import os
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Ensure you have the necessary resources downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to count syllables in a word
def syllable_count(word):
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    is_prev_vowel = False

    for char in word:
        if char in vowels:
            if not is_prev_vowel:
                count += 1
                is_prev_vowel = True
        else:
            is_prev_vowel = False

    # Remove silent 'e' at the end
    if word.endswith("e"):
        count -= 1
    # Handle exceptions for "ed" and "es"
    if word.endswith("ed") or word.endswith("es"):
        count -= 1

    # Ensure count is at least 1
    return max(count, 1)

# Function to count personal pronouns
def count_pronouns(text):
    pronouns_pattern = r'\b(I|we|my|ours|us)\b'
    matches = re.findall(pronouns_pattern, text, flags=re.IGNORECASE)
    return len(matches)

# Function to calculate average word length
def average_word_length(words):
    if not words:
        return 0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

# Function to calculate polarity and subjectivity scores
def calculate_sentiment_scores(positive_score, negative_score, total_words):
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    return polarity_score, subjectivity_score

# Function to analyze text
def analyze_text(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Clean words (remove stop words and punctuation)
    cleaned_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]

    # Calculate scores for syllables, pronouns, etc.
    syllable_counts = [syllable_count(word) for word in cleaned_words]
    pronoun_count = count_pronouns(text)
    avg_word_length = average_word_length(cleaned_words)

    # Calculate Positive and Negative Scores
    positive_score = sum(1 for word in cleaned_words if word in positive_words)  # Replace with actual positive words
    negative_score = sum(1 for word in cleaned_words if word in negative_words)  # Replace with actual negative words

    # Polarity and Subjectivity Score Calculation
    polarity_score, subjectivity_score = calculate_sentiment_scores(positive_score, negative_score, len(cleaned_words))

    # Average Sentence Length and Fog Index
    avg_sentence_length = len(cleaned_words) / len(sentences) if len(sentences) > 0 else 0
    complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)
    percentage_complex_words = complex_word_count / len(cleaned_words) if len(cleaned_words) > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length

    return {
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Avg Sentence Length": avg_sentence_length,
        "Percentage Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Avg Number of Words per Sentence": avg_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Word Count": len(cleaned_words),
        "Syllable per Word": sum(syllable_counts) / len(cleaned_words) if len(cleaned_words) > 0 else 0,
        "Personal Pronouns": pronoun_count,
        "Avg Word Length": avg_word_length
    }

# Load the input file that contains URL_ID and URL
input_df = pd.read_excel('Input.xlsx')  # Load your input Excel file

# Directory containing the article .txt files
articles_dir = 'scraped_articles'  # Replace with the folder where you saved your articles

# Data structure to hold the analysis results
output_data = []

# Loop through all .txt files in the articles directory and analyze each
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    filename = f"{url_id}.txt"  # Assuming the filenames match URL_ID
    
    filepath = os.path.join(articles_dir, filename)
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            article_text = file.read()

        # Analyze the text
        analysis_result = analyze_text(article_text)

        # Add the result to the output data
        output_data.append({
            "URL_ID": url_id,
            "URL": url,
            **analysis_result
        })

# Create a DataFrame from the output data
df = pd.DataFrame(output_data)

# Write the DataFrame to the Output Data Structure
output_file_path = 'Output Data Structure.xlsx'  # Update with the correct path
df.to_excel(output_file_path, index=False)

print("Analysis complete! Data saved to Excel.")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Analysis complete! Data saved to Excel.


## View the saved data in excel

In [27]:
import pandas as pd

# Load the saved Excel file
df = pd.read_excel('Output Data Structure.xlsx')

# Display the data
print(df)

         URL_ID                                                URL  \
0    bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...   
1    bctech2012  https://insights.blackcoffer.com/streamlined-i...   
2    bctech2013  https://insights.blackcoffer.com/efficient-dat...   
3    bctech2014  https://insights.blackcoffer.com/effective-man...   
4    bctech2015  https://insights.blackcoffer.com/streamlined-t...   
..          ...                                                ...   
142  bctech2153  https://insights.blackcoffer.com/population-an...   
143  bctech2154  https://insights.blackcoffer.com/google-lsa-ap...   
144  bctech2155  https://insights.blackcoffer.com/healthcare-da...   
145  bctech2156  https://insights.blackcoffer.com/budget-sales-...   
146  bctech2157  https://insights.blackcoffer.com/amazon-buy-bo...   

     Positive Score  Negative Score  Polarity Score  Subjectivity Score  \
0                34               8        0.619048            0.068966   
1        