In [49]:
! pip install bs4



# Analysis of Text Data
This notebook presents an analysis of text data extracted from articles. 

## Introduction
We have a dataset containing URLs of articles, and our task is to extract relevant information and perform various analyses on the text data.

## Data Extraction
We start by extracting article content and titles from the provided URLs using BeautifulSoup and requests libraries.

# Basic Approach for Text Data Analysis

## 1. Data Extraction
- Utilized BeautifulSoup and requests libraries to extract article content and titles from the provided URLs.
- Read the URLs from an Excel sheet using pandas.

## 2. Data Cleaning
- Cleaned the extracted text by removing stop words and punctuation.
- Created a set of stop words and removed them from the text.

## 3. Sentiment Analysis
- Used a dictionary of positive and negative words to perform sentiment analysis.
- Assigned scores to words based on their presence in the positive and negative dictionaries.
- Calculated polarity score and subjectivity score based on the positive and negative scores.

## 4. Readability Analysis
- Analyzed the readability of the articles using various metrics such as average sentence length, percentage of complex words, and Fog Index.
- Calculated the average sentence length, percentage of complex words, and Fog Index for each article.

## 5. Additional Analyses
- Calculated the average word length, complex word count, syllable count per word, total cleaned words, and count of personal pronouns.
- Implemented functions to perform these calculations efficiently.

## 6. Conclusion
- Successfully extracted, cleaned, and analyzed text data from articles.
- Presented results such as sentiment scores, readability metrics, and additional analyses.
- Further optimizations and analyses can be performed based on specific requirements.


In [50]:
import pandas as pd 
import numpy as np 
from bs4 import BeautifulSoup
import requests
import os

In [51]:
#Read the excel sheet
df = pd.read_excel('input.xlsx')

df.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


In [52]:
#Function to extract article content and title

def extractArticleText(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        title = soup.title.string.strip()
        
        article_tag = soup.find('article')
        article_text = article_tag.get_text(separator='\n') if article_tag else None
        
        return title, article_text
    
    except Exception as e:
        print(f"Error occurred while extracting data from {url}: {e}")
        return None, None


In [53]:
def cleanText(text, stop_words):
    words = text.split()
    cleaned_words = [w for w in words if w.lower() not in stop_words]
    return ' '.join(cleaned_words)

In [54]:
# Create a directory to save text files
if not os.path.exists('extracted_articles'):
    os.makedirs('extracted_articles')

stop_words_path = "StopWords"
stop_words = set()
for filename in os.listdir(stop_words_path):
    with open(os.path.join(stop_words_path, filename), 'r', encoding='latin-1') as file:
        stop_words.update(file.read().splitlines())

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if title and article_text:
        cleaned_text = cleanText(article_text, stop_words)

        with open(f'extracted_articles/{url_id}.txt', 'w', encoding='utf-8') as f:
            f.write(f"{title}\n\n{article_text}")
            print(f"Article title and text extracted from {url} and saved as {url_id}.txt")

    else:
        print(f"Failed to extract data from {url}")

print("Extraction complete.")

Article title and text extracted from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved as blackassign0001.txt
Article title and text extracted from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved as blackassign0002.txt
Article title and text extracted from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved as blackassign0003.txt
Article title and text extracted from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved as blackassign0004.txt
Article title and text extracted from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved as blackassign0005.txt
Article title and text extracted from https://insights.blackcoffer.c

In [55]:
#Create a pos and neg dict.

def loadWords(file_path):
    words = set()
    with open(file_path, 'r', encoding = 'latin-1') as file:
        words.update(file.read().splitlines())
    return words

masterpath = "MasterDictionary"
posFile = os.path.join(masterpath, "positive-words.txt")
negFile = os.path.join(masterpath, "negative-words.txt")

posWords = loadWords(posFile)
negWords = loadWords(negFile)

posWordsDict = {word: 'positive' for word in posWords if word.lower() not in stop_words}
negWordsDict = {word: 'negative' for word in negWords if word.lower() not in stop_words}

print("Dictionaries of positive and negative words are created.")

Dictionaries of positive and negative words are created.


In [56]:
! pip install nltk



In [57]:
import nltk 
from nltk import word_tokenize
nltk.download('punkt')

def derivedVariables(text, posDict, negDict):
    tokens = word_tokenize(text)

    pos_score = sum(1 for token in tokens if token.lower() in posDict)
    neg_score = sum(1 for token in tokens if token.lower() in negDict)

    pol_score = (pos_score - neg_score) / (pos_score + neg_score) + (0.000001)

    subjectively_score = (pos_score + neg_score) / (len(tokens) + 0.000001)

    return pos_score, neg_score, pol_score, subjectively_score

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if title and article_text:
        cleaned_text = cleanText(article_text, stop_words)

        pos_score, neg_score, pol_score, subjectively_score = derivedVariables(cleaned_text, posWordsDict, negWordsDict)
        print(f"For article '{title}':")
        print(f"Positive Score: {pos_score}")
        print(f"Negative Score: {neg_score}")
        print(f"Polarity Score: {pol_score}")
        print(f"Subjectivity Score: {subjectively_score}")
        
    else:
        print(f"Failed to extract data from {url}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityachaturvedi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Positive Score: 37
Negative Score: 6
Polarity Score: 0.7209312325581395
Subjectivity Score: 0.048206277972862914
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Positive Score: 67
Negative Score: 32
Polarity Score: 0.3535363535353535
Subjectivity Score: 0.08101472988460333
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Positive Score: 41
Negative Score: 25
Polarity Score: 0.24242524242424243
Subjectivity Score: 0.06874999992838542
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Positive Score: 41
Negative Score: 75
Polarity Score: -0.29310244827586207
Subjectivity Score: 0.12353567611977032
For article 'OTT platform and its impact on t

In [58]:
#Average number of words per sentence 

from nltk.tokenize import sent_tokenize

def calculateAverageWordsPerSentence(text):
    sentences = sent_tokenize(text)
    
    total_sentences = len(sentences)
    
    words = text.split()
    total_words = len(words)
    
    if total_sentences > 0:
        average_words_per_sentence = total_words / total_sentences
        
    else:
        average_words_per_sentence = 0
    
    return average_words_per_sentence

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        average_words_per_sentence = calculateAverageWordsPerSentence(cleaned_text)
        
        print(f"For article '{title}':")
        print(f"Average Words Per Sentence: {average_words_per_sentence}")

    else:
        print(f"Failed to extract data from {url}")

For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Average Words Per Sentence: 9.233766233766234
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Average Words Per Sentence: 11.71951219512195
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Average Words Per Sentence: 12.704918032786885
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Average Words Per Sentence: 13.777777777777779
For article 'OTT platform and its impact on the entertainment industry in Future. - Blackcoffer Insights':
Average Words Per Sentence: 11.386363636363637
For article 'The rise of the OTT platform and its impact on the entertainment industry by 2040. - Blackcoffer Insights':
Average Words Per Sentence: 13.97777777

In [59]:
#Analsis of Readability 

def averageSentenceLength(text):
    words = text.split()
    total_words = len(words)
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences > 0:
        average_sentence_length = total_words / total_sentences

    else:
        average_sentence_length = 0
        
    return average_sentence_length

def syllable_count(word):
    return 0

def percentageOfComplexWords(text):
    words = text.split()
    complex_words = [word for word in words if syllable_count(word) > 2]
    total_words = len(words)

    if total_words > 0:
        percentage_of_complex_words = len(complex_words) / total_words * 100

    else:
        percentage_of_complex_words = 0

    return percentage_of_complex_words

def fogIndex(average_sentence_length, percentage_of_complex_words):
    fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)
    return fog_index

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        average_sentence_length = averageSentenceLength(cleaned_text)
        
        percentage_of_complex_words = percentageOfComplexWords(cleaned_text)
        
        fog_index = fogIndex(average_sentence_length, percentage_of_complex_words)
        
        print(f"For article '{title}':")
        print(f"Average Sentence Length: {average_sentence_length}")
        print(f"Percentage of Complex Words: {percentage_of_complex_words}")
        print(f"Fog Index: {fog_index}")

    else:
        print(f"Failed to extract data from {url}")


For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Average Sentence Length: 9.233766233766234
Percentage of Complex Words: 0.0
Fog Index: 3.6935064935064936
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Average Sentence Length: 11.71951219512195
Percentage of Complex Words: 0.0
Fog Index: 4.68780487804878
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Average Sentence Length: 12.704918032786885
Percentage of Complex Words: 0.0
Fog Index: 5.081967213114755
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Average Sentence Length: 13.777777777777779
Percentage of Complex Words: 0.0
Fog Index: 5.511111111111112
For article 'OTT platform and its impact on the entertainment industry in Futu

In [60]:
#Complex word count

def complexWordCount(text):
    words = text.split()
    complex_word_count = 0
    for word in words:
        syllables = syllable_count(word)
        if syllables > 2:
            complex_word_count += 1
    return complex_word_count

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        complex_word_count = complexWordCount(cleaned_text)
        
        print(f"For article '{title}':")
        print(f"Complex Word Count: {complex_word_count}")

    else:
        print(f"Failed to extract data from {url}")

For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Complex Word Count: 0
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Complex Word Count: 0
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Complex Word Count: 0
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Complex Word Count: 0
For article 'OTT platform and its impact on the entertainment industry in Future. - Blackcoffer Insights':
Complex Word Count: 0
For article 'The rise of the OTT platform and its impact on the entertainment industry by 2040. - Blackcoffer Insights':
Complex Word Count: 0
For article 'Rise of Cyber Crime and its Effects - Blackcoffer Insights':
Complex Word Count: 0
For article 'Rise of Internet Demand and Its

In [61]:
#Syllable Count Per Word

def countSyllables(word):
    word = word.lower()
    
    vowels = 'aeiou'
    count = 0
    isVowel = False
    
    for char in word:
        if char in vowels:
            if not isVowel:
                count += 1
            isVowel = True

        else:
            isVowel = False
    
    if word.endswith(('es', 'ed')) and word[-3] not in vowels:
        count -= 1
    
    count = max(count, 1)
    
    return count

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        words = cleaned_text.split()
        
        syllable_counts = [countSyllables(word) for word in words]
        
        print(f"For article '{title}':")
        print(f"Syllable Count Per Word: {syllable_counts}")

    else:
        print(f"Failed to extract data from {url}")


For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Syllable Count Per Word: [2, 2, 2, 2, 3, 4, 5, 1, 4, 3, 3, 3, 3, 2, 2, 2, 3, 4, 5, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 4, 4, 2, 3, 2, 1, 4, 1, 1, 2, 1, 2, 1, 1, 3, 3, 1, 4, 3, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 4, 3, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 2, 3, 2, 3, 1, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 4, 3, 3, 2, 2, 4, 3, 1, 2, 2, 3, 3, 1, 2, 3, 2, 3, 1, 2, 1, 3, 2, 2, 2, 2, 2, 2, 1, 1, 2, 3, 3, 4, 4, 1, 1, 2, 1, 1, 2, 3, 2, 3, 1, 2, 1, 3, 2, 4, 2, 2, 2, 2, 1, 1, 2, 3, 3, 3, 3, 1, 2, 3, 2, 1, 3, 2, 3, 2, 1, 1, 3, 2, 3, 2, 1, 3, 2, 3, 3, 1, 1, 1, 1, 3, 2, 2, 3, 2, 3, 1, 2, 1, 3, 2, 3, 2, 2, 2, 2, 1, 1, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 3, 2, 2, 2, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 3, 1, 2, 2, 3, 3, 2, 1, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 3, 3, 3, 1, 2, 2, 2, 2, 1,

In [63]:
import string
from nltk.corpus import stopwords

def countCleanWords(text):
    words = text.split()
    
    stop_words = set(stopwords.words('english'))
    
    cleaned_words = []

    for word in words:
        word = word.translate(str.maketrans('', '', string.punctuation))
        if word.lower() not in stop_words and word != '':
            cleaned_words.append(word)
    
    total_cleaned_words = len(cleaned_words)
    
    return total_cleaned_words

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        total_cleaned_words = countCleanWords(cleaned_text)
        
        print(f"For article '{title}':")
        print(f"Total Cleaned Words: {total_cleaned_words}")
        
    else:
        print(f"Failed to extract data from {url}")

For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Total Cleaned Words: 695
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Total Cleaned Words: 949
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Total Cleaned Words: 771
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Total Cleaned Words: 741
For article 'OTT platform and its impact on the entertainment industry in Future. - Blackcoffer Insights':
Total Cleaned Words: 499
For article 'The rise of the OTT platform and its impact on the entertainment industry by 2040. - Blackcoffer Insights':
Total Cleaned Words: 1247
For article 'Rise of Cyber Crime and its Effects - Blackcoffer Insights':
Total Cleaned Words: 845
For article 'Rise of I

In [64]:
#Personal Pronouns

import re

def countPersonalPronouns(text):
    pronoun_pattern = r'\b(?:I|we|my|ours|us)\b'
    
    pronoun_matches = re.findall(pronoun_pattern, text, flags=re.IGNORECASE)
    
    count = len(pronoun_matches)
    
    return count

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        count = countPersonalPronouns(cleaned_text)
        
        print(f"For article '{title}':")
        print(f"Personal Pronoun Count: {count}")
    else:
        print(f"Failed to extract data from {url}")


For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Personal Pronoun Count: 1
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Personal Pronoun Count: 2
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Personal Pronoun Count: 0
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Personal Pronoun Count: 0
For article 'OTT platform and its impact on the entertainment industry in Future. - Blackcoffer Insights':
Personal Pronoun Count: 0
For article 'The rise of the OTT platform and its impact on the entertainment industry by 2040. - Blackcoffer Insights':
Personal Pronoun Count: 0
For article 'Rise of Cyber Crime and its Effects - Blackcoffer Insights':
Personal Pronoun Count: 0
For article 'Ris

In [65]:
#Average word length 

def averageWordLength(text):
    words = text.split()
    
    total_characters = sum(len(word) for word in words)
    
    total_words = len(words)
    
    if total_words > 0:
        average = total_characters / total_words
    else:
        average = 0
    
    return average

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    title, article_text = extractArticleText(url)
    
    if article_text:
        cleaned_text = cleanText(article_text, stop_words)
        
        average = averageWordLength(cleaned_text)
        
        print(f"For article '{title}':")
        print(f"Average Word Length: {average}")
    else:
        print(f"Failed to extract data from {url}")


For article 'Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. - Blackcoffer Insights':
Average Word Length: 6.796610169491525
For article 'Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future - Blackcoffer Insights':
Average Word Length: 7.531737773152965
For article 'Internet Demand's Evolution, Communication Impact, and 2035's Alternative Pathways - Blackcoffer Insights':
Average Word Length: 8.185806451612903
For article 'Rise of Cybercrime and its Effect in upcoming Future - Blackcoffer Insights':
Average Word Length: 8.130376344086022
For article 'OTT platform and its impact on the entertainment industry in Future. - Blackcoffer Insights':
Average Word Length: 7.473053892215569
For article 'The rise of the OTT platform and its impact on the entertainment industry by 2040. - Blackcoffer Insights':
Average Word Length: 7.953895071542131
For article 'Rise of Cyber Crime and i

In [86]:
# Define functions for data processing
def process_text(text):
    if not text:
        return None, None, None, None, None, None, None, None, None, None, None, None, None
    cleaned_text = cleanText(text, stop_words)
    word_count = countCleanWords(cleaned_text)
    positive_score, negative_score, polarity_score, subjectivity_score = derivedVariables(cleaned_text, posWordsDict, negWordsDict)
    avg_sentence_length = averageSentenceLength(cleaned_text)
    perc_complex_word = percentageOfComplexWords(cleaned_text)
    fog_index = fogIndex(avg_sentence_length, perc_complex_word)
    avg_words_sentence = calculateAverageWordsPerSentence(cleaned_text)
    complex_word_count = complexWordCount(cleaned_text)
    personal_pronoun_count = countPersonalPronouns(cleaned_text)
    avg_word_length = averageWordLength(cleaned_text)
    syllable_per_word = countSyllables(cleaned_text)
    return word_count, positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length, perc_complex_word, fog_index, avg_words_sentence, complex_word_count, syllable_per_word, personal_pronoun_count, avg_word_length

# Apply functions to DataFrame columns
(df['WORD COUNT'], df['POSITIVE SCORE'], df['NEGATIVE SCORE'], df['POLARITY SCORE'], df['SUBJECTIVITY SCORE'],
 df['AVG SENTENCE LENGTH'], df['PERCENTAGE OF COMPLEX WORDS'], df['FOG INDEX'], df['AVG NUMBER OF WORDS PER SENTENCE'],
 df['COMPLEX WORD COUNT'], df['SYLLABLE PER WORD'], df['PERSONAL PRONOUNS'], df['AVG WORD LENGTH']) = zip(*df['URL'].apply(lambda url: process_text(extractArticleText(url)[1])))

# Save the output data to a new Excel file with a fixed name
df.to_excel('Output_Data_Structure.xlsx', index=False)


In [82]:
# positive_scores = []
# negative_scores = []
# polarity_scores = []
# subjectivity_scores = []
# avg_sentence_lengths = []
# percentage_complex_words = []
# fog_indexes = []
# avg_words_per_sentence = []
# complex_word_counts = []
# word_counts = []
# syllable_per_words = []
# personal_pronouns = []
# avg_word_lengths = []

# for index, row in df.iterrows():
#     print("Processing row:", index)
#     url_id = row['URL_ID']
#     url = row['URL']
#     title, article_text = extractArticleText(url)
    
#     if article_text:
#         cleaned_text = cleanText(article_text, stop_words)
    
#         word_count = countCleanWords(cleaned_text)
#         positive_score,negative_score,polarity_score, subjectivity_score = derivedVariables(cleaned_text, posWordsDict, negWordsDict)
#         avg_sentence_length = averageSentenceLength(cleaned_text)
#         perc_complex_word = percentageOfComplexWords(cleaned_text)
#         fog_index = fogIndex(avg_sentence_length, perc_complex_word)
#         avg_words_sentence = calculateAverageWordsPerSentence(cleaned_text)
#         complex_word_count = complexWordCount(cleaned_text)
#         personal_pronoun_count = countPersonalPronouns(cleaned_text)
#         avg_word_length = averageWordLength(cleaned_text)
#         syllable_per_word = countSyllables(cleaned_text)
        
#         positive_scores.append(positive_score)
#         negative_scores.append(negative_score)
#         polarity_scores.append(polarity_score)
#         subjectivity_scores.append(subjectivity_score)
#         avg_sentence_lengths.append(avg_sentence_length)
#         percentage_complex_words.append(perc_complex_word)
#         fog_indexes.append(fog_index)
#         avg_words_per_sentence.append(avg_words_sentence)
#         complex_word_counts.append(complex_word_count)
#         word_counts.append(word_count)
#         syllable_per_words.append(syllable_per_word)
#         personal_pronouns.append(personal_pronoun_count)
#         avg_word_lengths.append(avg_word_length)

# df['POSITIVE SCORE'] = positive_scores
# df['NEGATIVE SCORE'] = negative_scores
# df['POLARITY SCORE'] = polarity_scores
# df['SUBJECTIVITY SCORE'] = subjectivity_scores
# df['AVG SENTENCE LENGTH'] = avg_sentence_lengths
# df['PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
# df['FOG INDEX'] = fog_indexes
# df['AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence
# df['COMPLEX WORD COUNT'] = complex_word_counts
# df['WORD COUNT'] = word_counts
# df['SYLLABLE PER WORD'] = syllable_per_words
# df['PERSONAL PRONOUNS'] = personal_pronouns
# df['AVG WORD LENGTH'] = avg_word_lengths


# # Save the output data to a new Excel file with a fixed name
# df.to_excel('Output.xlsx', index=False)

Processing row: 0
Processing row: 1
Processing row: 2
Processing row: 3
Processing row: 4
Processing row: 5
Processing row: 6
Processing row: 7
Processing row: 8
Processing row: 9
Processing row: 10
Processing row: 11
Processing row: 12
Processing row: 13
Processing row: 14
Processing row: 15
Processing row: 16
Processing row: 17
Processing row: 18
Processing row: 19
Processing row: 20
Processing row: 21
Processing row: 22
Processing row: 23
Processing row: 24
Processing row: 25
Processing row: 26
Processing row: 27
Processing row: 28
Processing row: 29
Processing row: 30
Processing row: 31
Processing row: 32
Processing row: 33
Processing row: 34
Processing row: 35
Processing row: 36
Processing row: 37
Processing row: 38
Processing row: 39
Processing row: 40
Processing row: 41
Processing row: 42
Processing row: 43
Processing row: 44
Processing row: 45
Processing row: 46
Processing row: 47
Processing row: 48
Processing row: 49
Processing row: 50
Processing row: 51
Processing row: 52
Pro

ValueError: Length of values (98) does not match length of index (100)