# EXTRACT THE TEXT

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import uuid
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [6]:
# Set up the webdriver
driver = webdriver.Chrome()

EXTRACT ARTICLE TITLE AND ARTICLE TEXT

In [7]:
# Navigate to the webpage
driver.get("https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/")

In [8]:
# Get the page source
page_source = driver.page_source

In [9]:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

In [10]:
# Remove the header and footer
header = soup.find('header')
footer = soup.find('footer')
if header:
    header.decompose()
if footer:
    footer.decompose()

In [11]:
# Find the article text
article_text = soup.find('div', class_='article-content')

In [12]:
#If article text is found, get its text
if article_text:
    article_text = article_text.get_text()
else:
    article_text = soup.get_text()

In [13]:
#Remove any script or style elements
for script in soup.find_all(['script','style']):
    script.decompose()

In [14]:
# Remove any remaining non-article text
text = '\n'.join([line for line in article_text.split('\n')if line.strip()])

In [15]:
file_id = str(uuid.uuid4())

In [16]:
print(driver.title)

AI and ML-Based YouTube Analytics and Content Creation Tool for Optimizing Subscriber Engagement and Content Strategy - Blackcoffer Insights


In [17]:
# Print the article text
print(text)

AI and ML-Based YouTube Analytics and Content Creation Tool for Optimizing Subscriber Engagement and Content Strategy - Blackcoffer Insights
Sign in
Our Success Stories
Banking Securities, and Insurance
Energy
Entertainment
Fast Moving Consumer Goods
Government & Think Tanks
Healthcare
Infrastructure & Real Estate
IT
Lifestyle & eCommerce
Production & manufacturing
Research & Academia
Retail & Supply Chain
Telecom
What We Do
Banking, Financials, Securities, and Insurance
Energy
Entertainment
Fast Moving Consumer Goods
Government & Think Tanks
Healthcare
Hospitality
Infrastructure & Real Estate
IT Services
Lifestyle, eCommerce & Online Market Place
News & Media
Production & Manufacturing
Research & Academia
Retail & Supply Chain
What We Think
Automobiles & Components
BFSI
Asset and Portfolio
Banks
Capital Markets
Derivatives and Securities
Diversified Financials
Finance & Accounting
Insurance
Securities and Capital Markets
Capital Goods
Commercial & Professional Services
Consumer Discre

# TEXTUAL ANALYSIS

In [18]:
#Remove HTML tags and special characters
text = re.sub(r'<.*?>','',text)
text = re.sub(r'[^a-zA-Z\s]', '',text)

In [19]:
#Tokenize the text into words 
tokens = word_tokenize(text)

In [20]:
#Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word.lower() not in stop_words]

In [21]:
#Lemmatize the words
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]

In [22]:
#Count the frequency of each word
word_freq = Counter(tokens)

In [23]:
# print the top 10 most frequent words
print(f"Article saved to {file_id}.txt")
print("Top 10 most frequent words:")

Article saved to a871b69b-9654-4579-a6c8-b209f65b3825.txt
Top 10 most frequent words:


In [24]:
for word, freq in word_freq.most_common(10):
    print(f"{word}: {freq}")

Think: 12
Success: 11
Stories: 10
October: 10
Recognition: 9
AI: 8
Consumer: 8
Analytics: 7
Code: 7
password: 7


# COMPUTE VARIABLES

In [25]:
def calculate_positive_score(text, positive_dictionary):
    #initialize score to 0 
    score = 0

    #Split the text into individual words
    words = text.split()

    #Iterate over each word in the text 
    for word in words :
        #Remove punctuation and convert to lowercase 
        word = word.strip('.,!?"\'').lower()

        #check if the word is in the positive Dictionary
        if word in positive_dictionary:
            #if found, increment the score by 1
            score += 1

            #Return the total score 
            return score

In [26]:
positive_dictionary = ["happy", "good", "greet"]
text = "I'm feeling happy and good today!"
print(calculate_positive_score(text, positive_dictionary))

1


In [27]:
def calculate_negative_score(text, negative_dict):
    score = 0
    for word in text.split():
        if word in negative_dict:
            score += 1
    return -score

In [28]:
negative_dict = {"bad", "hate", "sad"}
text = "I hate this bad day"
print(calculate_negative_score(text, negative_dict))

-2


In [29]:
def calculate_polarity_score(positive_score, negative_score):
    """
    Calculate the Polarity Score using the given formula.

    Args:
        positive_score (float): The score indicating the positive sentiment.
        negative_score (float): The score indicating the negative sentiment.

    Returns:
        float: The Polarity Score.
    """
    if positive_score + negative_score == 0:
        return 0  # Avoid division by zero
    return (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

In [30]:
positive_score = 0.8
negative_score = 0.2
polarity_score = calculate_polarity_score(positive_score, negative_score)
print(polarity_score)

0.5999994000006001


In [31]:
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    """
    Calculate the Subjectivity Score using the given formula.

    Args:
        positive_score (float): The score indicating the positive sentiment.
        negative_score (float): The score indicating the negative sentiment.
        total_words (int): The total number of words after cleaning.

    Returns:
        float: The Subjectivity Score.
    """
    if total_words == 0:
        return 0  # Avoid division by zero
    return (positive_score + negative_score) / (total_words + 0.000001)

In [32]:
positive_score = 0.8
negative_score = 0.2
total_words = 100
subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)
print(subjectivity_score)

0.0099999999


In [33]:
def calculate_average_sentence_length(text):
    """
    Calculate the Average Sentence Length using the given text.

    Args:
        text (str): The input text.

    Returns:
        float: The Average Sentence Length.
    """
    words = text.split()  # Split the text into words
    sentences = text.split('.')  # Split the text into sentences
    sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty strings
    if len(sentences) == 0:
        return 0  # Avoid division by zero
    return len(words) / len(sentences)

In [34]:
text = "This is an example sentence. This is another sentence."
average_sentence_length = calculate_average_sentence_length(text)
print(average_sentence_length)

4.5


In [35]:
nltk.download('cmudict')
d = cmudict.dict()

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [36]:
def count_syllables(word):
    try:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except KeyError:
        return 0  # Word not found in dictionary

In [37]:
def calculate_complex_words_percentage(text):
    """
    Calculate the Percentage of Complex Words using the given text.

    Args:
        text (str): The input text.

    Returns:
        float: The Percentage of Complex Words.
    """
    words = text.split()  # Split the text into words
    complex_words = [word for word in words if count_syllables(word) > 2]
    return (len(complex_words) / len(words)) * 100

In [38]:
text = "This is an example sentence with complex words like unrecognizable."
percentage_complex_words = calculate_complex_words_percentage(text)
print(percentage_complex_words)

10.0


In [39]:
def calculate_average_sentence_length(text):
    """
    Calculate the Average Sentence Length using the given text.

    Args:
        text (str): The input text.

    Returns:
        float: The Average Sentence Length.
    """
    words = text.split()  # Split the text into words
    sentences = text.split('.')  # Split the text into sentences
    sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty strings
    if len(sentences) == 0:
        return 0  # Avoid division by zero
    return len(words) / len(sentences)

In [40]:
def calculate_complex_words_percentage(text):
    """
    Calculate the Percentage of Complex Words using the given text.

    Args:
        text (str): The input text.

    Returns:
        float: The Percentage of Complex Words.
    """
    words = text.split()  # Split the text into words
    complex_words = [word for word in words if count_syllables(word) > 2]
    return (len(complex_words) / len(words)) * 100

In [41]:
def calculate_fog_index(text):
    """
    Calculate the Fog Index using the given text.

    Args:
        text (str): The input text.

    Returns:
        float: The Fog Index.
    """
    average_sentence_length = calculate_average_sentence_length(text)
    percentage_complex_words = calculate_complex_words_percentage(text)
    return 0.4 * (average_sentence_length + (percentage_complex_words / 100))

In [42]:
text = "This is an example sentence with complex words like unrecognizable."
fog_index = calculate_fog_index(text)
print(fog_index)

4.04


In [43]:
def calculate_average_words_per_sentence(text):
    # Split the text into sentences
    sentences = text.split('. ')
    
    # Calculate the total number of words
    total_words = len(text.split())
    
    # Calculate the total number of sentences
    total_sentences = len(sentences)
    
    # Check if there are any sentences
    if total_sentences == 0:
        return 0
    
    # Calculate the average number of words per sentence
    average_words_per_sentence = total_words / total_sentences
    
    return average_words_per_sentence


In [44]:
# Example usage:
text = "This is a sample text. It has multiple sentences. Each sentence is separated by a period."
average_words_per_sentence = calculate_average_words_per_sentence(text)
print(average_words_per_sentence)

5.333333333333333


In [45]:
def count_complex_words(text):
    # Initialize the diphone dictionary
    diphone_dict = cmudict.dict()
    
    # Initialize the count of complex words
    complex_word_count = 0
    
    # Tokenize the text into words
    words = text.split()
    
    # Iterate over each word
    for word in words:
        # Remove punctuation
        word = ''.join(e for e in word if e.isalnum())
        
        # Check if the word is in the diphone dictionary
        if word.lower() in diphone_dict:
            # Get the phonemes for the word
            phonemes = diphone_dict[word.lower()]
            
            # Check if the word has more than two syllables
            if len(phonemes) > 2:
                complex_word_count += 1
    
    return complex_word_count


In [46]:
# Example usage:
text = "This is a sample text with complex words like unrecognizable and unpredictable."
complex_word_count = count_complex_words(text)
print(complex_word_count)

1


In [47]:
def count_cleaned_words(text):
    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Remove punctuation from the text
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords and count the remaining words
    cleaned_word_count = sum(1 for word in words if word.lower() not in stop_words)
    
    return cleaned_word_count

In [48]:
# Example usage:
text = "This is a sample text with some stopwords like 'the', 'and', 'a' and punctuation like ? ! , ."
cleaned_word_count = count_cleaned_words(text)
print(cleaned_word_count)

6


In [49]:
def count_syllables(word):
    # Define vowels
    vowels = 'aeiouy'
    
    # Initialize syllable count
    syllable_count = 0
    
    # Check if the word ends with "es" or "ed"
    if word.endswith('es') or word.endswith('ed'):
        return 1
    
    # Count the number of vowels
    for i in range(len(word)):
        if word[i] in vowels:
            if i == 0:
                syllable_count += 1
            elif word[i-1] not in vowels and word[i-1] != 'y':
                syllable_count += 1
    
    return syllable_count

In [50]:
def count_word_syllables(text):
    # Tokenize the text into words
    words = text.split()
    
    # Initialize total syllable count
    total_syllable_count = 0
    
    # Iterate over each word
    for word in words:
        # Remove punctuation
        word = ''.join(e for e in word if e.isalnum())
        
        # Count the syllables in the word
        syllable_count = count_syllables(word)
        
        # Add to the total syllable count
        total_syllable_count += syllable_count
    
    return total_syllable_count

In [51]:
# Example usage:
text = "This is a sample text with multiple syllables in each word."
total_syllable_count = count_word_syllables(text)
print(total_syllable_count)

14


In [52]:
def count_personal_pronouns(text):
    # Define the pattern for personal pronouns
    pattern = r'\b(I|we|my|ours|us)\b'
    
    # Use regex to find the counts of personal pronouns
    pronoun_count = len(re.findall(pattern, text, re.IGNORECASE))
    
    return pronoun_count

In [53]:
# Example usage:
text = "I am going to the US with my friends. We will have a great time."
pronoun_count = count_personal_pronouns(text)
print(pronoun_count)

4


In [54]:
# split the text into words
words = re.findall(r'\b\w+\b', text)

In [55]:
#Calculate the total number of characters in each word
total_chars = sum(len(word) for word in words)

In [56]:
#Calculate the average word length
average_word_length = total_chars/ len(words)

In [57]:
print(f"Average Word Length: {average_word_length}")

Average Word Length: 3.2
