In [1]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import csv
import requests
import os
import glob
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
print('pandas version: {}'.format(pd.__version__))
print('bs4 version: {}'.format(bs4.__version__))
print('requests version: {}'.format(requests.__version__))
print('csv version: {}'.format(csv.__version__))

pandas version: 1.4.2
bs4 version: 4.11.1
requests version: 2.27.1
csv version: 1.0


## Data Extraction

In [3]:
df=pd.read_excel('Input.xlsx')
df

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...
...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...


In [4]:
titles = []
content = []
for i in range(len(df)):    
    url = df['URL'][i]
    response = requests.get(url)
#     print(response.status_code)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string
    titles.append(title[:-23])
    scraped_content = soup.find_all('div',class_="td-post-content tagdiv-type")
    for x in scraped_content:
        x = x.get_text(separator = " ").replace('\n',' ').replace('\xa0',' ').strip()
    content.append(x)

In [5]:
data = pd.DataFrame()
data['Title'] = titles
data['Content'] = content
data.head()

Unnamed: 0,Title,Content
0,AI and ML-Based YouTube Analytics and Content ...,Client Background Client: A leading IT & tech...
1,Enhancing Front-End Features and Functionality...,Client Background Client: A leading hospital ...
2,ROAS Dashboard for Campaign-Wise Google Ads Bu...,Client Background Client: A leading IT & tech...
3,Efficient Processing and Analysis of Financial...,Client Background Client: A leading automobil...
4,Development of EA Robot for Automated Trading,Objective: The goal of this project is to buil...


In [6]:
result = pd.concat([df, data], axis=1)
result

Unnamed: 0,URL_ID,URL,Title,Content
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,AI and ML-Based YouTube Analytics and Content ...,Client Background Client: A leading IT & tech...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,Enhancing Front-End Features and Functionality...,Client Background Client: A leading hospital ...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,ROAS Dashboard for Campaign-Wise Google Ads Bu...,Client Background Client: A leading IT & tech...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,Efficient Processing and Analysis of Financial...,Client Background Client: A leading automobil...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,Development of EA Robot for Automated Trading,Objective: The goal of this project is to buil...
...,...,...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...,Population and Community Survey of America,Client Background Client: A leading marketing...
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...,Google LSA API Data Automation and Dashboarding,Client Background Client: A leading marketing...
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...,Healthcare Data Analysis,Client Background Client: A leading healthcar...
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...,"Budget, Sales KPI Dashboard using Power BI",Project Description Weekly Data – clustered ba...


In [7]:
folder_name = "articles"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
for index, row in result.iterrows():
    url_id = row['URL_ID']  
    title = row['Title']
    content = row['Content']  

    # Generate the file name based on URL_ID
    file_name = f"{url_id}.txt"
    file_path = os.path.join(folder_name, file_name)

    # Write the article text to the text file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(title)
        file.write("\n")
        file.write(content)

## Data Analysis

In [8]:
def extract_words_from_file(file_path):
    with open(file_path, 'r') as file:
        return [line.strip() for line in file]
    
words = []
# Use glob to find all .txt files recursively in the root_directory
for file_path in glob.glob(os.path.join("StopWords\\StopWords", '**', '*.txt'), recursive=True):
    words_in_file = extract_words_from_file(file_path)
    words.extend(words_in_file)

stop_words = []
for word in words:
    if '|' in word:
        stop_words.extend([w.strip() for w in word.split('|')])  # Split and strip spaces
    else:
        stop_words.append(word)

with open("MasterDictionary\\MasterDictionary\\positive-words.txt", 'r') as file:
    positive_words = [line.strip() for line in file]
    
with open("MasterDictionary\\MasterDictionary\\negative-words.txt", 'r') as file:
    negative_words = [line.strip() for line in file]

In [9]:
def read_text_file(file_name):
    file_path = os.path.join("articles", file_name + ".txt")
    try:
        with open(file_path, 'r', encoding="utf8") as file:
            return file.read()
    except FileNotFoundError:
        return f"{file_name}.txt not found."

def count_syllables(word):
    word = word.lower()
    syllable_pattern = re.compile(r'[aeiouy]+')
    syllables = syllable_pattern.findall(word)
    if word.endswith("es") or word.endswith("ed"):
        if len(syllables) > 1:
            syllables = syllables[:-1]
    
    if word.endswith('e') and len(syllables) > 1:
        syllables = syllables[:-1]
    return max(1, len(syllables))  # Ensure at least 1 syllable

# Function to calculate the number of complex words and syllables per word
def analyze_text(text):
    words = re.findall(r'\b\w+\b', text)
    complex_word_count = 0
    syllable_count_per_word = {}
    total_syllables = 0
    # Analyze each word
    for word in words:
        syllable_count = count_syllables(word)
        syllable_count_per_word[word] = syllable_count
        total_syllables += syllable_count
        if syllable_count > 2:
            complex_word_count += 1
    return complex_word_count, syllable_count_per_word, total_syllables

In [10]:
def file_operation(file_name):
    file_path = os.path.join("articles", file_name + ".txt")  # Add .txt extension
    try:
        with open(file_path, 'r', encoding="utf8") as file:
            text = file.read()
            text_len = len(text)

            words = text.split()
            filtered_words = [word for word in words if word not in stop_words]
            filtered_text = " ".join(filtered_words)
            total_words = len(filtered_text)

            positive = []
            negative = []
            sentence_count = 0
            no_pronoun = 0
            punctuation = [".","!","?"]
            personal_pronouns = [ "I","my","you","he","she","it","we","they","them","us","him","her","his","hers","its","theirs","ours","our","your"]
            words = filtered_text.split()
            for word in words:
                word = word.lower()
                if word in positive_words:
                    positive.append(word)
                elif word in negative_words:
                    negative.append(word)
                if word[-1] in punctuation:
                    sentence_count +=1
                sentence_count = max(1, sentence_count)
                if word in personal_pronouns:        
                    no_pronoun += 1
                total_characters = sum(len(word) for word in words)

            complex_word_count, syllable_count_per_word, total_syllables = analyze_text(text)

            #CALCULATIONS
            positive_score = len(positive)
            negative_score = len(negative)
            polarity_score = (positive_score - negative_score)/ ((positive_score + negative_score) + 0.000001)
            subjectivity_score = (positive_score + negative_score)/ ((total_words) + 0.000001)
            avg_sentence_len = text_len/ sentence_count
            complex_words_percentage = complex_word_count/text_len
            Fog_Index = 0.4 * (avg_sentence_len + complex_words_percentage)
            Average_Number_of_Words_Per_Sentence = avg_sentence_len
            average_word_len = total_characters/ total_words

            return positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_len, complex_words_percentage, Fog_Index, Average_Number_of_Words_Per_Sentence, complex_word_count, text_len, total_syllables, no_pronoun, average_word_len

    except FileNotFoundError:
        return None, None, None, None, None, None, None, None, None, None, None, None, None

In [11]:
df1 = df
df1['POSITIVE SCORE'] = ''
df1['NEGATIVE SCORE'] = ''
df1['POLARITY SCORE'] = ''
df1['SUBJECTIVITY SCORE'] = ''
df1['AVG SENTENCE LENGTH'] = ''
df1['PERCENTAGE OF COMPLEX WORDS'] = ''
df1['FOG INDEX'] = ''
df1['AVG NUMBER OF WORDS PER SENTENCE'] = ''
df1['COMPLEX WORD COUNT'] = ''
df1['WORD COUNT'] = ''
df1['SYLLABLE PER WORD'] = ''
df1['PERSONAL PRONOUNS'] = ''
df1['AVG WORD LENGTH'] = ''

In [12]:
df1['POSITIVE SCORE'], df1['NEGATIVE SCORE'], df1['POLARITY SCORE'], df1['SUBJECTIVITY SCORE'], df1['AVG SENTENCE LENGTH'], df1['PERCENTAGE OF COMPLEX WORDS'], df1['FOG INDEX'], df1['AVG NUMBER OF WORDS PER SENTENCE'], df1['COMPLEX WORD COUNT'], df1['WORD COUNT'], df1['SYLLABLE PER WORD'], df1['PERSONAL PRONOUNS'], df1['AVG WORD LENGTH'] = zip(*df1['URL_ID'].apply(file_operation))
print(df1)

              URL_ID                                                URL  \
0    Netclan20241017  https://insights.blackcoffer.com/ai-and-ml-bas...   
1    Netclan20241018  https://insights.blackcoffer.com/enhancing-fro...   
2    Netclan20241019  https://insights.blackcoffer.com/roas-dashboar...   
3    Netclan20241020  https://insights.blackcoffer.com/efficient-pro...   
4    Netclan20241021  https://insights.blackcoffer.com/development-o...   
..               ...                                                ...   
142  Netclan20241159  https://insights.blackcoffer.com/population-an...   
143  Netclan20241160  https://insights.blackcoffer.com/google-lsa-ap...   
144  Netclan20241161  https://insights.blackcoffer.com/healthcare-da...   
145  Netclan20241162  https://insights.blackcoffer.com/budget-sales-...   
146  Netclan20241163  https://insights.blackcoffer.com/amazon-buy-bo...   

     POSITIVE SCORE  NEGATIVE SCORE  POLARITY SCORE  SUBJECTIVITY SCORE  \
0                 4     

In [13]:
df1

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,4,3,0.142857,0.003993,290.571429,0.027532,116.239584,290.571429,56,2034,501,3,0.885339
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,14,7,0.333333,0.005515,147.891892,0.024488,59.166552,147.891892,134,5472,1421,1,0.870011
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,9,8,0.058824,0.006589,159.052632,0.021840,63.629789,159.052632,66,3022,758,1,0.867442
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,22,12,0.294118,0.008558,164.964286,0.043732,66.003207,164.964286,202,4619,1265,4,0.886484
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,9,2,0.636364,0.002691,111.136364,0.029652,44.466406,111.136364,145,4890,1294,1,0.872033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,Netclan20241159,https://insights.blackcoffer.com/population-an...,11,7,0.222222,0.003823,171.000000,0.031245,68.412498,171.000000,187,5985,1647,3,0.866398
143,Netclan20241160,https://insights.blackcoffer.com/google-lsa-ap...,17,19,-0.055556,0.005177,131.750000,0.024556,52.709823,131.750000,220,8959,2372,3,0.865976
144,Netclan20241161,https://insights.blackcoffer.com/healthcare-da...,8,10,-0.111111,0.010129,148.823529,0.016996,59.536210,148.823529,43,2530,654,2,0.869443
145,Netclan20241162,https://insights.blackcoffer.com/budget-sales-...,1,0,0.999999,0.001168,908.000000,0.014317,363.205727,908.000000,13,908,229,0,0.841121


In [15]:
df1.to_excel("output.xlsx")