In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Import necessary libraries
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Ensure required NLTK data packages are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Define utility functions
def load_words(file_path, encoding='utf-8'):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            words = set(word.strip() for word in file.readlines())
            print(f"Loaded {len(words)} words from {file_path}")
            return words
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return set()

def load_specific_stopwords(folder_path, filenames, encoding='utf-8'):
    stopwords = set()
    try:
        for filename in filenames:
            file_path = os.path.join(folder_path, filename)
            stopwords.update(load_words(file_path, encoding))
        print(f"Loaded {len(stopwords)} stopwords from {folder_path}")
    except Exception as e:
        print(f"Error reading stopwords from {folder_path}: {e}")
    return stopwords

def save_text_to_file(text, file_path):
    with open(file_path, 'w') as file:
        file.write(text)

def extract_article_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1').text if soup.find('h1') else ''
    paragraphs = soup.find_all('p')
    article_text = ' '.join([para.text for para in paragraphs])
    return title, article_text

def count_syllables(word):
    vowels = "aeiouy"
    word = word.lower().strip()
    count = 0
    if word and word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def analyze_text(text, positive_words, negative_words, stopwords):
    blob = TextBlob(text)
    words = [word for word in blob.words if word.lower() not in stopwords]
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    sentences = sent_tokenize(text)
    complex_words = [word for word in words if len(word) > 2]

    avg_sentence_length = len(words) / len(sentences) if sentences else 0
    percentage_of_complex_words = len(complex_words) / len(words) if words else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)
    avg_number_of_words_per_sentence = avg_sentence_length
    complex_word_count = len(complex_words)
    word_count = len(words)
    syllables_per_word = sum(count_syllables(word) for word in words) / len(words) if words else 0
    personal_pronouns = sum(1 for word in words if word.lower() in ["i", "we", "my", "ours", "us"])
    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_number_of_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

# Load input data
input_file_path = '/content/drive/MyDrive/Data Science Internship-Blackcoffer/Input.xlsx'  # Update the path as needed
input_df = pd.read_excel(input_file_path)

# Load positive and negative word dictionaries from the master dictionary folder
master_dictionary_folder_path = '/content/drive/MyDrive/Data Science Internship-Blackcoffer/MasterDictionary'  # Update the path as needed
positive_words_path = os.path.join(master_dictionary_folder_path, 'positive-words.txt')
negative_words_path = os.path.join(master_dictionary_folder_path, 'negative-words.txt')
positive_words = load_words(positive_words_path, encoding='latin1')
negative_words = load_words(negative_words_path, encoding='latin1')

# Load specific stopwords from the specified folder
stopwords_folder_path = '/content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords'  # Update the path as needed
stopword_filenames = [
    'StopWords_Currencies.txt',
    'StopWords_Auditor.txt',
    'StopWords_DatesandNumbers.txt',
    'StopWords_Generic.txt',
    'StopWords_GenericLong.txt',
    'StopWords_Geographic.txt',
    'StopWords_Names.txt'
]  # List stopword filenames here
stopwords = load_specific_stopwords(stopwords_folder_path, stopword_filenames, encoding='latin1')

# Extract articles
articles = []
for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, text = extract_article_content(url)
    articles.append((url_id, url, title, text))

# Save extracted articles
articles_df = pd.DataFrame(articles, columns=['URL_ID', 'URL', 'Title', 'Text'])
articles_df.to_csv('extracted_articles.csv', index=False)

# Analyze extracted articles
analysis_results = []
for _, row in articles_df.iterrows():
    analysis = analyze_text(row['Text'], positive_words, negative_words, stopwords)
    analysis['URL_ID'] = row['URL_ID']
    analysis['URL'] = row['URL']
    analysis_results.append(analysis)

# Save analysis results
analysis_df = pd.DataFrame(analysis_results)
analysis_df = analysis_df[['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                           'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
                           'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']]
analysis_df.to_excel('analysis_results.xlsx', index=False)

# Display the first few rows of the analysis results
analysis_df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded 2006 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/MasterDictionary/positive-words.txt
Loaded 4783 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/MasterDictionary/negative-words.txt
Loaded 85 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_Currencies.txt
Loaded 8 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_Auditor.txt
Loaded 108 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_DatesandNumbers.txt
Loaded 121 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_Generic.txt
Loaded 570 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_GenericLong.txt
Loaded 195 words from /content/drive/MyDrive/Data Science Internship-Blackcoffer/StopWords/StopWords_Geographic.txt
Loaded 11897 words from /content/drive/MyDrive/Data Science Internship-

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,34,8,0.069339,0.386899,26.333333,0.963608,10.918776,26.333333,609,632,2.528481,0,7.488924
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,4,0,0.059315,0.331447,28.142857,0.959391,11.640899,28.142857,189,197,2.441624,0,7.502538
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,4,0,0.059315,0.331447,27.714286,0.958763,11.469219,27.714286,186,194,2.443299,0,7.5
3,bctech2014,https://insights.blackcoffer.com/effective-man...,4,0,0.059315,0.331447,28.0,0.959184,11.583673,28.0,188,196,2.443878,0,7.5
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,4,0,0.059315,0.331447,28.142857,0.959391,11.640899,28.142857,189,197,2.446701,0,7.517766
