<a href="https://colab.research.google.com/github/ArynAgarwal/NLP-/blob/main/NLP_on_scrapped_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install beautifulsoup4 requests openpyxl



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Load the input spreadsheet; the extraction loop below reads its
# 'URL_ID' and 'URL' columns.
input_file = '/content/Input.xlsx'
df = pd.read_excel(input_file)

# Function to extract article text from a given URL
def extract_article_text(url, timeout=30):
    """Download a page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled server blocks the notebook
            indefinitely.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the stripped text of
        the page's ``<title>`` element (or of the first ``<h1>`` when there is
        no ``<title>``); ``article_text`` is the stripped text of every
        ``<p>`` element joined with single spaces. On any failure the error
        is printed and ``(None, None)`` is returned.
    """
    try:
        # Send a GET request to the URL; raise for 4xx/5xx responses
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Prefer <title>, fall back to the first <h1>, and fail with a clear
        # message instead of an AttributeError on None when both are absent.
        title_tag = soup.title
        if title_tag is None:
            title_tag = soup.find('h1')
        if title_tag is None:
            raise ValueError("page has no <title> or <h1> element")

        title = title_tag.text.strip()
        article_text = ' '.join(p.text.strip() for p in soup.find_all('p'))

        return title, article_text

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip.
        print(f"Error extracting data from {url}: {str(e)}")
        return None, None

# Loop through each row in the DataFrame and save every article that could be
# extracted as "<URL_ID>.txt" (title, blank line, body) in the working directory.
for url_id, url in zip(df['URL_ID'], df['URL']):
    title, article_text = extract_article_text(url)

    if not (title and article_text):
        # Either the request failed (already reported) or the page was empty.
        print(f"No article saved for {url_id} ({url})")
        continue

    # Save the extracted data to a text file
    output_file = f"{url_id}.txt"
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"{title}\n\n{article_text}")

    print(f"Data extracted from {url} and saved to {output_file}")

print("Extraction process completed.")


Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved to blackassign0001.txt
Data extracted from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved to blackassign0002.txt
Data extracted from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved to blackassign0003.txt
Data extracted from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved to blackassign0004.txt
Data extracted from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved to blackassign0005.txt
Data extracted from https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/ and saved to blackassi

In [5]:
from google.colab import files
from zipfile import ZipFile

# Names of the article files expected in the working directory:
# blackassign0001.txt ... blackassign0100.txt
file_names = [f"blackassign{i:04d}.txt" for i in range(1, 101)]

# Destination of the combined archive
zip_file_path = "/content/output_files.zip"


def _add_if_present(archive, file_name):
    """Write file_name into the open archive, reporting it when it is missing."""
    try:
        archive.write(file_name)
    except FileNotFoundError:
        print(f"Skipping {file_name} - File not found.")


# Bundle every article that exists into a single zip
with ZipFile(zip_file_path, 'w') as zip_file:
    for file_name in file_names:
        _add_if_present(zip_file, file_name)

# Hand the archive to the browser for download
files.download(zip_file_path)


Skipping blackassign0036.txt - File not found.
Skipping blackassign0049.txt - File not found.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import re
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Download the NLTK data used below: the 'stopwords' corpus read through
# stopwords.words('english'), and 'punkt' — presumably the tokenizer models
# behind word_tokenize / sent_tokenize (confirm for the installed NLTK version).
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Compute the sentiment and readability metrics for one article
def perform_textual_analysis(article_text):
    """Return sentiment and readability metrics for ``article_text``.

    The text is tokenized with NLTK; alphabetic tokens that are not English
    stop words are lower-cased and form the analysed word list.

    Args:
        article_text: Raw article text. An empty string is allowed and yields
            zero for every metric.

    Returns:
        dict mapping metric name to value, with the keys 'Positive Score',
        'Negative Score', 'Polarity Score', 'Subjectivity Score',
        'Average Sentence Length', 'Percentage of Complex Words', 'Fog Index',
        'Average Number of Words Per Sentence', 'Complex Word Count',
        'Word Count', 'Syllable Per Word', 'Personal Pronouns' and
        'Average Word Length' (in that order).
    """
    # Tokenize the words and sentences
    tokens = word_tokenize(article_text)
    sentences = sent_tokenize(article_text)

    # Personal pronouns are counted on the raw tokens: pronouns such as
    # "i", "we" and "my" are themselves common stop words, so counting after
    # stop-word removal would miss them. All-caps "US" is skipped because it
    # is almost always the country rather than the pronoun.
    pronoun_set = {"i", "we", "my", "ours", "us"}
    personal_pronouns = sum(
        1 for token in tokens if token != "US" and token.lower() in pronoun_set
    )

    # Remove stopwords and punctuations
    stop_words = set(stopwords.words('english'))
    words = [
        token.lower()
        for token in tokens
        if token.isalpha() and token.lower() not in stop_words
    ]

    # Positive and Negative Dictionaries (You may need to customize these dictionaries)
    positive_words = {"positive", "good", "happy"}
    negative_words = {"negative", "bad", "unhappy"}

    positive_score = 0
    negative_score = 0
    complex_word_count = 0
    total_syllables = 0
    total_characters = 0

    for word in words:
        if word in positive_words:
            positive_score += 1
        elif word in negative_words:
            negative_score += 1

        # Syllable estimate: one syllable per run of consecutive vowels
        # (words are already lower-case).
        syllables = len(re.findall(r'[aeiou]+', word))
        total_syllables += syllables

        # A complex word is one with more than two syllables.
        if syllables > 2:
            complex_word_count += 1

        total_characters += len(word)

    word_count = len(words)
    sentence_count = len(sentences)

    # The small constant keeps the sentiment ratios defined when nothing matched.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Guard every remaining ratio so empty or stop-word-only text returns 0
    # instead of raising ZeroDivisionError.
    avg_sentence_length = word_count / sentence_count if sentence_count else 0
    percentage_complex_words = complex_word_count / word_count if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length
    syllables_per_word = total_syllables / word_count if word_count else 0
    avg_word_length = total_characters / word_count if word_count else 0

    # Return the results
    return {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        'Average Number of Words Per Sentence': avg_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllable Per Word': syllables_per_word,
        'Personal Pronouns': personal_pronouns,
        'Average Word Length': avg_word_length,
    }

# Read the input data (URL_ID and URL). The file name matches the one used by
# the extraction cell; Colab's filesystem is case-sensitive.
input_data = pd.read_excel('/content/Input.xlsx')

# Collect one record per article and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
records = []
for url_id, url in zip(input_data['URL_ID'], input_data['URL']):
    # Read the article text from the corresponding file
    file_path = f"{url_id}.txt"
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            article_text = file.read()
    except FileNotFoundError:
        # Articles that could not be scraped have no text file.
        print(f"Skipping {file_path} - File not found.")
        continue

    # Perform textual analysis
    analysis_results = perform_textual_analysis(article_text)
    records.append({'URL_ID': url_id, 'URL': url, **analysis_results})

output_data = pd.DataFrame(records)

# Save the results to an Excel file
output_data.to_excel('output_results.xlsx', index=False)


In [6]:
from google.colab import drive
# Mount Google Drive at /content/drive; the analysis cell below reads its
# inputs from /content/drive/MyDrive/NLP.
drive.mount('/content/drive')


Mounted at /content/drive


In [16]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import os
import re

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')

# Load the input data from the extracted text files
input_folder = "/content/drive/MyDrive/NLP/extracted_files"
output_structure_file = "/content/drive/MyDrive/NLP/Output Data Structure.xlsx"

# Read the output structure file
output_structure_df = pd.read_excel(output_structure_file)

# Create lists to store calculated values
positive_scores = []
negative_scores = []
polarity_scores = []
subjectivity_scores = []
avg_sentence_lengths = []
percentage_complex_words = []
fog_indices = []
avg_words_per_sentence = []
complex_word_counts = []
word_counts = []
syllable_per_words = []
personal_pronouns = []
avg_word_lengths = []


def _load_word_set(path):
    """Read a one-entry-per-line word list into a set of lower-cased words.

    The file is decoded as UTF-8 first and as latin-1 if that fails.
    Surrounding whitespace is stripped and blank lines are dropped. Entries
    are lower-cased because the analysis compares them against lower-cased
    tokens, so an entry containing capitals could otherwise never match.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
    except UnicodeDecodeError:
        # If decoding as UTF-8 fails, try another encoding
        with open(path, 'r', encoding='latin-1') as f:
            lines = f.read().splitlines()
    return {line.strip().lower() for line in lines if line.strip()}


# Load stop words from "StopWords_Auditor.txt" only
stop_words_file_path = "/content/drive/MyDrive/NLP/StopWords/StopWords_Auditor.txt"
stop_words = _load_word_set(stop_words_file_path)

# Load positive and negative words
positive_words = _load_word_set("/content/drive/MyDrive/NLP/MasterDictionary/positive-words.txt")

negative_words_file_path = "/content/drive/MyDrive/NLP/MasterDictionary/negative-words.txt"
negative_words = _load_word_set(negative_words_file_path)

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (a, e, i, o, u, y; case-insensitive)
    counts as one syllable. A trailing "es" or "ed" removes one syllable when
    the word has more than one. The result is never less than 1.

    Args:
        word: The word to analyse.

    Returns:
        int: The estimated syllable count (at least 1, also for "").
    """
    vowels = set("aeiouy")
    lowered = word.lower()

    count = 0
    # Track the previous character with a boolean. Comparing against an
    # initial '' does not work because '' is "in" every string, which would
    # make a leading vowel look like the continuation of a vowel run.
    prev_is_vowel = False
    for char in lowered:
        is_vowel = char in vowels
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel

    # Handling exceptions: a final "es"/"ed" is usually silent
    if lowered.endswith(('es', 'ed')) and count > 1:
        count -= 1

    return max(1, count)

# Function to calculate average word length
def avg_word_length(text):
    """Return the mean number of characters per word in ``text``.

    The text is split with ``word_tokenize``. Tokens with no letter or digit
    (punctuation such as "," or ".") are ignored, so they neither count as
    words nor add characters.

    Args:
        text: The text to analyse.

    Returns:
        The average word length, or 0 when the text contains no words.
    """
    word_tokens = [
        token for token in word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    if not word_tokens:
        return 0
    return sum(len(token) for token in word_tokens) / len(word_tokens)

# Metric columns of the output sheet, in the order they are written
METRIC_COLUMNS = [
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

PERSONAL_PRONOUN_PATTERN = re.compile(r'\b(?:I|we|my|ours|us)\b', flags=re.IGNORECASE)


def _compute_metrics(text):
    """Return a dict with one value per METRIC_COLUMNS entry for ``text``."""
    # Tokenize sentences and words
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Clean text: lower-case alphanumeric tokens that are not stop words
    cleaned_words = [
        word.lower() for word in words
        if word.isalnum() and word.lower() not in stop_words
    ]
    word_count = len(cleaned_words)

    positive_score = sum(1 for word in cleaned_words if word in positive_words)
    negative_score = sum(1 for word in cleaned_words if word in negative_words)

    # Count syllables once per word and reuse the result below
    syllable_counts = [count_syllables(word) for word in cleaned_words]
    complex_word_count = sum(1 for syllables in syllable_counts if syllables > 2)

    avg_sentence_length = len(words) / len(sentences) if len(sentences) != 0 else 0
    percentage_complex = complex_word_count / word_count if word_count != 0 else 0

    # All-caps "US" is almost always the country, not the pronoun "us"
    personal_pronoun_count = sum(
        1 for match in PERSONAL_PRONOUN_PATTERN.finditer(text)
        if match.group() != 'US'
    )

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': (positive_score - negative_score) / (positive_score + negative_score + 0.000001),
        'SUBJECTIVITY SCORE': (positive_score + negative_score) / (word_count + 0.000001),
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex,
        'FOG INDEX': 0.4 * (avg_sentence_length + percentage_complex),
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': sum(syllable_counts) / word_count if word_count != 0 else 0,
        'PERSONAL PRONOUNS': personal_pronoun_count,
        'AVG WORD LENGTH': avg_word_length(text),
    }


# Iterate over each text file in the input folder. Every record carries the
# URL_ID taken from the file name ("<URL_ID>.txt"), so the metrics can be
# joined to the right row later. os.listdir returns entries in arbitrary
# order, and articles that failed to download have no file at all, so
# positional alignment with the structure sheet would mislabel rows.
records = []
for file_name in sorted(os.listdir(input_folder)):
    url_id, extension = os.path.splitext(file_name)
    if extension.lower() != '.txt':
        continue  # ignore sub-folders and unrelated files

    file_path = os.path.join(input_folder, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        print(f"Skipping {file_name} - File not found.")
        continue

    records.append({'URL_ID': url_id, **_compute_metrics(text)})

# Create DataFrame with calculated values, keyed by URL_ID
metrics_df = pd.DataFrame(records, columns=['URL_ID'] + METRIC_COLUMNS)

# Keep every row of the structure sheet; rows without an article get empty metrics
structure_df = output_structure_df[['URL_ID', 'URL']].copy()
structure_df['URL_ID'] = structure_df['URL_ID'].astype(str)
output_data = structure_df.merge(metrics_df, on='URL_ID', how='left')

missing_ids = output_data.loc[output_data['WORD COUNT'].isna(), 'URL_ID'].tolist()
if missing_ids:
    print(f"No extracted text found for: {', '.join(missing_ids)}")

# Save the output DataFrame to an Excel file
output_data.to_excel("/content/drive/MyDrive/NLP/output_data.xlsx", index=False)

print("Data analysis and output file generation completed.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data analysis and output file generation completed.
