# Import All Required Dependencies

In [8]:
import requests
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize
from textblob import TextBlob
from string import punctuation
from unidecode import unidecode
import contractions
import pandas as pd 

# Data Collection

In [10]:
df=pd.read_excel("/content/sample_data/Input.xlsx")
df.sample(5)

Unnamed: 0,URL_ID,URL
17,54,https://insights.blackcoffer.com/all-you-need-...
21,58,https://insights.blackcoffer.com/environmental...
3,40,https://insights.blackcoffer.com/will-machine-...
104,141,https://insights.blackcoffer.com/impact-of-cov...
69,106,https://insights.blackcoffer.com/contribution-...


# Create Function To Perform Text Analysis 

### 1) Count Syllables Word

In [38]:
def count_syllables(word):
    d = cmudict.dict()
    try:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except KeyError:
        return 0

### 2)Count Complex Words

In [39]:
def count_complex_words(text):
    words = word_tokenize(text)
    count = 0
    for word in words:
        syllables = count_syllables(word)
        if syllables is not None and syllables >= 3:
            count += 1
    return count

### 3)Count Words

In [40]:
def count_words(text):
    words = word_tokenize(text)
    return len(words)

### 4)Average Word Length

In [41]:
def average_word_length(text):
    words = word_tokenize(text)
    total_length = 0
    for word in words:
        total_length += len(word)
    return total_length / len(words)

### 5)Fog Index

In [42]:
def fog_index(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    avg_sent_len = len(words) / len(sentences)
    complex_word_percent = count_complex_words(text) / len(words)
    fog = 0.4 * (avg_sent_len + 100 * complex_word_percent)
    return fog

### 6)Personal Pronouns

In [43]:
def personal_pronouns(text):
    words = word_tokenize(text)
    count = 0
    personal_pronouns = ['I','Me','You','He','Him','She','Her','It','We','Us','They','Them','Mine','Yours','His','Hers','Its','Ours','Theirs','Myself','Yourself','Himself','Herself','Itself','Ourselves','Themselves']
    for word in words:
        if word.lower() in personal_pronouns:
            count += 1
    return count

# Data Cleaning

### 1)Remove Space

In [44]:
def remove_space(data):
    clean_text = data.replace('\\n',' ').replace('\t',' ').replace('\\'," ")
    return clean_text


### 2)Get Expanded

In [45]:
def get_expanded(data):
    clean_text = contractions.fix(data)
    return clean_text

### 3)Get Handle Accented

In [46]:
def get_handle_accented(data):
    clean_text=unidecode(data)
    return clean_text

### 4)Remove HTML Tags

In [6]:
def get_remove_tag(text):
    text = re.sub(r'<.*?>', '', text)    # Remove HTML tags
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text

### 5) Remove Stopword

In [47]:
stopword_list = stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('nor')
stopword_list.remove('not')

def get_clean_data(data):
    token= word_tokenize(data)
    clean_text=[word.lower() for word in token if (word not in punctuation) and (word.lower() not in stopword_list) and (len(word)>2) and (word.isalpha())]
    return clean_text


# Data Extraction And Text Analysis

In [None]:
data = []                                #Store data in List
for index, row in df.iterrows():
    url = row["URL"]
    url_id = row["URL_ID"]

    
    page = requests.get(url)                            # Request the HTML page
    soup = BeautifulSoup(page.content, "html.parser")

    
    title = soup.find("title").get_text()               # Extract the article title and text
    text = soup.find("article").get_text()
    
    
    clean_space= remove_space(text)                     
    expanded_text=get_expanded(clean_space)
    accented_clean = get_handle_accented(expanded_text)   # Calling function 
                                                          # To perfom Data cleaning
    remove_tag =get_remove_tag(accented_clean)
    text_clean = get_clean_data(remove_tag)
    text = ", ".join(text_clean)


    
    with open(f"{url_id}.txt", "w") as file:
        file.write(title + "\n\n" + "".join(text))      # Save the Clean article to text file
        
        
    blob = TextBlob(text)                               # for text processing used TextBlob Library and perform sentiment analysis
    sentiment = SentimentIntensityAnalyzer().polarity_scores(text)

    
    # Calculate  all the Variable
    
    result = {
        "URL_ID": url_id,
        "URL":url,
        "Positive_Score": sentiment['pos'],
        "Negative_Score": sentiment['neg'],
        "Polarity_Score": sentiment['compound'],
        "Subjectivity_Score": blob.sentiment.subjectivity,
        "Avg_Sentence_Length": len(word_tokenize(text)) / len(sent_tokenize(text)),
        "Percent_Complex_Words": count_complex_words(text) / count_words(text),
        "Fog_Index": fog_index(text),
        "Avg_Num_Words_Per_Sentence": count_words(text) / len(sent_tokenize(text)),
        "Complex_Word_Count": count_complex_words(text),
        "Word_Count": count_words(text),
        "Syllables_Per_Word": sum(count_syllables(word) for word in word_tokenize(text)) / count_words(text),
        "Personal_Pronouns": personal_pronouns(text),
        "Avg_Word_Length": average_word_length(text)
    }
    
    data.append(result)
    print(url_id)
    
output_df = pd.DataFrame(data)  # Converted into the DataFrame

In [53]:
data

[{'URL_ID': 37.0,
  'URL': 'https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/',
  'Positive_Score': 0.162,
  'Negative_Score': 0.051,
  'Polarity_Score': 0.9993,
  'Subjectivity_Score': 0.44298277387234436,
  'Avg_Sentence_Length': 2719.0,
  'Percent_Complex_Words': 0.1677087164398676,
  'Fog_Index': 1094.3083486575947,
  'Avg_Num_Words_Per_Sentence': 2719.0,
  'Complex_Word_Count': 456,
  'Word_Count': 2719,
  'Syllables_Per_Word': 1.0842221404928283,
  'Personal_Pronouns': 0,
  'Avg_Word_Length': 4.007355645457889},
 {'URL_ID': 38.0,
  'URL': 'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/',
  'Positive_Score': 0.2,
  'Negative_Score': 0.095,
  'Polarity_Score': 0.9983,
  'Subjectivity_Score': 0.40098011363636343,
  'Avg_Sentence_Length': 1831.0,
  'Percent_Complex_Words': 0.12889131622064445,
  'Fog_Index': 737.5556526488258,
  'Avg_Num_Words_Per_Sentence': 1831.0,
  'Complex_Word_Count': 236,
  'Word_Count': 1831,
  'S

### Output of Our Text Analysis with each variable output

In [55]:
output_df

Unnamed: 0,URL_ID,URL,Positive_Score,Negative_Score,Polarity_Score,Subjectivity_Score,Avg_Sentence_Length,Percent_Complex_Words,Fog_Index,Avg_Num_Words_Per_Sentence,Complex_Word_Count,Word_Count,Syllables_Per_Word,Personal_Pronouns,Avg_Word_Length
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,0.162,0.051,0.9993,0.442983,2719.0,0.167709,1094.308349,2719.0,456,2719,1.084222,0,4.007356
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,0.2,0.095,0.9983,0.40098,1831.0,0.128891,737.555653,1831.0,236,1831,0.969962,0,3.761333


# Convert DataFrame into excel File without index

In [57]:
output_df.to_excel("Output(Ankit_Mahalle).xlsx",index=False)