In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the NLTK data packages 'vader_lexicon' and 'punkt'.
nltk.download('vader_lexicon')
nltk.download('punkt')
# Remove pandas' display limits so frames are rendered with all columns and rows.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [7]:
# Load the input workbook; its 'URL' column lists the pages to scrape.
excel_data=pd.read_excel('/content/drive/MyDrive/20211030 Test Assignment/Input.xlsx')
# Uncomment the next line to display the loaded rows.
#excel_data

In [8]:
# Remove the two input rows whose pages answer with HTTP 404.
# errors='ignore' keeps the cell re-runnable: running it a second time does
# not raise KeyError for labels that are already gone.
excel_data = excel_data.drop(index=[48, 35], errors='ignore')

In [9]:
def Parsing(url, timeout=30):
    """Download every page in ``url`` and parse it with BeautifulSoup.

    Args:
        url: A single URL string or an iterable of URL strings (for example
            the ``excel_data['URL']`` column).
        timeout: Seconds to wait for each HTTP response before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        list: One ``BeautifulSoup`` document per URL, in input order.

    Raises:
        requests.HTTPError: If a page answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A lone string would otherwise be iterated character by character.
    links = [url] if isinstance(url, str) else list(url)

    soups = []
    for link in links:
        response = requests.get(link, timeout=timeout)
        # Fail loudly on error pages instead of parsing them as articles.
        response.raise_for_status()
        soups.append(BeautifulSoup(response.content, 'html.parser'))
    return soups

In [10]:
def extract(soups):
    """Pull the article title and body text out of each parsed page.

    Two page layouts are handled:

    * primary: title in ``h1.entry-title`` and body in
      ``div.td-post-content.tagdiv-type``;
    * fallback: title in ``h1.tdb-title-text`` and body assembled from the
      ``<p>`` elements inside every ``div.tdb-block-inner.td-fix-index``.

    A page that matches neither layout still produces a row (with empty
    strings) so that the output stays aligned with the input, row for row.

    Args:
        soups: Iterable of parsed documents exposing ``find`` / ``find_all``.

    Returns:
        pandas.DataFrame: Columns ``'Title'`` and ``'Text'``, one row per
        document, in input order.
    """
    titles = []
    all_text = []

    for soup in soups:
        primary_head = soup.find('h1', class_="entry-title")
        primary_body = soup.find('div', class_="td-post-content tagdiv-type")

        if primary_head is not None and primary_body is not None:
            title = primary_head.text.strip()
            text = primary_body.text.strip()
        else:
            fallback_head = soup.find('h1', class_="tdb-title-text")
            # Prefer the fallback heading, then whatever the primary lookup
            # found; a missing heading must not abort the whole run.
            head = fallback_head if fallback_head is not None else primary_head
            title = head.text.strip() if head is not None else ''

            blocks = soup.find_all('div', class_='tdb-block-inner td-fix-index')
            text = ' '.join(
                paragraph.text.strip()
                for block in blocks
                for paragraph in block.find_all('p')
            )

        titles.append(title)
        all_text.append(text)

    return pd.DataFrame({'Title': titles, 'Text': all_text})


In [None]:
# Fetch and parse every URL from the input sheet, then collect each page's
# title and body text into a DataFrame with 'Title' and 'Text' columns.
df = extract(Parsing(excel_data['URL']))

In [38]:
def clean(text):
    """Return ``text`` with each run of non-letter characters replaced by one space.

    The value is passed through ``str()`` first, so non-string input is
    accepted. Only the ASCII letters A-Z and a-z are kept; digits,
    punctuation and whitespace runs each collapse to a single space.
    """
    non_letter_run = re.compile('[^A-Za-z]+')
    return non_letter_run.sub(' ', str(text))

# Letters-only version of every article body.
df['Cleaned Text'] = df['Text'].map(clean)


In [39]:
# Paths of the stop-word list files.
stopwords = [
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_Currencies.txt',
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_DatesandNumbers.txt',
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_Generic.txt',
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_GenericLong.txt',
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_Geographic.txt',
    '/content/drive/MyDrive/20211030 Test Assignment/StopWords/StopWords_Names.txt'
]

# Raw contents of each stop-word file, one string per file.
sw_list = []
for stopword in stopwords:
    # 'latin-1' is the canonical codec name; it maps every byte to a
    # character, so decoding cannot fail on unexpected bytes.
    with open(stopword, 'r', encoding='latin-1') as file:
        content = file.read()
        sw_list.append(content)

In [None]:
# Letters-only version of each stop-word file's contents.
cleaned_sw_list = [clean(text) for text in sw_list]

# Final flat list containing every individual stop word from all files.
result_list = []
for item in cleaned_sw_list:  # separate each word
    words = item.split()
    result_list.extend(words)

In [41]:
def clean_stopword(data, stop_words):
    """Remove stop words from a string or from a list of words.

    Matching is case-insensitive: both the words in ``data`` and the entries
    of ``stop_words`` are lower-cased before comparison, so an upper-case
    stop-word list still filters mixed-case text. Surviving words keep their
    original casing.

    Args:
        data: A string (split on whitespace) or a list of word strings.
        stop_words: Iterable of stop words, or a whitespace-separated string
            of them.

    Returns:
        The same type as ``data``: a string of the remaining words joined by
        single spaces, or a list of the remaining words.

    Raises:
        ValueError: If ``data`` is neither a string nor a list.
    """
    if isinstance(data, str):
        words = data.split()
    elif isinstance(data, list):
        words = data
    else:
        raise ValueError("Input data must be either a string or a list")

    if isinstance(stop_words, str):
        # Treat a plain string as a whitespace-separated word list rather
        # than iterating it character by character.
        stop_words = stop_words.split()

    # Lower-case the stop words to match the lower-cased lookup below; a set
    # makes every membership test constant-time.
    stop_set = {stop_word.lower() for stop_word in stop_words}

    cleaned_words = [word for word in words if word.lower() not in stop_set]

    if isinstance(data, str):
        return ' '.join(cleaned_words)
    return cleaned_words


In [None]:
# Strip non-letter characters first, then remove stop words. Both steps are
# chained from the raw 'Text' column so the letters-only cleaning is not lost
# and the cell gives the same result however often it is re-run.
df['Cleaned Text'] = df['Text'].apply(clean).apply(clean_stopword, stop_words=result_list)

In [None]:
# Read the positive-word dictionary file and flatten it into a list of words.
positive = '/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/positive-words.txt'
positive_words = []
PositiveWord = []
with open(positive, 'r', encoding='latin-1') as file:
    positive_word = file.read()
    # The whole file is kept as a single string element.
    positive_words.append(positive_word)
    cleaned_positive_words = list(map(clean, positive_words))

    # Split each letters-only string on whitespace and collect the words.
    for cleaned_block in cleaned_positive_words:
        PositiveWord += cleaned_block.split()


In [None]:
# Read the negative-word dictionary file and flatten it into a list of words.
negative = '/content/drive/MyDrive/20211030 Test Assignment/MasterDictionary/negative-words.txt'
negative_words = []
negativeWord = []
with open(negative, 'r', encoding='latin-1') as file:
    negative_word = file.read()
    # The whole file is kept as a single string element.
    negative_words.append(negative_word)
    cleaned_negative_words = list(map(clean, negative_words))

    # Split each letters-only string on whitespace and collect the words.
    for cleaned_block in cleaned_negative_words:
        negativeWord += cleaned_block.split()

In [None]:
# Remove every stop word in result_list from the positive and negative
# dictionaries; the list inputs come back as filtered lists.
cleaned_PositiveWords=clean_stopword(PositiveWord,result_list)
cleaned_NegativeWords=clean_stopword(negativeWord,result_list)

In [46]:
from nltk.tokenize import word_tokenize
# Tokenize each cleaned article with NLTK; any non-string cell becomes an
# empty token list instead of raising.
df['Tokenized Text'] = df['Cleaned Text'].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])

In [None]:
# Lower-cased sets give case-insensitive, constant-time dictionary lookups.
positive_lookup = {word.lower() for word in cleaned_PositiveWords}
negative_lookup = {word.lower() for word in cleaned_NegativeWords}

Positive_score = []
Negative_score = []
Polarity_score = []
Subjectivity_score = []

for words in df['Tokenized Text']:
    positivescore = 0
    negativescore = 0

    for word in words:
        token = word.lower()
        # A word listed in both dictionaries counts as positive only.
        if token in positive_lookup:
            positivescore += 1
        elif token in negative_lookup:
            negativescore += 1

    # Subjectivity is measured against ALL tokens of the cleaned text, not
    # just the ones that hit a dictionary (which would pin the score at ~1).
    totalwords = len(words)

    Negative_score.append(negativescore)
    Positive_score.append(positivescore)

    # The small constant avoids division by zero for articles without any
    # sentiment words (polarity) or without any tokens (subjectivity).
    polarityscore = (positivescore - negativescore) / ((positivescore + negativescore) + 0.000001)
    subjectivityscore = (positivescore + negativescore) / (totalwords + 0.000001)

    Polarity_score.append(polarityscore)
    Subjectivity_score.append(subjectivityscore)

df['Positive Score'] = Positive_score
df['Negative Score'] = Negative_score
df['Polarity Score'] = Polarity_score
df['Subjectivity Score'] = Subjectivity_score

In [None]:
from textstat import sentence_count, lexicon_count, syllable_count

def calculate_gunning_fog(text):
    """Compute the fog index and its two ingredients for ``text``.

    * average sentence length = word count / sentence count
    * percentage of complex words = whitespace-separated tokens with three
      or more syllables divided by the word count (a fraction, not scaled
      by 100)
    * fog index = 0.4 * (average sentence length + percentage of complex words)

    Args:
        text: The article text.

    Returns:
        tuple: ``(fog_index, average_sentence_length, percentage_complex_words)``.
        Text without any words or sentences yields ``(0.0, 0.0, 0.0)``
        instead of raising ``ZeroDivisionError``.
    """
    total_words = lexicon_count(text)
    total_sentences = sentence_count(text)
    if not total_words or not total_sentences:
        # Nothing to measure, e.g. a page whose body could not be extracted.
        return 0.0, 0.0, 0.0

    average_sentence_length = total_words / total_sentences

    complex_words = sum(1 for word in text.split() if syllable_count(word) >= 3)
    percentage_complex_words = complex_words / total_words

    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return fog_index, average_sentence_length, percentage_complex_words

# Compute the three readability values per article and spread the resulting
# tuples over their own columns, keeping the rows aligned with df.
df[['Fog Index', 'Average Sentence Length', 'Percentage of Complex Words']] = pd.DataFrame(
    df['Text'].map(calculate_gunning_fog).tolist(),
    index=df.index,
)

In [49]:
def avg_words_per_sentence(text):
    """Average number of whitespace-separated words per sentence.

    Sentences are the fragments between ``.``, ``!`` and ``?``. Blank
    fragments (after the final full stop, or inside ``...``) are ignored in
    both the word total and the sentence total, so they cannot drag the
    average down.

    Args:
        text: The article text.

    Returns:
        The average as a float, or 0 when the text holds no non-blank sentence.
    """
    sentences = [part for part in re.split(r'[.!?]', text) if part.strip()]
    if not sentences:
        return 0
    words_count = sum(len(sentence.split()) for sentence in sentences)
    return words_count / len(sentences)

def complex_word_count(text):
    """Count the whitespace-separated tokens of ``text`` with three or more syllables."""
    return sum(1 for token in text.split() if syllable_count(token) >= 3)

def personal_pronouns_count(text):
    """Count the personal pronouns that occur in ``text``.

    Pronouns are matched as whole words and case-insensitively, so
    sentence-initial forms ("We", "My") and pronouns followed by punctuation
    ("us,") are counted. Each occurrence is counted exactly once. The
    all-caps token ``US`` is skipped because it normally denotes the country
    rather than the pronoun.

    Args:
        text: The article text.

    Returns:
        int: Number of pronoun occurrences.
    """
    pronouns = (
        'I', 'me', 'my', 'mine', 'myself',
        'you', 'your', 'yours', 'yourself', 'yourselves',
        'he', 'him', 'his', 'himself',
        'she', 'her', 'hers', 'herself',
        'it', 'its', 'itself',
        'we', 'us', 'our', 'ours', 'ourselves',
        'they', 'them', 'their', 'theirs', 'themselves',
    )
    # \b on both sides restricts matches to whole words only.
    pattern = r'\b(?:' + '|'.join(pronouns) + r')\b'
    matches = re.findall(pattern, text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match != 'US')

def avg_word_length(text):
    """Mean character length of the ``\\w+`` words in ``text``; 0 when there are none."""
    lengths = [len(token) for token in re.findall(r'\b\w+\b', text)]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)

# One metric column per helper, each computed from the raw article text.
df['Avg Words per Sentence'] = df['Text'].map(avg_words_per_sentence)
df['Complex Word Count'] = df['Text'].map(complex_word_count)
df['Personal Pronouns Count'] = df['Text'].map(personal_pronouns_count)
df['Avg Word Length'] = df['Text'].map(avg_word_length)


In [50]:
def calculate_word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

# Average syllables per whitespace-separated word
def calculate_syllables_per_word(text):
    """Return total syllables divided by the number of tokens, or 0 for empty text."""
    tokens = text.split()
    if not tokens:
        return 0
    return sum(map(syllable_count, tokens)) / len(tokens)

# Add the word-count and syllables-per-word columns, computed from the raw text.
df['Word Count'] = df['Text'].map(calculate_word_count)
df['Syllables per Word'] = df['Text'].map(calculate_syllables_per_word)


In [52]:
# Work on an independent copy: a plain assignment would only alias df, so
# changes made to new_df would also alter df and break earlier cells on re-run.
new_df = df.copy()

In [53]:
# Keep only the computed metric columns. Rebinding to the returned frame
# leaves the source frame untouched, and errors='ignore' lets the cell be
# re-run after the columns have already been removed.
new_df = new_df.drop(columns=['Title', 'Text', 'Cleaned Text', 'Tokenized Text'], errors='ignore')

In [None]:
# Join the input rows with their metrics BY POSITION. concat(axis=1) aligns
# on index labels, and excel_data still carries its original labels (with
# gaps where rows were removed) while the metrics frame is numbered
# consecutively, so both get a fresh 0..n-1 index first.
if len(excel_data) != len(new_df):
    raise ValueError(
        f"Row count mismatch: {len(excel_data)} input rows vs {len(new_df)} metric rows"
    )
merged_data = pd.concat(
    [excel_data.reset_index(drop=True), new_df.reset_index(drop=True)],
    axis=1,
)
new_df=merged_data
new_df

In [None]:
from google.colab import files
# Write the results workbook and offer it as a browser download.
# DataFrame.to_excel takes no `encoding` argument in pandas 2.x (passing one
# raises TypeError), so none is given here.
new_df.to_excel('Output.xlsx')
files.download('Output.xlsx')