In [1]:
#import the necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
import re
from newspaper import Article
import os
import nltk
from nltk import sent_tokenize, word_tokenize

In [3]:
#Read the URLs from the Input.xlsx and store in a list
input_data = pd.read_excel('Input.xlsx')
url_ids = input_data['URL_ID'].tolist()
urls = input_data['URL'].tolist()

# Check if all URL_IDs and URLs are read successfully
print(len(url_ids))
print(len(urls))
print(url_ids[:5]) # Print first 5 URL_IDs to check if the list is correct
print(urls[:5]) # Print first 5 URLs to check if the list is correct

114
114
[37.0, 38.0, 39.0, 40.0, 41.0]
['https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/', 'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/', 'https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/', 'https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/', 'https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/']


In [3]:
def get_article_text(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        paragraphs = article.text.split('\n')
        article_text = '\n'.join([p.strip() for p in paragraphs if p.strip() != ''])
        article_title = article.title
        return article_title, article_text
    except Exception as e:
        print(f"Error in processing URL {url}: {e}")
        return None, None

In [5]:
for url_id, url in zip(url_ids, urls):
    title, text = get_article_text(url)
    if title is None or text is None:
        continue
    filename = str(int(url_id))
    with open(filename + ".txt", "w", encoding="utf-8") as f:
        f.write(title + "\n" + text)

Error in processing URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: Article `download()` failed with 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/ on URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error in processing URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: Article `download()` failed with 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/ on URL https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Error in processing URL https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/: Article `download()` failed with 404 Client Error: Not Found for url: https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/ on URL

In [4]:
# Create a list to store the rows of the dataframe
rows = []

# Iterate over the file names
for file_name in os.listdir():
    # Check if the file is a text file
    if file_name.endswith(".txt"):
        # Get the URL_ID from the file name by removing the '.txt' extension
        url_id = file_name[:-4]
        # Read the contents of the text file
        if url_id in str(url_ids):
            with open(file_name, "r", encoding="utf-8") as f:
                contents = f.read()
            
        # Split the contents into title and text
            title, text = contents.split("\n", 1)
        
        # Append the row to the list of rows
            rows.append([int(url_id), title, text])

# Sort the list of rows based on the URL_ID column
rows = sorted(rows, key=lambda x: x[0])

In [5]:
# Create a dataframe from the list of rows
df = pd.DataFrame(rows, columns=['URL_ID', 'Title', 'Text'])

In [6]:
df.to_excel("data.xlsx", index=False)

In [159]:
df = pd.read_excel("data.xlsx")

In [160]:
df

Unnamed: 0,URL_ID,Title,Text
0,37,AI in healthcare to Improve Patient Outcomes,Introduction\n“If anything kills over 10 milli...
1,38,What if the Creation is Taking Over the Creator?,"Human minds, a fascination in itself carrying ..."
2,39,What Jobs Will Robots Take From Humans in The ...,Introduction\nAI is rapidly evolving in the em...
3,40,Will Machine Replace The Human in the Future o...,“Anything that could give rise to smarter-than...
4,41,Will AI Replace Us or Work With Us?,“Machine intelligence is the last invention th...
...,...,...,...
106,146,Blockchain for Payments,Reconciling with the financial realities of an...
107,147,The future of Investing,What Is an Investment?\nAn investment is a res...
108,148,Big Data Analytics in Healthcare,Quality and affordable healthcare is a vision ...
109,149,Business Analytics In The Healthcare Industry,Analytics is a statistical scientific process ...


In [161]:
df["Text"][0]

'Introduction\n“If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war. Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak. When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population. This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities. For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research.\nAfter the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic. It was quick to realise AI’s ability to analyse large chunks of data could help in detecti

## Text Preprocessing

#### Normalizing

In [162]:
text = df["Text"][0]

In [163]:
text = text.lower()

In [164]:
re_special_char1 = r'\n'

In [165]:
re_special_char2 = r"[^a-zA-Z|\s]"

In [166]:
text = re.sub(re_special_char1, "|", text)

In [167]:
text = re.sub(re_special_char2, "", text)

In [168]:

text_list = text.split(" ")
text = " ".join(text_list[:-11])

print(text)

introduction|if anything kills over  million people in the next few decades it will be a highly infectious virus rather than a war not missiles but microbes bill gatess remarks at a ted conference in  right after the world had avoided the ebola outbreak when the new unprecedented invisible virus hit us it met an overwhelmed and unprepared healthcare system and oblivious population this public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities for the past few years artificial intelligence has proven to be of tangible potential in the healthcare sectors clinical practices translational medical and biomedical research|after the first case was detected in china on december st  it was an ai program developed by bluedot that alerted the world about the pandemic it was quick to realise ais ability to analyse large chunks of data could help in detecting patterns and identifying and tra

#### Stop Word

In [169]:
def read_stop_words(filename):
    with open(filename, "r") as file:
        stop_words = file.read().splitlines()
    return [word.lower() for word in stop_words]

In [170]:
stop_words = []
for filename in ["StopWords_Auditor.txt", "StopWords_Currencies.txt", "StopWords_DatesandNumbers.txt", "StopWords_Generic.txt", "StopWords_GenericLong.txt", "StopWords_Geographic.txt", "StopWords_Names.txt"]:
    stop_words += read_stop_words(filename)

In [171]:
stop_words

['ernst',
 'young',
 'deloitte',
 'touche',
 'kpmg',
 'pricewaterhousecoopers',
 'pricewaterhouse',
 'coopers',
 'afghani  | afghanistan ',
 'ariary | madagascar ',
 'baht | thailand ',
 'balboa | panama ',
 'birr | ethiopia ',
 'bolivar | venezuela ',
 'boliviano  | bolivia ',
 'cedi | ghana ',
 'colon  | costa rica ',
 'córdoba  | nicaragua ',
 'dalasi | gambia ',
 'denar | macedonia (former yug. rep.) ',
 'dinar | algeria ',
 'dirham  | morocco ',
 'dobra | são tom and príncipe ',
 'dong | vietnam ',
 'dram | armenia ',
 'escudo  | cape verde ',
 'euro  | belgium ',
 'florin | aruba ',
 'forint | hungary ',
 'gourde | haiti ',
 'guarani | paraguay ',
 'gulden | netherlands antilles ',
 'hryvnia  | ukraine ',
 'kina | papua new guinea ',
 'kip | laos ',
 'konvertibilna marka  | bosnia-herzegovina ',
 'koruna  | czech republic ',
 'krona | sweden ',
 'krone | denmark ',
 'kroon | estonia ',
 'kuna | croatia ',
 'kwacha | zambia ',
 'kwanza | angola ',
 'kyat | myanmar ',
 'lari | geor

In [172]:
text = text.split()
print(text)

['introduction|if', 'anything', 'kills', 'over', 'million', 'people', 'in', 'the', 'next', 'few', 'decades', 'it', 'will', 'be', 'a', 'highly', 'infectious', 'virus', 'rather', 'than', 'a', 'war', 'not', 'missiles', 'but', 'microbes', 'bill', 'gatess', 'remarks', 'at', 'a', 'ted', 'conference', 'in', 'right', 'after', 'the', 'world', 'had', 'avoided', 'the', 'ebola', 'outbreak', 'when', 'the', 'new', 'unprecedented', 'invisible', 'virus', 'hit', 'us', 'it', 'met', 'an', 'overwhelmed', 'and', 'unprepared', 'healthcare', 'system', 'and', 'oblivious', 'population', 'this', 'public', 'health', 'emergency', 'demonstrated', 'our', 'lack', 'of', 'scientific', 'consideration', 'and', 'underlined', 'the', 'alarming', 'need', 'for', 'robust', 'innovations', 'in', 'our', 'health', 'and', 'medical', 'facilities', 'for', 'the', 'past', 'few', 'years', 'artificial', 'intelligence', 'has', 'proven', 'to', 'be', 'of', 'tangible', 'potential', 'in', 'the', 'healthcare', 'sectors', 'clinical', 'practice

In [173]:
text = [word for word in text if word not in stop_words]

In [174]:
def text_preprocessing(text):
    #Replace \n at end of sentence with |
    text = re.sub(re_special_char1, '|', text)
    #Normalizing the case
    text = text.lower()
    #Remove the special characters
    text = re.sub(re_special_char2, "", text)
    text_list = text.split(" ")
    text = " ".join(text_list[:-11])
    #Word Tokenization
    text = text.split()
    #Stopword removal 
    text = [word for word in text if word not in stop_words]
    #Joining text
    text = " ".join(text)
    return text

In [175]:
df["Text"]= df["Text"].apply(text_preprocessing)

In [176]:
df

Unnamed: 0,URL_ID,Title,Text
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...
...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...
107,147,The future of Investing,investment|an investment resource thing procur...
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...


#### Master Dictionary

In [177]:
#reading the two files that contain the positive and negative words, and create two lists to store the words
def read_words(filename):
    with open(filename, "r") as file:
        words = file.read().splitlines()
    return [word.lower() for word in words]

In [178]:
positive_words = read_words("positive-words.txt")
negative_words = read_words("negative-words.txt")

### Extracting Derived Variables

In [179]:
#compute the positive and negative scores of each text in your data
def compute_scores(text):
    positive_score = 0
    negative_score = 0
    for word in text.split():
        if word in positive_words:
            positive_score += 1
        if word in negative_words:
            negative_score += -1
    return {'positive_score': positive_score, 'negative_score': negative_score * -1}

In [180]:
df[['positive_score', 'negative_score']] = df['Text'].apply(compute_scores).apply(pd.Series)

In [181]:
df

Unnamed: 0,URL_ID,Title,Text,positive_score,negative_score
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...,58,30
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...,55,35
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...,64,34
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...,59,21
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...,54,24
...,...,...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...,20,25
107,147,The future of Investing,investment|an investment resource thing procur...,33,10
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...,26,41
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...,35,4


In [182]:
def compute_polarity_score(positive_score, negative_score):
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    return polarity_score

In [183]:
df['polarity_score'] = df.apply(lambda x: compute_polarity_score(x['positive_score'], x['negative_score']), axis=1)

In [184]:
def compute_subjectivity_score(text, positive_score, negative_score):
    total_words = len(text.split())
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    return subjectivity_score

In [185]:
subjectivity_scores = df['Text'].apply(lambda x: compute_subjectivity_score(x, compute_scores(x)['positive_score'], compute_scores(x)['negative_score']))
df['subjectivity_score'] = subjectivity_scores

In [186]:
df

Unnamed: 0,URL_ID,Title,Text,positive_score,negative_score,polarity_score,subjectivity_score
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...,58,30,0.318182,0.095032
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...,55,35,0.222222,0.165746
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...,64,34,0.306122,0.122653
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...,59,21,0.475000,0.132890
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...,54,24,0.384615,0.105978
...,...,...,...,...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...,20,25,-0.111111,0.110024
107,147,The future of Investing,investment|an investment resource thing procur...,33,10,0.534884,0.063988
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...,26,41,-0.223881,0.120939
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...,35,4,0.794872,0.107143


### Complex Word Count & Analysis of Readability

In [187]:
# nltk.download('punkt')


In [188]:
def count_syllables(word):
    vowels = 'aeiouAEIOU'
    syllable_count = 0
    word = word.lower()
    if word[0] in vowels:
        syllable_count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            syllable_count += 1
    if word.endswith('e'):
        syllable_count -= 1
    if syllable_count == 0:
        syllable_count += 1
    return syllable_count



In [189]:
def compute_readability(text):
    avg_sentence_length = 0
    complex_words = 0
    sentences = text.split("|")
    total_words = len(nltk.word_tokenize(text))
    avg_sentence_length = total_words / len(sentences)
    words = nltk.word_tokenize(text)
    for word in words:
        syllable_count = count_syllables(word)
        if syllable_count > 2:
            complex_words += 1
    
    percentage_of_complex_words = complex_words / total_words
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)
    
    return {'Avg Sentence Length': avg_sentence_length, 'Percentage of Complex words': percentage_of_complex_words, 'Fog Index': fog_index}

In [190]:
df[['Avg Sentence Length', 'Percentage of Complex words', 'Fog Index']] = df.apply(lambda x: compute_readability(x['Text']), axis=1, result_type="expand")

### Average Number of Words Per Sentence

In [191]:
def avg_words_per_sentence(text):
    sentences = text.split("|")
    words = len(text.split())
    avg_words = words/len(sentences)
    return avg_words

In [192]:
df['Avg Words Per Sentence'] = df['Text'].apply(lambda x: avg_words_per_sentence(x))

### Complex Words

In [193]:
def count_complex_words(text):
    complex_words = 0
    words = nltk.word_tokenize(text)
    for word in words:
        syllable_count = count_syllables(word)
        if syllable_count > 2:
            complex_words += 1
    return complex_words

In [194]:
df['Complex word count'] = df['Text'].apply(lambda x: count_complex_words(x))

In [195]:
df

Unnamed: 0,URL_ID,Title,Text,positive_score,negative_score,polarity_score,subjectivity_score,Avg Sentence Length,Percentage of Complex words,Fog Index,Avg Words Per Sentence,Complex word count
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...,58,30,0.318182,0.095032,28.060606,0.451404,11.404804,28.060606,418
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...,55,35,0.222222,0.165746,36.200000,0.395948,14.638379,36.200000,215
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...,64,34,0.306122,0.122653,28.535714,0.468085,11.601520,28.535714,374
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...,59,21,0.475000,0.132890,17.705882,0.377076,7.233184,17.705882,227
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...,54,24,0.384615,0.105978,19.368421,0.421196,7.915847,19.368421,310
...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...,20,25,-0.111111,0.110024,22.722222,0.408313,9.252214,22.722222,167
107,147,The future of Investing,investment|an investment resource thing procur...,33,10,0.534884,0.063988,19.764706,0.403274,8.067192,19.764706,271
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...,26,41,-0.223881,0.120939,25.181818,0.393502,10.230128,25.181818,218
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...,35,4,0.794872,0.107143,21.411765,0.530220,8.776794,21.411765,193


### Word Count & Syllable Count

In [196]:
def word_count(text):
    words = word_tokenize(text)
    return len(words)

In [197]:
df['Word Count'] = df['Text'].apply(word_count)

In [198]:
def count_syllables_per_word(text):
    words = word_tokenize(text)
    syllables_per_word = [count_syllables(word) for word in words]
    return syllables_per_word

In [199]:
df['Syllables per Word'] = df['Text'].apply(count_syllables_per_word)

In [200]:
df

Unnamed: 0,URL_ID,Title,Text,positive_score,negative_score,polarity_score,subjectivity_score,Avg Sentence Length,Percentage of Complex words,Fog Index,Avg Words Per Sentence,Complex word count,Word Count,Syllables per Word
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...,58,30,0.318182,0.095032,28.060606,0.451404,11.404804,28.060606,418,926,"[5, 1, 1, 3, 1, 3, 2, 1, 3, 3, 2, 2, 3, 1, 3, ..."
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...,55,35,0.222222,0.165746,36.200000,0.395948,14.638379,36.200000,215,543,"[2, 1, 4, 2, 3, 3, 2, 1, 1, 4, 2, 2, 2, 2, 4, ..."
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...,64,34,0.306122,0.122653,28.535714,0.468085,11.601520,28.535714,374,799,"[5, 2, 3, 3, 2, 2, 3, 3, 2, 2, 4, 4, 3, 3, 4, ..."
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...,59,21,0.475000,0.132890,17.705882,0.377076,7.233184,17.705882,227,602,"[1, 1, 5, 4, 1, 4, 4, 4, 4, 6, 2, 4, 4, 1, 1, ..."
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...,54,24,0.384615,0.105978,19.368421,0.421196,7.915847,19.368421,310,736,"[2, 4, 3, 3, 3, 3, 1, 1, 4, 4, 3, 2, 1, 2, 2, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...,20,25,-0.111111,0.110024,22.722222,0.408313,9.252214,22.722222,167,409,"[4, 3, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 2, 2, 1, ..."
107,147,The future of Investing,investment|an investment resource thing procur...,33,10,0.534884,0.063988,19.764706,0.403274,8.067192,19.764706,271,672,"[4, 3, 2, 1, 3, 3, 3, 1, 4, 5, 3, 1, 2, 3, 3, ..."
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...,26,41,-0.223881,0.120939,25.181818,0.393502,10.230128,25.181818,218,554,"[2, 3, 2, 2, 3, 1, 1, 3, 2, 1, 3, 4, 4, 2, 4, ..."
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...,35,4,0.794872,0.107143,21.411765,0.530220,8.776794,21.411765,193,364,"[3, 4, 3, 2, 4, 3, 3, 2, 1, 2, 3, 3, 2, 1, 4, ..."


### Personal Pronouns

In [201]:
def count_personal_pronouns(text):
    # Define the regular expression pattern to match personal pronouns
    pattern = re.compile(r"\b(I|we|my|ours|us)\b", re.IGNORECASE)
    
    # Use re.findall to search for all occurrences of the pattern in the text
    matches = re.findall(pattern, text)
    
    # Return the count of personal pronouns
    return len(matches)

In [202]:
df["Personal Pronouns"] = df["Text"].apply(count_personal_pronouns)

### Average Word Length

In [203]:
def avg_word_length(text):
    words = text.split()
    return sum(len(word) for word in words) / len(words)

In [204]:
df["Average Word Length"] = df["Text"].apply(avg_word_length)

In [205]:
df

Unnamed: 0,URL_ID,Title,Text,positive_score,negative_score,polarity_score,subjectivity_score,Avg Sentence Length,Percentage of Complex words,Fog Index,Avg Words Per Sentence,Complex word count,Word Count,Syllables per Word,Personal Pronouns,Average Word Length
0,37,AI in healthcare to Improve Patient Outcomes,introduction|if kills people decades highly in...,58,30,0.318182,0.095032,28.060606,0.451404,11.404804,28.060606,418,926,"[5, 1, 1, 3, 1, 3, 2, 1, 3, 3, 2, 2, 3, 1, 3, ...",0,7.989201
1,38,What if the Creation is Taking Over the Creator?,human minds fascination carrying potential tin...,55,35,0.222222,0.165746,36.200000,0.395948,14.638379,36.200000,215,543,"[2, 1, 4, 2, 3, 3, 2, 1, 1, 4, 2, 2, 2, 2, 4, ...",0,7.364641
2,39,What Jobs Will Robots Take From Humans in The ...,introduction|ai rapidly evolving employment se...,64,34,0.306122,0.122653,28.535714,0.468085,11.601520,28.535714,374,799,"[5, 2, 3, 3, 2, 2, 3, 3, 2, 2, 4, 4, 3, 3, 4, ...",0,7.968711
3,40,Will Machine Replace The Human in the Future o...,give rise smarterthanhuman intelligence form a...,59,21,0.475000,0.132890,17.705882,0.377076,7.233184,17.705882,227,602,"[1, 1, 5, 4, 1, 4, 4, 4, 4, 6, 2, 4, 4, 1, 1, ...",0,7.528239
4,41,Will AI Replace Us or Work With Us?,machine intelligence invention humanity make|n...,54,24,0.384615,0.105978,19.368421,0.421196,7.915847,19.368421,310,736,"[2, 4, 3, 3, 3, 3, 1, 1, 4, 4, 3, 2, 1, 2, 2, ...",1,7.701087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,Blockchain for Payments,reconciling financial realities mba education ...,20,25,-0.111111,0.110024,22.722222,0.408313,9.252214,22.722222,167,409,"[4, 3, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 2, 2, 1, ...",0,8.002445
107,147,The future of Investing,investment|an investment resource thing procur...,33,10,0.534884,0.063988,19.764706,0.403274,8.067192,19.764706,271,672,"[4, 3, 2, 1, 3, 3, 3, 1, 4, 5, 3, 1, 2, 3, 3, ...",0,7.552083
108,148,Big Data Analytics in Healthcare,quality affordable healthcare vision governmen...,26,41,-0.223881,0.120939,25.181818,0.393502,10.230128,25.181818,218,554,"[2, 3, 2, 2, 3, 1, 1, 3, 2, 1, 3, 4, 4, 2, 4, ...",0,7.373646
109,149,Business Analytics In The Healthcare Industry,analytics statistical scientific process disco...,35,4,0.794872,0.107143,21.411765,0.530220,8.776794,21.411765,193,364,"[3, 4, 3, 2, 4, 3, 3, 2, 1, 2, 3, 3, 2, 1, 4, ...",0,8.343407


In [206]:
filtered_input_data = input_data[~input_data["URL_ID"].isin([44, 57, 144])]

In [207]:
df.insert(2, "URL", filtered_input_data["URL"])

In [208]:
df

Unnamed: 0,URL_ID,Title,URL,Text,positive_score,negative_score,polarity_score,subjectivity_score,Avg Sentence Length,Percentage of Complex words,Fog Index,Avg Words Per Sentence,Complex word count,Word Count,Syllables per Word,Personal Pronouns,Average Word Length
0,37,AI in healthcare to Improve Patient Outcomes,https://insights.blackcoffer.com/ai-in-healthc...,introduction|if kills people decades highly in...,58,30,0.318182,0.095032,28.060606,0.451404,11.404804,28.060606,418,926,"[5, 1, 1, 3, 1, 3, 2, 1, 3, 3, 2, 2, 3, 1, 3, ...",0,7.989201
1,38,What if the Creation is Taking Over the Creator?,https://insights.blackcoffer.com/what-if-the-c...,human minds fascination carrying potential tin...,55,35,0.222222,0.165746,36.200000,0.395948,14.638379,36.200000,215,543,"[2, 1, 4, 2, 3, 3, 2, 1, 1, 4, 2, 2, 2, 2, 4, ...",0,7.364641
2,39,What Jobs Will Robots Take From Humans in The ...,https://insights.blackcoffer.com/what-jobs-wil...,introduction|ai rapidly evolving employment se...,64,34,0.306122,0.122653,28.535714,0.468085,11.601520,28.535714,374,799,"[5, 2, 3, 3, 2, 2, 3, 3, 2, 2, 4, 4, 3, 3, 4, ...",0,7.968711
3,40,Will Machine Replace The Human in the Future o...,https://insights.blackcoffer.com/will-machine-...,give rise smarterthanhuman intelligence form a...,59,21,0.475000,0.132890,17.705882,0.377076,7.233184,17.705882,227,602,"[1, 1, 5, 4, 1, 4, 4, 4, 4, 6, 2, 4, 4, 1, 1, ...",0,7.528239
4,41,Will AI Replace Us or Work With Us?,https://insights.blackcoffer.com/will-ai-repla...,machine intelligence invention humanity make|n...,54,24,0.384615,0.105978,19.368421,0.421196,7.915847,19.368421,310,736,"[2, 4, 3, 3, 3, 3, 1, 1, 4, 4, 3, 2, 1, 2, 2, ...",1,7.701087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,Blockchain for Payments,https://insights.blackcoffer.com/impact-of-cov...,reconciling financial realities mba education ...,20,25,-0.111111,0.110024,22.722222,0.408313,9.252214,22.722222,167,409,"[4, 3, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 2, 2, 1, ...",0,8.002445
107,147,The future of Investing,,investment|an investment resource thing procur...,33,10,0.534884,0.063988,19.764706,0.403274,8.067192,19.764706,271,672,"[4, 3, 2, 1, 3, 3, 3, 1, 4, 5, 3, 1, 2, 3, 3, ...",0,7.552083
108,148,Big Data Analytics in Healthcare,https://insights.blackcoffer.com/blockchain-in...,quality affordable healthcare vision governmen...,26,41,-0.223881,0.120939,25.181818,0.393502,10.230128,25.181818,218,554,"[2, 3, 2, 2, 3, 1, 1, 3, 2, 1, 3, 4, 4, 2, 4, ...",0,7.373646
109,149,Business Analytics In The Healthcare Industry,https://insights.blackcoffer.com/blockchain-fo...,analytics statistical scientific process disco...,35,4,0.794872,0.107143,21.411765,0.530220,8.776794,21.411765,193,364,"[3, 4, 3, 2, 4, 3, 3, 2, 1, 2, 3, 3, 2, 1, 4, ...",0,8.343407


In [210]:
df = df.drop(columns=["Title", "Text"])

In [211]:
df = df.rename(columns={
    "positive_score": "POSITIVE SCORE",
    "negative_score": "NEGATIVE SCORE",
    "polarity_score": "POLARITY SCORE",
    "subjectivity_score": "SUBJECTIVITY SCORE",
    "Avg Sentence Length": "AVG SENTENCE LENGTH",
    "Percentage of Complex words": "PERCENTAGE OF COMPLEX WORDS",
    "Fog Index": "FOG INDEX",
    "Avg Words Per Sentence": "AVG NUMBER OF WORDS PER SENTENCE",
    "Complex word count": "COMPLEX WORD COUNT",
    "Word Count": "WORD COUNT",
    "Syllables per Word": "SYLLABLE PER WORD",
    "Personal Pronouns": "PERSONAL PRONOUNS",
    "Average Word Length": "AVG WORD LENGTH"
})

In [212]:
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,58,30,0.318182,0.095032,28.060606,0.451404,11.404804,28.060606,418,926,"[5, 1, 1, 3, 1, 3, 2, 1, 3, 3, 2, 2, 3, 1, 3, ...",0,7.989201
1,38,https://insights.blackcoffer.com/what-if-the-c...,55,35,0.222222,0.165746,36.200000,0.395948,14.638379,36.200000,215,543,"[2, 1, 4, 2, 3, 3, 2, 1, 1, 4, 2, 2, 2, 2, 4, ...",0,7.364641
2,39,https://insights.blackcoffer.com/what-jobs-wil...,64,34,0.306122,0.122653,28.535714,0.468085,11.601520,28.535714,374,799,"[5, 2, 3, 3, 2, 2, 3, 3, 2, 2, 4, 4, 3, 3, 4, ...",0,7.968711
3,40,https://insights.blackcoffer.com/will-machine-...,59,21,0.475000,0.132890,17.705882,0.377076,7.233184,17.705882,227,602,"[1, 1, 5, 4, 1, 4, 4, 4, 4, 6, 2, 4, 4, 1, 1, ...",0,7.528239
4,41,https://insights.blackcoffer.com/will-ai-repla...,54,24,0.384615,0.105978,19.368421,0.421196,7.915847,19.368421,310,736,"[2, 4, 3, 3, 3, 3, 1, 1, 4, 4, 3, 2, 1, 2, 2, ...",1,7.701087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,https://insights.blackcoffer.com/impact-of-cov...,20,25,-0.111111,0.110024,22.722222,0.408313,9.252214,22.722222,167,409,"[4, 3, 3, 1, 4, 1, 3, 1, 4, 1, 3, 1, 2, 2, 1, ...",0,8.002445
107,147,,33,10,0.534884,0.063988,19.764706,0.403274,8.067192,19.764706,271,672,"[4, 3, 2, 1, 3, 3, 3, 1, 4, 5, 3, 1, 2, 3, 3, ...",0,7.552083
108,148,https://insights.blackcoffer.com/blockchain-in...,26,41,-0.223881,0.120939,25.181818,0.393502,10.230128,25.181818,218,554,"[2, 3, 2, 2, 3, 1, 1, 3, 2, 1, 3, 4, 4, 2, 4, ...",0,7.373646
109,149,https://insights.blackcoffer.com/blockchain-fo...,35,4,0.794872,0.107143,21.411765,0.530220,8.776794,21.411765,193,364,"[3, 4, 3, 2, 4, 3, 3, 2, 1, 2, 3, 3, 2, 1, 4, ...",0,8.343407


In [213]:
df.to_excel("Output.xlsx", index=False)