# Web Scraping

In [None]:
#import dependencies
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#read in the input data
input = pd.read_excel('/content/Input.xlsx')

In [None]:
input.head(10)

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...
5,42.0,https://insights.blackcoffer.com/man-and-machi...
6,43.0,https://insights.blackcoffer.com/in-future-or-...
7,44.0,https://insights.blackcoffer.com/how-neural-ne...
8,45.0,https://insights.blackcoffer.com/how-machine-l...
9,46.0,https://insights.blackcoffer.com/deep-learning...


In [None]:
#summary information on the input data
input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   URL_ID  114 non-null    float64
 1   URL     114 non-null    object 
dtypes: float64(1), object(1)
memory usage: 1.9+ KB


#Cleaning

In [None]:
#strip all spaces
input.URL = input.URL.str.strip()

In [None]:
input.URL[0] 

'https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/'

In [None]:
#convert url column to list
url_list = input.URL.values.tolist()

In [None]:
#extract articles from url using beautiful soup
html_list = []
for url in url_list:
  headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0"}
  result = requests.get(url, headers=headers)
  soup = BeautifulSoup(result.content, 'lxml')
  html_list.append(soup)

In [None]:
#extract articles titles and content and
#remove article with Nonetype article topic and article content
title_list = []
content_list = []
for html in html_list:
  titles = html.find("h1", {"class":"entry-title"})
  if titles != None:
    titles = titles.text
    title_list.append(titles)

#content
for html in html_list:
  content = html.find('div', attrs={'class': 'td-post-content'})
  if content != None:
    content = content.text
    content_list.append(content)

In [None]:
#get index of articles with 404 error(returns None for article and title)
none_index = []
for index, html in enumerate(html_list):
  titles = html.find("h1", {"class":"entry-title"})
  if titles == None:
    none_index.append(index)

# Creating Dataframe

In [None]:
#Converting to dataframe
dataframe = {'title':title_list,'text':content_list}
df = pd.DataFrame(dataframe)
df.head()

Unnamed: 0,title,text
0,AI in healthcare to Improve Patient Outcomes,\nIntroduction\n“If anything kills over 10 mil...
1,What if the Creation is Taking Over the Creator?,"\nHuman minds, a fascination in itself carryin..."
2,What Jobs Will Robots Take From Humans in The ...,\nIntroduction\nAI is rapidly evolving in the ...
3,Will Machine Replace The Human in the Future o...,\n“Anything that could give rise to smarter-th...
4,Will AI Replace Us or Work With Us?,\n“Machine intelligence is the last invention ...


In [None]:
#Combine the article title and content
df.text = df.title + ' ' + df.text

In [None]:
#drop the title column
df.drop('title', axis=1, inplace = True)

In [None]:
df.head()

Unnamed: 0,text
0,AI in healthcare to Improve Patient Outcomes \...
1,What if the Creation is Taking Over the Creato...
2,What Jobs Will Robots Take From Humans in The ...
3,Will Machine Replace The Human in the Future o...
4,Will AI Replace Us or Work With Us? \n“Machine...


In [None]:
#create a copy
df1 = df.copy()

In [None]:
#save extracted text as a text file
with open('/content/URL_ID.txt', "w") as f:
    f.write('\n'.join(df['text']))

# Data Cleaning

In [None]:
#import regex and nltk packages
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


lemma = WordNetLemmatizer()
nltk.download('stopwords')
stopword_list = stopwords.words('english')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
#Define a function for cleaning/preprocessing

def clean(text):
  #remove special characters
  text = re.sub("[^A-Za-z0-9 ]", "", text)
  #convert text to lowercase
  text = text.lower()
  #tokenize text
  tokens = nltk.word_tokenize(text)
  #remove stopwords
  text_list = []
  for token in tokens:
    if token not in stopword_list:
      #lemmatization
      text_list.append(lemma.lemmatize(token))
  return ' '.join(text_list)

In [None]:
#apply the function df.text series 
df.text = df.text.apply(clean)

In [None]:
#Load the output file
output = pd.read_excel('/content/Output Data Structure.xlsx')
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,,,,,,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,,,,,,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,,,,,,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,,,,,,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,,,,,,,,,,,,,


In [None]:
#remove the rows with no article with the none_index
output.drop(none_index, axis=0, inplace=True)
#reset index
output.reset_index(drop=True, inplace = True)

# __Sentimental Analysis__
## __1. Cleaning using Stop Words Lists__ <br/>
- Load stopword file

In [None]:
#extract country currencies and compute currency stopword list.
import csv
stp_curr = []
with open('/content/StopWords_Currencies.txt', 'r', encoding='Latin-1') as sc:
    reader = csv.reader(sc, delimiter='|')
    for row in reader:
      stp_curr.append(row[0])

In [None]:
#load and convert the auditor stopwords to a list lowercase
aud_stp = pd.read_csv('/content/StopWords_Auditor.txt', header = None)
aud_stp = list(aud_stp[0])
aud_stp = [x.lower() for x in aud_stp]


#load and convert the dates stopwords to a list lowercase
dates_stp = pd.read_csv('/content/StopWords_DatesandNumbers.txt', header = None)
dates_stp = list(dates_stp[0])
pipe_char = ['|']
dates_stpword = []
for i in dates_stp:
    count=i
    for j in pipe_char:
       i=i.replace(j,"")
    if(len(i)==len(count)):
        dates_stpword.append(i.lower())


#load and convert the generic stopwords to a list lowercase
gen_stp = pd.read_csv('/content/StopWords_Generic.txt', header = None)
gen_stp = list(gen_stp[0])
gen_stp = [x.lower() for x in gen_stp]


#load and convert the generic long stopwords to a list lowercase
genl_stp = pd.read_csv('/content/StopWords_GenericLong.txt', header = None)
genl_stp = list(genl_stp[0])
genl_stp = [x.lower() for x in genl_stp]


#load and convert the geographic stopwords to a list lowercase
geo_stp = pd.read_csv('/content/StopWords_Geographic.txt', header = None)
geo_stpword = list(geo_stp[0])
geo_stpword = [word.strip().lower() for line in geo_stpword for word in line.split('|')]


#load and convert the names stopwords to a list lowercase
names_stp = pd.read_csv('/content/StopWords_Names.txt',header = None)
name_stopword = list(names_stp[0])
name_stopword[0] = name_stopword[0].split('|')[0].strip()
name_stopword = [str(x).lower() for x in name_stopword]

In [None]:
big_stoplist = stp_curr + aud_stp + dates_stp + gen_stp + genl_stp + geo_stpword + name_stopword

__Cleaning the text with the new stopword list__

In [None]:
#definne a funcction to remove the stopwords in big_stoplist
def remove_stopword(text):
  #remove stopwords
  text_list = []
  #tokenize text
  tokens = nltk.word_tokenize(text)
  for token in tokens:
    if token not in big_stoplist:
      text_list.append(token)
  return ' '.join(text_list)

In [None]:
#apply remove_stopword function to the df.text series
df.text = df.text.apply(remove_stopword)

## __2. Extracting Derived variables__
__i. Positive Score__

In [None]:
#calculate positive score
pos_words = pd.read_csv('/content/positive-words.txt', header = None)
pos_words.head()

Unnamed: 0,0
0,a+
1,abound
2,abounds
3,abundance
4,abundant


In [None]:
for i in range(111):
  pos_count=0
  for word in pos_words[0]:
    if word in df['text'][i]:
      pos_count+=1
  output.at[i, 'POSITIVE SCORE'] = pos_count
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,,,,,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,,,,,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,,,,,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,,,,,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,,,,,,,,,,,,


__2. Negative Score__


In [None]:
#calculate negative score
neg_words = pd.read_csv('/content/negative-words.txt', encoding = 'Latin-1', header = None)
neg_words.head()

Unnamed: 0,0
0,2-faced
1,2-faces
2,abnormal
3,abolish
4,abominable


In [None]:
for i in range(111):
  neg_count=0
  for word in neg_words[0]: 
    if word in df['text'][i]:
      neg_count = neg_count + (-1) * (-1)
  output.at[i, 'NEGATIVE SCORE'] = neg_count
output.head(5)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,,,,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,,,,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,,,,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,,,,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,,,,,,,,,,,


__3. Polarity Score__ <br/>
Polarity Score = (Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001)

In [None]:
#calculate polarity score
for i in range(111):
    pos_score = output['POSITIVE SCORE'][i] 
    neg_score = output['NEGATIVE SCORE'][i] 
    pol_count = (pos_score - neg_score)/((pos_score + neg_score) + 0.000001)
    output.at[i, 'POLARITY SCORE'] = pol_count
output.head(5)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,,,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,,,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,,,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,,,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,,,,,,,,,,


__4. Subjectivity Score__ <br/>
Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)


In [None]:
#calculate subjectivity score
for i in range(111):
  total_word = len(df['text'][0].split())
  pos_score = output['POSITIVE SCORE'][i] 
  neg_score = output['NEGATIVE SCORE'][i] 
  sub_count = (pos_score + neg_score)/((total_word) + 0.000001)
  output.at[i, 'SUBJECTIVITY SCORE'] = sub_count
  
output.head(5)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,,,,,,,,,


# __Analysis of Readability__
__1. Average Sentence Length__ <br/>
Average Sentence Length = the number of words / the number of sentences


In [None]:
#install and import textstat
!pip install textstat
import textstat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#remove unwanted characters from the df1.text series
def clean_char(text):
  #remove special characters
  text = re.sub("[^A-Za-z0-9. ]", "", text)
  return text

#apply remove_stopword function to the df1.text series with lower case
senten_series = df1.text.str.lower().apply(remove_stopword)
senten_series = senten_series.apply(clean_char)

In [None]:
#calculate average sentence lenght
for i in range(111):
  total_word = len(df['text'][i].split())
  total_sentences = senten_series.apply(textstat.sentence_count)
  avg_sentence = total_word/total_sentences[i]
  output.at[i, 'AVG SENTENCE LENGTH'] = avg_sentence

In [None]:
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,,,,,,,,


__2. Percentage of Complex words__ <br/>
Percentage of Complex words = the number of complex words / the number of words 


In [None]:
#calculate percentage of complex words
for i in range(111):
  words = df['text'][i].split()
  total_word =len(words)

  vowels='aeiou'
  complex_counts =0
  for word in words:
    vowel_count = 0
    for vowel in vowels:
      if vowel in word:
        vowel_count+=1
    if vowel_count > 1:
      complex_counts += 1
  percent_complex = complex_counts/total_word
    
  output.at[i,'PERCENTAGE OF COMPLEX WORDS'] = percent_complex
    
output.head(5)   

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,,,,,,,


__3. Fog Index__ <br/>
Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)


In [None]:
#calculate fog index
output['FOG INDEX'] = 0.4 * (output['AVG SENTENCE LENGTH'] + output['PERCENTAGE OF COMPLEX WORDS'])
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,,,,,,


## __Average Number of Words Per Sentence__
Average Number of Words Per Sentence = the total number of words / the total number of sentences


In [None]:
#calculate average number of words
for i in range(111):
  total_word = len(df['text'][i].split())
  total_sentences = senten_series.apply(textstat.sentence_count)
  avg_no_sentence = total_word/total_sentences[i]
  output.at[i, 'AVG NUMBER OF WORDS PER SENTENCE'] = avg_no_sentence
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,,,,,


## __Complex word count__

In [None]:
# Count number of complex words
for i in range(111):
  words = df['text'][i].split()
  vowels='aeiou'
  complex_counts =0
  
  for word in words:
    vowel_count = 0
    for vowel in vowels:
      if vowel in word:
        vowel_count+=1
    if vowel_count >= 2:
      complex_counts += 1
  output.at[i,'COMPLEX WORD COUNT'] = complex_counts

In [None]:
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,787.0,,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,416.0,,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,684.0,,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,496.0,,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,608.0,,,,


## __Word count__

In [None]:
#count number of words
for i in range(111):
  total_word = len(df['text'][i].split())
  output.at[i,'WORD COUNT'] = total_word

In [None]:
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,787.0,935.0,,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,416.0,554.0,,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,684.0,807.0,,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,496.0,621.0,,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,608.0,773.0,,,


## __Syllable per word__

In [None]:
#count number of syllables
for i in range(111):
  words = df['text'][i].split()
  total_word = len(words)
  vowels='aeiou'
  syllable_count = 0
  
  for word in words:
    for vowel in vowels:
      if vowel in word and word[-2:] not in (["es","ed"]):
        syllable_count+=1
  output.at[i,'SYLLABLE PER WORD'] = syllable_count

output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,787.0,935.0,1988.0,,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,416.0,554.0,1078.0,,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,684.0,807.0,1708.0,,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,496.0,621.0,1286.0,,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,608.0,773.0,1546.0,,


## __Personal Pronouns__


In [None]:
pronoun_series[0]

'AI healthcare Improve Patient Outcomes Introduction  If kills 10 people decades  highly infectious virus war . Not missiles microbes.  Bill Gates  remarks TED conference 2014  world avoided Ebola outbreak . When  unprecedented  invisible virus hit  met overwhelmed unprepared healthcare system oblivious population . This public health emergency demonstrated lack scientific consideration underlined alarming robust innovations health medical facilities . For past years  artificial intelligence proven tangible potential healthcare sectors  clinical practices  translational medical biomedical research . After detected China December 31st 2019  AI program developed BlueDot alerted world pandemic . It realise AI  ability analyse chunks data detecting patterns identifying tracking carriers virus . Many tracing apps AI tabs people infected prevent risk crossinfection AI algorithms track patterns extract features classify categorise . So AI  IBM Watson  sophisticated AI works computing natural 

In [None]:
#count personal proniuns
#df1- a copy of df will be used since stopwords(including personal pronouns has been removed from df)
#apply remove_stopword function to the df1.text series
pronoun_series = df1.text.apply(remove_stopword)
pronoun_series = pronoun_series.apply(clean_char)

#pronoun list
pronoun_list = ['I', 'i', 'we', 'We', 'my', 'My', 'ours', 'Ours', 'us', 'Us']

for i in range(111):
  words = pronoun_series[i].split()
  pronoun_count = 0
  for word in words:
    if word in pronoun_list:
      pronoun_count += 1
  output.at[i,'PERSONAL PRONOUNS']=pronoun_count

output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,787.0,935.0,1988.0,0.0,
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,416.0,554.0,1078.0,0.0,
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,684.0,807.0,1708.0,1.0,
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,496.0,621.0,1286.0,0.0,
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,608.0,773.0,1546.0,6.0,


## __Average Word Length__
Average Word Length =
Sum of the total number of characters in each word / Total number of words


In [None]:
for i in range(111):
  total_word = len(df['text'][i].split())
  total_char = len(df['text'][i])
  average_wordlen = total_char/total_word
  output.at[i,'AVG WORD LENGTH']=average_wordlen
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,74.0,74.0,0.0,0.158289,12.302632,0.841711,5.257737,12.302632,787.0,935.0,1988.0,0.0,8.742246
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,54.0,68.0,-0.114754,0.130481,9.081967,0.750903,3.933148,9.081967,416.0,554.0,1078.0,0.0,8.054152
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,78.0,78.0,0.0,0.166845,9.841463,0.847584,4.275619,9.841463,684.0,807.0,1708.0,1.0,8.700124
3,40.0,https://insights.blackcoffer.com/will-machine-...,60.0,63.0,-0.02439,0.131551,7.961538,0.798712,3.5041,7.961538,496.0,621.0,1286.0,0.0,8.223833
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,60.0,67.0,-0.055118,0.135829,9.426829,0.786546,4.08535,9.426829,608.0,773.0,1546.0,6.0,8.274256


## __Saving the dataframe as an excel file__

In [None]:
output.to_excel('/content/Output Data Structure.xlsx', index = False)