# Data Extraction and Text Analysis

### Libraries

In [21]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
     -------------------------------------- 105.1/105.1 kB 6.3 MB/s eta 0:00:00
Collecting pyphen
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 6.3 MB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.4


In [3]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import contractions
from textstat.textstat import textstatistics
import string

### Loading Data

In [4]:
# Read the URL list from "Sheet1" of the input workbook. Using ExcelFile as a
# context manager closes the workbook's file handle once the sheet is parsed.
with pd.ExcelFile('Input.xlsx') as xl:
    df = xl.parse("Sheet1")
df.head(5)

Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,bctech2014,https://insights.blackcoffer.com/effective-man...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...


### Web Scraping

In [8]:
# Browser-like HTTP headers that extract_text passes to requests.get.
my_headers = {
    "User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 14685.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.4992.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
}

def extract_text(url, timeout=30):
    """Download a page and return the text of all its <p> elements joined by spaces.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout error.
        Without a timeout a stalled connection would block the whole scrape.

    Returns
    -------
    str
        Text of every <p> tag found in the document, separated by single spaces.

    Notes
    -----
    The HTTP status code is not checked, so the paragraphs of an error page are
    returned like any other page.
    NOTE(review): every <p> on the page is collected; the saved outputs show all
    rows starting with the same text, so shared site content is probably being
    included alongside the article body - confirm and narrow the selector if so.
    """
    result = requests.get(url, headers=my_headers, timeout=timeout)
    doc = BeautifulSoup(result.content, "html.parser")
    return " ".join(paragraph.get_text() for paragraph in doc.find_all("p"))

# Fetch each URL once and keep the extracted paragraph text next to it.
df['Text_Contents'] = df['URL'].map(extract_text)

In [16]:
# Show up to ten distinct scraped texts as a quick sanity check.
pd.unique(df['Text_Contents'])[:10]

array(['Healthcare AI ChatBot using LLAMA, LLM, Langchain Efficient Supply Chain Assessment: Overcoming Technical Hurdles for Web Application Development Streamlined Integration: Interactive Brokers API with Python for Desktop Trading Application Efficient Data Integration and User-Friendly Interface Development: Navigating Challenges in Web Application Deployment AI Chatbot using LLM, Langchain, LLama AI Bot Audio to audio Methodology for ETL Discovery Tool using LLMA, OpenAI, Langchain Methodology for database discovery tool using openai, LLMA, Langchain Rising IT cities and its impact on the economy, environment, infrastructure, and city life by the year 2040. Rising IT Cities and Their Impact on the Economy, Environment, Infrastructure, and City Life in Future Internet Demand’s Evolution, Communication Impact, and 2035’s Alternative Pathways Rise of Cybercrime and its Effect in upcoming Future AI/ML and Predictive Modeling Solution for Contact Centre Problems How to Setup Custom Do

### Data Manipulation

In [24]:
# Shared WordNet lemmatizer instance; text_prep calls lemma.lemmatize on each kept token.
lemma = WordNetLemmatizer()

In [26]:
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words() below can read it.
nltk.download('stopwords')

# English stop words; text_prep drops every token that appears in this collection.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [27]:
# Pre-process the text data
def text_prep(x: str) -> list:
    """Turn raw text into a list of lemmatised tokens with stop words removed.

    The input is coerced to ``str`` and lower-cased, contractions are expanded,
    every run of non-letter characters becomes a single space, and the result is
    tokenised. Tokens found in ``stop_words`` are dropped; the rest are lemmatised.
    """
    # Lower-case first, then expand contractions.
    cleaned = contractions.fix(str(x).lower())
    # Replace anything that is not a letter with a space and trim the ends.
    cleaned = re.sub('[^a-zA-Z]+',' ', cleaned).strip()
    kept = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept.append(lemma.lemmatize(token))
    return kept

In [29]:
import nltk
# Fetch the 'punkt' tokenizer resource that word_tokenize (used in text_prep) requires.
nltk.download('punkt')

# Pre-process the text data
def text_prep(x: str) -> list:
    """Clean one document and return its lemmatised, stop-word-free tokens.

    Parameters
    ----------
    x : str
        Raw document text; any other value is converted with ``str()`` first.

    Returns
    -------
    list
        Lemmatised lower-case tokens in their original order, excluding every
        token that appears in ``stop_words``.
    """
    corp = str(x).lower()
    corp = contractions.fix(corp)
    # Keep letters only: each run of other characters collapses to one space.
    corp = re.sub('[^a-zA-Z]+', ' ', corp).strip()
    tokens = word_tokenize(corp)  # word_tokenize requires the 'punkt' resource
    # A set gives constant-time membership checks instead of scanning the
    # stop-word list once per token.
    stop_set = set(stop_words)
    return [lemma.lemmatize(token) for token in tokens if token not in stop_set]



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [37]:
# WordNet data (plus the Open Multilingual WordNet package) is downloaded before
# text_prep runs, because text_prep lemmatises tokens with the WordNet lemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Run the cleaning pipeline over every scraped document, in row order.
preprocess_tag = list(map(text_prep, df['Text_Contents']))
df["preprocess_txt"] = preprocess_tag

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...


### Sentiment Analysis

In [33]:
# Load the negative and positive word lists as whitespace-separated tokens.
# The with-blocks guarantee each file is closed even if reading raises.
with open('negative-words.txt', 'r', encoding='latin-1') as file:
    neg_words = file.read().split()
with open('positive-words.txt', 'r', encoding='latin-1') as file:
    pos_words = file.read().split()

##### Word_count

In [55]:
# Number of cleaned tokens per document.
df['words_count'] = df['preprocess_txt'].map(len)

##### Positive and Negative Score

In [56]:
# Sets make each membership check constant-time; testing against the raw word
# lists would rescan the whole lexicon for every token of every document.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)

# Positive score: number of tokens found in the positive lexicon.
num_pos = df['preprocess_txt'].map(lambda tokens: sum(token in pos_lookup for token in tokens))
df['Positive'] = num_pos
# Negative score: number of tokens found in the negative lexicon.
num_neg = df['preprocess_txt'].map(lambda tokens: sum(token in neg_lookup for token in tokens))
df['Negative'] = num_neg

##### Polarity score

In [57]:
# Polarity = (positive - negative) / (positive + negative); the small constant
# in the denominator avoids division by zero when neither lexicon matched.
df['Polarity'] = (
    (df['Positive'] - df['Negative']) / (df['Positive'] + df['Negative'] + 0.000001)
).round(2)

##### Subjectivity Score

In [58]:
# Subjectivity = sentiment-bearing tokens / all cleaned tokens.
# 'words_count' is the token-count column this notebook creates; no cell defines
# a 'num_words' column, so referencing it fails on a fresh kernel.
df['Subjectivity'] = round((df['Positive'] + df['Negative'])/(df['words_count'] + 0.000001), 2)

In [59]:
# Preview the first five rows.
# NOTE(review): the saved output of this cell lists columns such as num_sentences,
# syl_per_word and avg_word_len that are only created by cells further down, so it
# was produced by an out-of-order run; re-run top to bottom to refresh it.
df.head(5)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,avg_sentence_len,complex_words,complex_words_prop,avg_words_per_sentence,complex_words_count,complex_words_percent,words_count,syl_per_word,personal_pronouns,avg_word_len
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",650,42,9,0.65,0.08,24,27.1,138,0.21,27.08,138,0.21,650,2.363077,4,6.096425
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,30.0,46,0.22,30.0,46,0.22,210,2.171429,3,6.490347
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",207,9,1,0.8,0.05,7,29.6,45,0.22,29.57,45,0.22,207,2.169082,3,6.44186
3,bctech2014,https://insights.blackcoffer.com/effective-man...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",209,9,1,0.8,0.05,7,29.9,45,0.22,29.86,45,0.22,209,2.167464,3,6.467181
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,30.0,47,0.22,30.0,47,0.22,210,2.17619,3,6.501931


##### Average Sentence Length

In [60]:
# Sentences are counted on the raw scraped text.
df['num_sentences'] = df['Text_Contents'].map(lambda x: len(sent_tokenize(x)))
# Average sentence length = cleaned-token count / sentence count. 'words_count'
# is the token-count column this notebook creates ('num_words' is never defined).
df['avg_sentence_len'] = round(df['words_count']/df['num_sentences'], 1)

##### Complex words count & Percentage


In [61]:
def syllables_count(text):
    """Return the syllable count that a fresh textstatistics object reports for ``text``."""
    stats = textstatistics()
    return stats.syllable_count(text)

In [62]:
def complex_words(text):
    """Count the distinct items of ``text`` that have more than two syllables.

    ``text`` is iterated item by item and each item is passed to
    ``syllables_count``. Items are collected in a set, so a word that occurs
    several times is counted once.
    """
    return len({word for word in text if syllables_count(word) > 2})

In [63]:
# Distinct words with more than two syllables, per document.
df['complex_words_count'] = df['preprocess_txt'].apply(complex_words)
# Share of complex words among the cleaned tokens. This uses the columns the
# notebook actually creates; 'complex_words' and 'num_words' are never defined
# by any cell and raise KeyError on a fresh kernel.
df['complex_words_percent'] = round(df['complex_words_count']/df['words_count'], 2)

##### Fog Index

In [64]:
# Fog index = 0.4 * (average sentence length + complex-word share).
# 'complex_words_percent' is the complex-word share column this notebook creates;
# no cell defines 'complex_words_prop'.
# NOTE(review): that column holds a 0-1 proportion, while the classic Gunning fog
# formula uses a 0-100 percentage - confirm which scale is intended.
df['Fog_index'] = 0.4 * (df['avg_sentence_len'] + df['complex_words_percent'])

##### Average Number of Words Per Sentence

In [65]:
# Same ratio as avg_sentence_len, kept to two decimals. 'words_count' is the
# token-count column this notebook creates ('num_words' is never defined).
df['avg_words_per_sentence'] = round(df['words_count']/df['num_sentences'], 2)

In [66]:
# Preview the first rows after adding the readability columns.
df.head()

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,...,complex_words,complex_words_prop,avg_words_per_sentence,complex_words_count,complex_words_percent,words_count,syl_per_word,personal_pronouns,avg_word_len,Fog_index
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",650,42,9,0.65,0.08,24,...,138,0.21,27.08,138,0.21,650,2.363077,4,6.096425,10.924
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,...,46,0.22,30.0,46,0.22,210,2.171429,3,6.490347,12.088
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",207,9,1,0.8,0.05,7,...,45,0.22,29.57,45,0.22,207,2.169082,3,6.44186,11.928
3,bctech2014,https://insights.blackcoffer.com/effective-man...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",209,9,1,0.8,0.05,7,...,45,0.22,29.86,45,0.22,209,2.167464,3,6.467181,12.048
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,...,47,0.22,30.0,47,0.22,210,2.17619,3,6.501931,12.088


##### Syllable count

In [67]:
# Syllables per word = syllables of the re-joined cleaned tokens / token count.
# 'words_count' is the token-count column this notebook creates ('num_words' is
# never defined).
df['syl_per_word'] = (df['preprocess_txt'].apply(lambda x: syllables_count(" ".join(x))))/df['words_count']

##### Personal pronouns

In [68]:
def personal_pro(text):
    """Return how many personal pronouns (I, we, my, ours, us) occur in ``text``.

    Matching is case-insensitive except for "us", which must be lower-case so
    that the abbreviation "US" is not counted.
    """
    pattern = re.compile(r'\b(I|we|my|ours|(?-i:us))\b', re.I)
    return sum(1 for _ in pattern.finditer(text))

# (?-i:us) is an inline modifier group that switches off case-insensitive matching for that alternative only, so it matches "us" but not "US".

# Pronouns are counted on the raw text because the cleaned tokens are lower-cased.
df['personal_pronouns'] = df['Text_Contents'].apply(personal_pro)

##### Average word length

In [69]:
def text_len(text):
    """Return the average word length of ``text`` after removing punctuation.

    Parameters
    ----------
    text : str or iterable of str
        The text to measure; an iterable of strings is concatenated first.

    Returns
    -------
    float
        Mean number of characters per whitespace-separated word once every
        character in ``string.punctuation`` has been removed. Returns 0.0 when
        no words remain, instead of raising ZeroDivisionError.
    """
    text = ''.join(text)
    # Delete every ASCII punctuation character in a single pass.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    words = stripped.split()
    if not words:
        return 0.0
    return sum(map(len, words)) / len(words)

# Average word length is measured on the raw scraped text.
df['avg_word_len'] = df['Text_Contents'].map(text_len)

In [70]:
# Preview the first five rows with every metric column in place.
df.head(5)

Unnamed: 0,URL_ID,URL,Text_Contents,preprocess_txt,num_words,Positive,Negative,Polarity,Subjectivity,num_sentences,...,complex_words,complex_words_prop,avg_words_per_sentence,complex_words_count,complex_words_percent,words_count,syl_per_word,personal_pronouns,avg_word_len,Fog_index
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",650,42,9,0.65,0.08,24,...,138,0.21,27.08,138,0.21,650,2.363077,4,6.096425,10.924
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,...,46,0.22,30.0,46,0.22,210,2.171429,3,6.490347,12.088
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",207,9,1,0.8,0.05,7,...,45,0.22,29.57,45,0.22,207,2.169082,3,6.44186,11.928
3,bctech2014,https://insights.blackcoffer.com/effective-man...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",209,9,1,0.8,0.05,7,...,45,0.22,29.86,45,0.22,209,2.167464,3,6.467181,12.048
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,"Healthcare AI ChatBot using LLAMA, LLM, Langch...","[healthcare, ai, chatbot, using, llama, llm, l...",210,9,1,0.8,0.05,7,...,47,0.22,30.0,47,0.22,210,2.17619,3,6.501931,12.088


### Output Data Structure

In [71]:
# List the available column names before selecting the subset that goes into the output file.
df.columns

Index(['URL_ID', 'URL', 'Text_Contents', 'preprocess_txt', 'num_words',
       'Positive', 'Negative', 'Polarity', 'Subjectivity', 'num_sentences',
       'avg_sentence_len', 'complex_words', 'complex_words_prop',
       'avg_words_per_sentence', 'complex_words_count',
       'complex_words_percent', 'words_count', 'syl_per_word',
       'personal_pronouns', 'avg_word_len', 'Fog_index'],
      dtype='object')

In [72]:
# Keep only the columns needed for the deliverable, in output order.
# 'complex_words_percent', 'complex_words_count' and 'words_count' are the
# columns the cells above create; 'complex_words_prop', 'complex_words' and
# 'num_words' are never defined and raise KeyError on a fresh kernel.
df = df[[
    'URL_ID', 'URL',
    'Positive', 'Negative', 'Polarity', 'Subjectivity',
    'avg_sentence_len', 'complex_words_percent', 'Fog_index',
    'avg_words_per_sentence', 'complex_words_count', 'words_count',
    'syl_per_word', 'personal_pronouns', 'avg_word_len',
]]

In [73]:
# Rename the selected columns positionally to the headers used in the output file.
df.columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]

In [74]:
# Preview the first five rows of the final output table.
df.head(5)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,42,9,0.65,0.08,27.1,0.21,10.924,27.08,138,650,2.363077,4,6.096425
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,9,1,0.8,0.05,30.0,0.22,12.088,30.0,46,210,2.171429,3,6.490347
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,9,1,0.8,0.05,29.6,0.22,11.928,29.57,45,207,2.169082,3,6.44186
3,bctech2014,https://insights.blackcoffer.com/effective-man...,9,1,0.8,0.05,29.9,0.22,12.048,29.86,45,209,2.167464,3,6.467181
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,9,1,0.8,0.05,30.0,0.22,12.088,30.0,47,210,2.17619,3,6.501931


In [75]:
# Write the final table to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv('Output-Data-Structure.csv', index = False)