## Data Extraction and Text Analysis

Author's name- Antra Tripathi
<br> email- tripathiantra074@gmail.com

#### Importing libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords  
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Antra
[nltk_data]     Tripathi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Antra
[nltk_data]     Tripathi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Reading Input File

In [3]:
# Load the input workbook and keep handles on the two columns used below.
df = pd.read_excel('Input.xlsx')
url_id, url = df['URL_ID'], df['URL']

#### Data Extraction

In [4]:
# Data extraction using BeautifulSoup.
# For every URL the article title and body are pulled from the page and saved
# to "<URL_ID>.txt". When the page cannot be fetched, or the title or the body
# element is missing, the [URL_ID, row position] pair is stored in not_found
# so that the matching row can be dropped from the output later.

not_found=[]

for i, (uid, link) in enumerate(zip(url_id, url)):

    #requesting url; the timeout keeps one unresponsive host from hanging the run
    try:
        response=requests.get(link, timeout=30)
    except requests.RequestException:
        not_found.append([uid,i])
        continue

    #beautifulsoup object
    soup=BeautifulSoup(response.content,'html.parser')

    #locating title and body; find() returns None when the element is absent
    article_name_tag = soup.find('h1', class_='entry-title')
    article_content_tag = soup.find('div', class_='td-post-content')

    # No file is written when either part is missing, so the row must be
    # recorded in both cases to keep files and output rows in step.
    if article_name_tag is None or article_content_tag is None:
        not_found.append([uid,i])
        continue

    #'/' is removed from the title, as in the original extraction
    article_name = article_name_tag.text.strip().replace('/', "")
    article = article_content_tag.get_text(strip=True, separator=' ')

    #Saving title and content with URL_ID as file name
    file_name = f"D:/projects/Web scraping/input_data/{uid}.txt"
    with open(file_name,'w',encoding='utf-8') as file:
        file.write(article_name + '\n' + article)
           

In [5]:
import os
#loading stop words
# Raw string so the backslashes in the Windows path are never read as escapes.
directory = r"D:\projects\Web scraping\StopWords"

# Initialize a set to store unique words
all_words = set()

# Loop through each file in the directory
for files in os.listdir(directory):
    with open(os.path.join(directory,files),'r',encoding='ISO-8859-1') as f:
        # Entries are stripped and lower-cased because tokens are lower-cased
        # before they are looked up in this set; blank lines are skipped.
        # NOTE(review): if the files carry trailing annotations on a line
        # (e.g. "WORD | note"), those lines still will not match - confirm.
        all_words.update(
            line.strip().lower()
            for line in f.read().splitlines()
            if line.strip()
        )


In [6]:
#loading positive words
# Raw string so the backslashes in the Windows path are never read as escapes.
positive_words_path = r"D:\projects\Web scraping\MasterDictionary\positive-words.txt"

# Initialize a set to store unique words
positive_words=set()

# Read the single dictionary file, one entry per line
with open(positive_words_path, 'r',encoding='ISO-8859-1') as file:
    # Entries are stripped and lower-cased because tokens are lower-cased
    # before they are looked up in this set; blank lines are skipped.
    positive_words.update(
        line.strip().lower()
        for line in file.read().splitlines()
        if line.strip()
    )


In [7]:
#loading negative words
directory= r'D:\projects\Web scraping\MasterDictionary'

#initialize a set to store negative words
negative_words=set()

# Open the dictionary file directly. A missing file raises FileNotFoundError
# instead of silently leaving the set empty, which would make every negative
# score zero without any warning.
with open(os.path.join(directory,'negative-words.txt'),'r',encoding='ISO-8859-1') as f:
    # Entries are stripped and lower-cased because tokens are lower-cased
    # before they are looked up in this set; blank lines are skipped.
    negative_words.update(
        line.strip().lower()
        for line in f.read().splitlines()
        if line.strip()
    )


#### Data Analysis

In [8]:
# converting the text into list of tokens using tokenize module
# removing the stop words from the text

# Raw string so the backslashes in the Windows path are never read as escapes.
extracted_data=r"D:\projects\Web scraping\input_data"
text=[]
# NOTE(review): os.listdir returns names in arbitrary order, while the results
# are later written to the output rows by position - confirm the two line up.
for file in os.listdir(extracted_data):
    # The articles were saved as UTF-8, so they are read back as UTF-8 rather
    # than with the platform default encoding.
    with open(os.path.join(extracted_data,file),'r',encoding='utf-8') as f:
        data = f.read()
    words=word_tokenize(data)
    # add the filtered tokens of each file into a list
    text.append([word for word in words if word.lower() not in all_words])

In [9]:
#variables for positive and negative words, and polarity and subjectivity score. we will use the converted tokens for calculating these output variables
pos_words = []
neg_words =[]
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []


#calculating positive and negative score for each article
for i in range(len(text)):
  pos_words.append([word for word in text[i] if word.lower() in positive_words])
  neg_words.append([word for word in text[i] if word.lower() in negative_words])
  positive_score.append(len(pos_words[i]))
  negative_score.append(len(neg_words[i]))
  polarity_score.append((positive_score[i] - negative_score[i]) / ((positive_score[i] + negative_score[i]) + 0.000001))
  subjectivity_score.append((positive_score[i] + negative_score[i]) / ((len(text[i])) + 0.000001))

In [11]:
#importing regular expression
import re
# Per-article readability outputs; each list receives one value per file.
# The generator yields a fresh list for every name, so none are shared.
(
    avg_sent_len_list,
    percent_complex_words_list,
    fog_index_list,
    complex_word_count_list,
    avg_syllable_word_count_list,
    word_count_list,
    average_word_length_list,
    pronouns,
) = ([] for _ in range(8))


# English stop words shipped with nltk (a, an, the, ...)
stop_words = {*stopwords.words('english')}

def _count_syllables(word):
    """Return the number of vowels in *word*, ignoring a trailing "es"/"ed"."""
    lowered = word.lower()
    if lowered.endswith(('es', 'ed')):
        lowered = lowered[:-2]
    return sum(1 for letter in lowered if letter in 'aeiou')


def analyze_text(text, stopword_set=None):
    """Compute readability metrics for one article.

    Parameters
    ----------
    text : str
        Raw article text.
    stopword_set : collection of str, optional
        Lower-case stop words to drop before the word-level metrics are
        computed. Defaults to the module-level ``stop_words`` set.

    Returns
    -------
    tuple
        ``(avg_sent_len, percent_complex_words, fog_index, complex_words,
        avg_syllable_word_count, word_count, average_word_length, cnt)`` where
        ``percent_complex_words`` is on a 0-100 scale and ``cnt`` is the
        number of personal pronouns. When no word survives stop-word removal
        every metric except ``cnt`` is zero.
    """
    if stopword_set is None:
        stopword_set = stop_words

    #removing punctuations (periods are kept to mark sentence ends)
    text = re.sub(r'[^\w\s.]', '', text)

    #counting sentences; empty fragments (e.g. after a final '.') are ignored
    num_sentences = sum(1 for fragment in text.split('.') if fragment.strip())

    #splitting into tokens and dropping the periods still attached to them
    tokens = [token for token in (raw.strip('.') for raw in text.split()) if token]

    #counting pronouns before stop-word removal, because most of them are
    #stop words themselves; all-caps "US" is treated as the country name
    p_list = {'i', 'we', 'my', 'ours', 'us'}
    cnt = sum(1 for token in tokens if token != 'US' and token.lower() in p_list)

    #removing stop words
    words = [token for token in tokens if token.lower() not in stopword_set]
    word_count = len(words)
    if word_count == 0:
        # Nothing to measure; avoids dividing by zero below.
        return 0.0, 0.0, 0.0, 0, 0.0, 0, 0.0, cnt

    #calculating average length of words
    average_word_length = sum(len(word) for word in words) / word_count

    #counting syllables per word; a word with more than two is complex
    syllables = [_count_syllables(word) for word in words]
    complex_words = sum(1 for count in syllables if count > 2)

    #average is taken over the words that contain at least one vowel
    voiced = [count for count in syllables if count >= 1]
    avg_syllable_word_count = sum(voiced) / len(voiced) if voiced else 0.0

    #counting rest output variables
    avg_sent_len = word_count / num_sentences
    percent_complex_words = complex_words / word_count
    # NOTE(review): the fraction (not the 0-100 percentage) enters the fog
    # index here, unchanged from the previous version - confirm against the
    # intended formula.
    fog_index = 0.4 * (avg_sent_len + percent_complex_words)

    #returning calculated variables
    return avg_sent_len, percent_complex_words*100, fog_index, complex_words, avg_syllable_word_count,word_count,average_word_length,cnt


# Raw string so the backslashes in the Windows path are never read as escapes.
extracted_data=r"D:\projects\Web scraping\input_data"

#iterating through each extracted file present in input_data folder
# NOTE(review): os.listdir returns names in arbitrary order, while the results
# are later written to the output rows by position - confirm the two line up.
for file in os.listdir(extracted_data):

    # The articles were saved as UTF-8, so they are read back as UTF-8. A
    # separate name is used so the token list called `text` is not overwritten.
    with open(os.path.join(extracted_data,file),'r',encoding='utf-8') as f:
        article_text=f.read()

    (avg_sent_len_val, percent_complex_words_val, fog_index_val,
     complex_word_count_val, avg_syllable_word_count_val, word_count,
     average_word_length, p_count) = analyze_text(article_text)

    avg_sent_len_list.append(avg_sent_len_val)
    percent_complex_words_list.append(percent_complex_words_val)
    fog_index_list.append(fog_index_val)
    complex_word_count_list.append(complex_word_count_val)
    avg_syllable_word_count_list.append(avg_syllable_word_count_val)
    word_count_list.append(word_count)
    average_word_length_list.append(average_word_length)
    pronouns.append(p_count)


In [None]:
# Read the output template workbook into a DataFrame.
output = pd.read_excel('Output Data Structure.xlsx')

In [None]:
#drop the rows of the URLs that could not be extracted
index=[entry[1] for entry in not_found]
output.drop(index, inplace=True)
excel_file = 'output_file.xlsx'

# The writer is used as a context manager so the workbook is saved and closed
# when the block exits; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter(excel_file, engine='xlsxwriter') as excel_writer:
    # Write the DataFrame to the Excel file
    output.to_excel(excel_writer, sheet_name='Sheet1', index=False)

#### Output Data Structure.xlsx

In [None]:
#collecting all the variables in the order in which they are written to the
#output columns; avg_sent_len_list appears twice and fills two columns
data=[positive_score, negative_score, polarity_score, subjectivity_score, avg_sent_len_list, percent_complex_words_list,fog_index_list, avg_sent_len_list, complex_word_count_list, word_count_list, avg_syllable_word_count_list, pronouns, average_word_length_list]

#loading output file
df=pd.read_excel('output_file.xlsx')

# write the values to the dataframe, starting at the third column
for i, var in enumerate(data):
    # Values are matched to rows by position, so a length mismatch means the
    # analysed files and the remaining rows are out of step.
    if len(var) != len(df):
        raise ValueError(
            f"column {df.columns[i+2]!r}: {len(var)} values for {len(df)} rows"
        )
    df.iloc[:,i+2] = var

# index=False keeps the row index out of the workbook, matching how
# output_file.xlsx was written.
df.to_excel('Output Data.xlsx', index=False)

#### Another approach
1. Instead of BeautifulSoup, we could use Scrapy to extract the articles from the URLs, since a spider can follow a large number of links at once.
2. This would be beneficial for larger datasets.
3. Other approaches such as Selenium are not well suited here, as they would slow down the extraction.