## Code Submitted by AVIK CHAKRABORTY
## This task follows on the problem of data extraction from the web using web scrapping and then performing Text Analysis

### OBJECTIVES:
#### 1. Data Extration from the website provided in the Input.xlsx file.
#### 2. Cleaning the extracted data by removing the STOPWORDS.
#### 3. Perform: POSITIVE SCORE, NEGATIVE SCORE, POLARITY SCORE, SUBJECTIVITY SCORE, AVG SENTENCE LENGTH, PERCENTAGE OF COMPLEX WORDS, FOG INDEX, AVG NUMBER OF WORDS PER SENTENCE, COMPLEX WORD COUNT, WORD COUNT, SYLLABLE PER WORD, PERSONAL PRONOUNS, AVG WORD LENGTH
#### 4. Save the file as the file structure in Output Data Structure.xlsx file.


In [2]:
import pandas as p
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import cmudict
import re

##### Reading the file

In [2]:
data = p.read_excel('Input.xlsx')
data.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


##### Checking the duplicate or empty data from the file.

In [3]:
data.shape

(100, 2)

In [4]:
data.duplicated().unique()

array([False])

In [5]:
data.empty

False

In [6]:
data.URL.empty

False

In [7]:
data.URL_ID.empty

False

### DATA EXTRACTION via WEB SCRAPPING

In [8]:
title_class = ['tdb-title-text']
text_class = ['tdb-block-inner td-fix-index', 'td-post-content tagdiv-type']

##### Server side web messages displayed when the algorithm scrapes the data.
##### Website no 36 and 49 cannot be scrapped as the server is NOT FOUND

In [9]:
def check_status_URL(url):
    status = requests.get(url)

    if status.status_code == 200:
        return 'OK'
    if status.status_code == 400:
        return 'BAD REQUEST'
    if status.status_code == 401:
        return 'UNAUTHORIZED'
    if status.status_code == 403:
        return 'Forbidden'
    if status.status_code == 404:
        return 'NOT FOUND' 

##### Finding the HTML h1 tag's class for scrapping the title

In [10]:
def finding_HTMLelement_class(soup):
    title_class.append(soup.find('h1').get('class')[0])

##### Checking the length of the article text for the sites if it shows 404 error.

In [11]:
def check_length_ARTICLE_TEXT(article_text):
    high = pos = 0
    
    for i in range(len(article_text)):
        if high < len(article_text[i]):
            high = len(article_text[i])
            pos = i

    return article_text[pos]

##### Saving the extracted data in a file in a sperate forlder

In [12]:
def saving_data(article_title, article_text, file_name):
    with open(file = str('extracted files/' + file_name + '.txt'), mode = 'wb') as file:
        file.write(article_title)
        file.write(article_text)

##### Extracting the data from the web using BeautifulSoup

In [13]:
def extracting_data(soup, file_name):
    finding_HTMLelement_class(soup)

    for i in title_class:
        if soup.find('h1', i):
            # encoding the text in utf-8 form as get_text() cannot encode all the letters
            article_title = soup.find('h1', i).get_text().encode('utf-8')
            break

    article_text = []
    for i in text_class:
        if soup.find('div', i):
            # article_text = soup.find_all('div', i).get_text().encode('utf-8')
            for tags in soup.find_all('div', i):
                article_text.append(tags.get_text().encode('utf-8'))

            article_text = check_length_ARTICLE_TEXT(article_text)
            break

    saving_data(article_title, article_text, file_name)

##### Accessing the sites in the Input.xlsx file

In [23]:
for ind in range(97,98,1):
    url = data.iloc[ind, 1]

    check_status = check_status_URL(url)

    if check_status == 'OK':
        html = urlopen(url).read()
        soup = BeautifulSoup(html, features = 'html.parser').find('article')

        # killing all html scripts and style elements.
        for script in soup(['script', 'style']):
            script.extract() # ripping the scripts out

        extracting_data(soup, data.iloc[ind, 0])

        if check_status == 'BAD REQUEST':
            print(f'URL at index {ind} is a {check_status} !')
        
        if check_status == 'UNAUTHORIZED':
            print(f'URL at index {ind} is {check_status} !')

        if check_status == 'FORBIDDEN':
            print(f'URL at index {ind} is {check_status} !')

        if check_status == 'NOT FOUND':
            print(f'URL at index {ind} is {check_status} !')

print('All the files have been saved !')

ALl the files have been saved !


### TEXT ANALYSIS

##### Changing the lines in the file into seperate lines and storing them.

##### Saving changed file.

In [74]:
def saving_reformed_files(full_text, file):
    with open(file = 'changed files/' + file, mode = 'w', encoding = 'utf-8') as fp:
        fp.write(full_text)
        fp.close()

##### Seperating the file lines by the terminators ['.', '?', '!'] and storing the seperate lines as a full text.

In [75]:
def reformed_files(file_text, file):
    full_text = ''
    for sentence in file_text:
        words = word_tokenize(sentence)
        
        line = ''
        for word in words:
            if word in ['?', '.', '!']:
                line += '.\n'
            else:
                line = line + ' ' + word

        full_text += line

    saving_reformed_files(full_text, file)

##### Opening the file and reading each line of the file.
##### The file is read line by line. But the problem with the readlines() is that it accepts the paragraph as a single line as seen here. Therefore we need to change the .txt in a way that all the lines are read as seperate lines for future work

In [76]:
for file in os.listdir('extracted files'):
    if file.endswith('.txt'):
        with open(file = 'extracted_files/' + file, mode = 'r', encoding = 'utf-8') as fp:
            file_text = fp.readlines()
            fp.close()

        reformed_files(file_text, file)

##### Finding and storing all the STOP WORDS

In [3]:
stop_words = []

for file in os.listdir('StopWords'):
    if file.endswith('.txt'):
        with open(file = 'StopWords/' + file, mode = 'r') as fp:
            for word in fp.read().split():
                stop_words.append(word)
            fp.close()

##### Saving the cleaned files

In [78]:
def save_cleaned_file(full_text, file):
    with open('cleaned files/' + file, mode = 'w', encoding = 'utf-8') as fp:
        fp.write(full_text)
        fp.close()

##### Removing STOP WORDS

In [79]:
for file in os.listdir('changed files'):
    if file.endswith('.txt'):
        with open(file = 'changed files/' + file, mode = 'r', encoding = 'utf-8') as fp:
            file_text = fp.readlines()
            fp.close()

        full_text = ''
        for lines in file_text:
            words = word_tokenize(lines)
            
            words_copy = words.copy()
            for word in words_copy:
                if word in stop_words:
                    words.remove(word)

            full_text += ' '.join(words) + '\n'

            save_cleaned_file(full_text, file)

##### Global Variables

In [36]:
positive_score_perFile = []
negative_score_perFile = []
polarity_score_perFile = []
subjectivity_score_perFile = []

##### Saving the positive, negative, polarity, subjectivity score to the Output.xlsx File

In [37]:
def save_PNPoSub_score(data):
    # since file no 36 does not exits
    positive_score_perFile.insert(35, 0)
    negative_score_perFile.insert(35, 0)
    subjectivity_score_perFile.insert(35, 0)
    polarity_score_perFile.insert(35, 0)
    
    # since file no 49 does not exits
    positive_score_perFile.insert(48, 0)
    negative_score_perFile.insert(48, 0)
    subjectivity_score_perFile.insert(48, 0)
    polarity_score_perFile.insert(48, 0)

    data['POSITIVE SCORE'] = positive_score_perFile
    data['NEGATIVE SCORE'] = negative_score_perFile
    data['POLARITY SCORE'] = polarity_score_perFile
    data['SUBJECTIVITY SCORE'] = subjectivity_score_perFile

    data.to_excel('Output.xlsx', index = False)

##### Saving the positive and negative words of each file for future use.

In [38]:
def save_PN_words(positive_words, negative_words):
    positive_words = '\n'.join(positive_words)
    negative_words = '\n'.join(negative_words)

    with open('MasterDictionary/' + 'positive-words.txt', mode = 'a', encoding = 'utf-8') as fp:
        fp.write('\n' + positive_words)
        fp.close()

    with open('MasterDictionary/' + 'negative-words.txt', mode = 'a', encoding = 'utf-8') as fp:
        fp.write('\n' + negative_words)
        fp.close()

##### Finding the Polarity Score and Subjectivity Score

In [39]:
def finding_polarityANDsubjectivity_score(positive_score, negative_score, file):
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    with open('cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_text = fp.read().split()
        fp.close()

    count = 0
    for word in file_text:
        if ord(word[0]) in range(65, 91, 1) or ord(word[0]) in range(97, 123, 1):
            count += 1

    subjectivity_score = (positive_score + negative_score) / (count + 0.000001)

    polarity_score_perFile.append(polarity_score)
    subjectivity_score_perFile.append(subjectivity_score)


##### Finding the positive and negative words from the cleaned files and storing them
##### Also, finding the positive and negative score

In [40]:
def finding_PN_wordsANDscore(file_text, positive_words_file, negative_words_file, file):
    positive_words = []
    negative_words = []

    positive_score = 0
    negative_score = 0

    for lines in file_text:
        words = word_tokenize(lines)

        # the score is calculated after each word is found
        for word in words:
            if word in positive_words_file:
                positive_score += 1
                positive_words.append(word)
        
        for word in words:
            if word in negative_words_file:
                negative_score -= 1
                negative_words.append(word)


    negative_score = negative_score * (-1)

    positive_score_perFile.append(positive_score)
    negative_score_perFile.append(negative_score)
    
    # we do not need to repeat the same word and save it therefore set() is used
    positive_words = list(set(positive_words))
    negative_words = list(set(negative_words))

    finding_polarityANDsubjectivity_score(positive_score, negative_score, file)
    save_PN_words(positive_words, negative_words)

##### Fetching and storing the positive and negative words from the Master Dictionary

In [41]:
data = p.read_excel('Output.xlsx')

for file in os.listdir('cleaned files'):
    if file.endswith('.txt'):
        with open('MasterDictionary/' + 'positive-words.txt', mode = 'r', encoding = 'utf-8') as fp:
            positive_words_file = fp.read().split()
            fp.close()

        with open('MasterDictionary/' + 'negative-words.txt', mode = 'r') as fp:
            negative_words_file = fp.read().split()
            fp.close()

        file_text = ''
        with open('cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
            file_text = fp.readlines()
            fp.close()

        finding_PN_wordsANDscore(file_text, positive_words_file, negative_words_file, file)

save_PNPoSub_score(data)

#### Analysis of readability

##### GLobal Variables

In [63]:
word_count_perFile = []
avg_sent_length_perFile = []

##### Finding average sentence length

In [64]:
def average_sentence_length(count, file_sentences):
    avg_sent_length = count // len(file_sentences)

    avg_sent_length_perFile.append(avg_sent_length)

##### Finding the Percentage of Complex Words per file
##### Removing symbols and counting the words per file

In [65]:
def count_file_words(file_words, file_sentences):
    count = 0

    for word in file_words:
        if ord(word[0]) in range(65, 91, 1) or ord(word[0]) in range(97, 123, 1):
            count += 1

    word_count_perFile.append(count)
    
    average_sentence_length(count, file_sentences)

In [66]:
# Loading the CMU dictionary
d = cmudict.dict()

# opening each file
for file in os.listdir('cleaned files'):
    with open(file = 'cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_words = fp.read().split()
        fp.close()
        
    with open(file = 'cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_sentences = fp.readlines()
        fp.close()

    count_file_words(file_words, file_sentences)

##### Finding the complex words

In [67]:
numberof_complex_words_perFile = []

In [68]:
# Function to count syllables in a word
def count_syllables(word):
    if word.lower() in d:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    else:
        # Fallback: consider one syllable if the word is not in the dictionary
        return 1

# Define what makes a word complex (e.g., more than 3 syllables or more than 7 characters)
def is_complex_word(word):
    return len(word) > 7 or count_syllables(word) > 3

# Function to read a file and check each word
def find_complex_words(filename):
    with open('cleaned files/' + filename, mode = 'r', encoding = 'utf-8') as fp:
        text = fp.read()

    # Use regex to find words in the text
    words = re.findall(r'\b\w+\b', text)

    # Find and print complex words
    complex_words_perFile = [word for word in words if is_complex_word(word)]
    return complex_words_perFile

# Opening the files
for file in os.listdir('cleaned files'):
    complex_words_perFile = find_complex_words(file)
    numberof_complex_words_perFile.append(len(complex_words_perFile))

##### Percentage of complex words per file

In [69]:
percentage_complex_word_perFile = []

for pos in range(0, 98, 1):
    percentage_complex_word_perFile.append(numberof_complex_words_perFile[pos] / word_count_perFile[pos] * 100)

##### Finding the Fog Index

In [70]:
fog_index_perFile = []

for pos in range(0, 98, 1):
    fog_index_perFile.append(avg_sent_length_perFile[pos] + percentage_complex_word_perFile[pos])

##### Saving the Average Sentence Length, Percentage of Complex Words, Fog Index, Word Count, Complex Word Count in Output.xlsx file

In [80]:
# Since file 36 does not exists
avg_sent_length_perFile.insert(35, 0)
percentage_complex_word_perFile.insert(35, 0)
fog_index_perFile.insert(35, 0)
word_count_perFile.insert(35, 0)
numberof_complex_words_perFile.insert(35, 0)

# Since file 49 does not exists
avg_sent_length_perFile.insert(48, 0)
percentage_complex_word_perFile.insert(48, 0)
fog_index_perFile.insert(48, 0)
word_count_perFile.insert(48, 0)
numberof_complex_words_perFile.insert(48, 0)

data = p.read_excel('Output.xlsx')

data['AVG SENTENCE LENGTH'] = avg_sent_length_perFile
data['PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_word_perFile
data['FOG INDEX'] = fog_index_perFile
data['WORD COUNT'] = word_count_perFile
data['COMPLEX WORD COUNT'] = numberof_complex_words_perFile

data.to_excel('Output.xlsx', index = False)

##### Finding the Syllable count per word in each file
##### GLobal Variables

In [54]:
syllable_count_perFile = []

##### Counting the syllables in a file

In [55]:
def counting_syllables_perFile(words):
    syllable_count_perWord = []

    for word in words:
        count = 0
        for letter in word:
            if letter in ['a', 'e', 'i', 'o', 'u']:
                count += 1

        syllable_count_perWord.append(count)

    return syllable_count_perWord

##### Removing symbols and filtering words that is ending with es and ed.

In [56]:
def remove_symbols(file_text):
    words = []

    for word in file_text:
        if ord(word[0]) in range(65, 90, 1) or ord(word[0]) in range(97, 122, 1):
            # handling the ed and es exception.
            # reversing the string as negative indexing is being used.
            if word[-1:-3:-1] not in ['se', 'de']:
                words.append(word)

    syllable_count_perWord = counting_syllables_perFile(words)

    return words, syllable_count_perWord

##### Calculating the syllable count per file.

In [57]:
for file in os.listdir('cleaned files'):
    with open(file = 'cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_text = fp.read().split()
        fp.close()

    words, syllable_count_perWord = remove_symbols(file_text)

    '''Since it is mentioned that we need to count syllables in each word,
    Also in the output file the column is syllable per word.
    Therefore the average syllable count per word in each file is calculated and stored.'''
    syllable_count_perFile.append(sum(syllable_count_perWord) // len(words))

##### Saving the syllable in Output.xlsx file.

In [58]:
# since file 38 an 49 does not exists
syllable_count_perFile.insert(35, 0)
syllable_count_perFile.insert(48, 0)

data = p.read_excel('Output.xlsx')

data['SYLLABLE PER WORD'] = syllable_count_perFile

data.to_excel('Output.xlsx', index = False)

##### Calculating number of personal pronouns and average word length per file.

In [92]:
personal_pronouns = r'\b(I|me|my|mine|we|us|our|ours|you|your|yours|he|him|his|she|her|hers|it|its|they|them|their|theirs)\b'

personal_pronouns_perFile = []
avg_word_length_perFile = []

for file in os.listdir('cleaned files'):
    with open(file = 'cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_text1 = fp.read()
        fp.close()
        
    with open(file = 'cleaned files/' + file, mode = 'r', encoding = 'utf-8') as fp:
        file_text2 = fp.read().split()
        fp.close()

    words = []
    for word in file_text2:
        if ord(word[0]) in range(65, 90, 1) or ord(word[0]) in range(97, 122, 1):
            words.append(word)

    each_word_length = []
    for word in words:
        each_word_length.append(len(word))

    avg_word_length_perFile.append(sum(each_word_length) // len(words) if len(words) != 0 else 0)

    words_pp = re.findall(personal_pronouns, file_text1)
    personal_pronouns_perFile.append(len(words_pp))

##### Saving the personal pronouns.

In [93]:
# file 36 and 49 does not exists
personal_pronouns_perFile.insert(35, 0)
avg_word_length_perFile.insert(35, 0)
personal_pronouns_perFile.insert(48, 0)
avg_word_length_perFile.insert(48, 0)

data = p.read_excel('Output.xlsx')
data['PERSONAL PRONOUNS'] = personal_pronouns_perFile
data['AVG WORD LENGTH'] = avg_word_length_perFile

data.to_excel('Output.xlsx', index = False)