## 1. Mounting and Importing

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at the mount point below.
mount_point = '/content/gdrive'
drive.mount(mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Import necessary packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import re
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Load input

In [None]:
# Load the spreadsheet of URLs into a pandas DataFrame.
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

In [None]:
# Preview the first five rows of the input table.
df.head(n=5)

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


## 3. Extracting data from URLs

In [None]:
# Folder on Google Drive where the scraped article text files are written
folder_path = '/content/gdrive/MyDrive/BlackCoffer/Article/'

# Create the folder (and any missing parent folders) if it doesn't exist.
# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the folder and creating it.
os.makedirs(folder_path, exist_ok=True)

In [None]:
# Loop through each row in df: download the page, extract its title and body,
# and save "<title> <body>" as <URL_ID>.txt inside folder_path.

# The request header never changes, so build it once outside the loop.
header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}

# CSS classes tried, in order, for the title (<h1>) and the body (<div>).
title_classes = ['entry-title', 'tdb-title-text']
body_classes = ['td-post-content tagdiv-type', 'tdb-block-inner td-fix-index']

for index, row in df.iterrows():
  url = row['URL']
  url_id = row['URL_ID']

  # Start every row from a clean slate so that a failed download can never
  # silently reuse the response/soup/article left over from the previous row.
  response = None
  soup = None
  text_title = " "
  text_body = " "

  # make a request to url (the timeout stops one dead server from hanging the run)
  try:
    response = requests.get(url, headers=header, timeout=30)
  except requests.RequestException:
    print("can't get response of {}".format(url_id))

  # create a beautifulsoup object
  # NOTE(review): the HTTP status is not checked, so an error page (e.g. 404)
  # is parsed like any other page -- confirm this is intended.
  if response is not None:
    try:
      soup = BeautifulSoup(response.content, 'html.parser')
    except Exception:
      print("can't get page of {}".format(url_id))

  if soup is not None:
    # find title: the first matching class wins; the for/else branch runs
    # only when no candidate class matched.
    for css_class in title_classes:
      title_element = soup.find('h1', class_=css_class)
      if title_element:
        text_title = title_element.get_text(strip=True)
        break
    else:
      print(f"Can't get title of {url_id}")

    # find text: same first-match strategy as for the title
    for css_class in body_classes:
      text_element = soup.find('div', class_=css_class)
      if text_element:
        text_body = text_element.get_text(separator='\n').strip()
        break
    else:
      print("can't get text of {}".format(url_id))

  # A file is written for every URL_ID (blank placeholders on failure) so the
  # number of article files always matches the number of input rows.
  article = text_title + ' ' + text_body

  # Save to a text file named after the URL_ID in folder_path on Google Drive
  file_name = f'{folder_path}{url_id}.txt'

  with open(file_name, 'w', encoding='utf-8') as file:
    file.write(article)

Can't get title of blackassign0036
Can't get title of blackassign0049


## 4. Preprocessing

In [None]:
# Directories
text_dir = "/content/gdrive/MyDrive/BlackCoffer/Article"
stopwords_dir = "/content/gdrive/MyDrive/BlackCoffer/StopWords"
sentment_dir = "/content/gdrive/MyDrive/BlackCoffer/MasterDictionary"


def _load_word_set(path):
  """Return the non-empty lines of the UTF-8 text file at `path` as a set.

  Each entry is stripped and lower-cased because every lookup against these
  sets uses `word.lower()`; an entry containing capitals or stray whitespace
  could otherwise never match.
  """
  with open(path, 'r', encoding='utf-8') as f:
    return {line.strip().lower() for line in f.read().splitlines() if line.strip()}


# store stop words from every file of the stopwords directory in one set
stop_words = set()
for files in os.listdir(stopwords_dir):
  stop_words.update(_load_word_set(os.path.join(stopwords_dir, files)))

# store the tokens of every text file from the directory in a list (docs),
# dropping the tokens whose lower-cased form is a stop word
# NOTE(review): os.listdir() returns names in arbitrary order; the later cells
# assume this order matches the rows of the output sheet -- confirm.
docs = []
for text_file in os.listdir(text_dir):
  with open(os.path.join(text_dir, text_file), 'r', encoding='utf-8') as f:
    text = f.read()
  words = word_tokenize(text)
  filtered_text = [word for word in words if word.lower() not in stop_words]
  docs.append(filtered_text)

# store positive and negative words from the sentiment directory
pos = set()
neg = set()

# NOTE(review): every file other than 'positive-words.txt' is treated as a
# negative-word list -- confirm the directory holds nothing else.
for files in os.listdir(sentment_dir):
  target = pos if files == 'positive-words.txt' else neg
  target.update(_load_word_set(os.path.join(sentment_dir, files)))

## 5. Calculate Output Metrics


In [None]:
# Result containers: one entry per document is appended to each list.
positive_words, Negative_words = [], []
positive_score, negative_score = [], []
polarity_score, subjectivity_score = [], []
avg_sentence_length, Percentage_of_Complex_words, Fog_Index = [], [], []
complex_word_count, avg_syllable_word_count = [], []

for tokens in docs:
  # Tokens whose lower-cased form appears in the positive / negative word sets
  pos_hits = [word for word in tokens if word.lower() in pos]
  neg_hits = [word for word in tokens if word.lower() in neg]
  n_pos = len(pos_hits)
  n_neg = len(neg_hits)

  positive_words.append(pos_hits)
  Negative_words.append(neg_hits)
  positive_score.append(n_pos)
  negative_score.append(n_neg)
  # The small constant keeps both ratios defined when the denominator would be zero.
  polarity_score.append((n_pos - n_neg) / ((n_pos + n_neg) + 0.000001))
  subjectivity_score.append((n_pos + n_neg) / ((len(tokens)) + 0.000001))

# Formulas used by the readability metrics below:
#   Average Sentence Length    = number of words / number of sentences
#   Percentage of Complex words = number of complex words / number of words
#   Fog Index                  = 0.4 * (Average Sentence Length + Percentage of Complex words)

# NLTK's English stop-word list, used by the word-level metrics
c_stop_words = set(stopwords.words('english'))
def measure(file):
  """Compute readability metrics for one article file located in `text_dir`.

  Words are whitespace-separated tokens (punctuation removed) whose lower-cased
  form is not in `c_stop_words`. A word's syllables are approximated by its
  number of vowel letters (a, e, i, o, u).

  Args:
    file: name of the text file inside `text_dir`.

  Returns:
    A 5-tuple:
      avg_sentence_len        - number of words / number of sentences
      Percent_Complex_words   - complex words / words (a fraction, not x100)
      Fog_Index               - 0.4 * (avg_sentence_len + Percent_Complex_words)
      complex word count      - words containing more than two vowel letters
      avg_syllable_word_count - vowel letters per word, counted after dropping a
                                trailing "es"/"ed", over words with >= 1 vowel
    Ratios whose denominator is zero are returned as 0.
  """
  with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
    text = f.read()

  # remove punctuation, keeping '.' because it delimits sentences
  text = re.sub(r'[^\w\s.]', '', text)

  # split the text into sentences; blank fragments (e.g. the one after the
  # final period, or between the dots of "...") are not sentences
  sentences = [fragment for fragment in text.split('.') if fragment.strip()]
  num_sentences = len(sentences)

  # words of the file: strip the periods still attached to tokens so that the
  # stop-word filter and the "es"/"ed" suffix rule see the bare word
  words = []
  for token in text.split():
    token = token.strip('.')
    if token and token.lower() not in c_stop_words:
      words.append(token)
  num_words = len(words)

  vowels = 'aeiou'

  # complex words: more than two vowel letters
  complex_words = [
      word for word in words
      if sum(1 for letter in word if letter.lower() in vowels) > 2
  ]

  # syllable count per word, ignoring a trailing "es" / "ed"
  syllable_count = 0
  syllable_words = []
  for word in words:
    if word.endswith('es') or word.endswith('ed'):
      word = word[:-2]
    syllable_count_word = sum(1 for letter in word if letter.lower() in vowels)
    if syllable_count_word >= 1:
      syllable_words.append(word)
      syllable_count += syllable_count_word

  avg_sentence_len = num_words / num_sentences if num_sentences > 0 else 0
  avg_syllable_word_count = syllable_count / len(syllable_words) if len(syllable_words) > 0 else 0
  Percent_Complex_words = len(complex_words) / num_words if num_words > 0 else 0
  Fog_Index = 0.4 * (avg_sentence_len + Percent_Complex_words)

  return avg_sentence_len, Percent_Complex_words, Fog_Index, len(complex_words), avg_syllable_word_count

# Run measure() on every article and route its five results, in order, to the
# matching result lists.
# NOTE(review): relies on os.listdir() returning the files in the same order
# in every loop of this notebook -- confirm.
metric_lists = (avg_sentence_length,
                Percentage_of_Complex_words,
                Fog_Index,
                complex_word_count,
                avg_syllable_word_count)

for article_name in os.listdir(text_dir):
  for target, value in zip(metric_lists, measure(article_name)):
    target.append(value)

# Word Count and Average Word Length
word_count, average_word_length = [], []

def cleaned_words(file):
  """Return (word count, average word length) for one article in `text_dir`.

  Punctuation is removed first; a word is any whitespace-separated token whose
  lower-cased form is not in `c_stop_words`. The average is 0 when no words
  remain.
  """
  path = os.path.join(text_dir, file)
  with open(path, 'r') as handle:
    raw = handle.read()

  no_punct = re.sub(r'[^\w\s]', '', raw)
  kept = [token for token in no_punct.split() if token.lower() not in c_stop_words]

  if len(kept) > 0:
    mean_length = sum(map(len, kept)) / len(kept)
  else:
    mean_length = 0
  return len(kept), mean_length

# Word count and average word length for every article, in os.listdir() order
word_stats = [cleaned_words(article_name) for article_name in os.listdir(text_dir)]
word_count.extend(n_words for n_words, _ in word_stats)
average_word_length.extend(mean_len for _, mean_len in word_stats)


# Personal Pronouns
pp_count = list()

def count_personal_pronouns(file):
  """Count the personal pronouns I, we, my, ours, us in one article of `text_dir`.

  Matching is on whole words and ignores case, so sentence-initial forms such
  as "We" or "My" are counted. Two spellings are deliberately skipped:
    * "US" in capitals, which reads as the country abbreviation, and
    * lower-case "i", which shows up in tokens like "i.e." rather than as a pronoun.

  Args:
    file: name of the text file inside `text_dir`.

  Returns:
    The number of pronoun occurrences found.
  """
  with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
    text = f.read()

  # \b is used to match word boundaries
  pattern = r"\b(?:I|we|my|ours|us)\b"
  skipped = ("US", "i")

  count = 0
  for match in re.findall(pattern, text, flags=re.IGNORECASE):
    if match in skipped:
      continue
    count += 1
  return count

# Personal-pronoun count for every article, in os.listdir() order
pp_count.extend(count_personal_pronouns(article_name) for article_name in os.listdir(text_dir))


# Load the output template; the metric columns are filled in below.
output_df = pd.read_excel('/content/gdrive/MyDrive/BlackCoffer/Output Data Structure.xlsx')

# URL_ID 36 and 49 do not exist, i.e. the pages do not exist (404 error)
# output_df.drop([35,48], axis = 0, inplace=True)

# These are the required parameters: one metric list per output column, in
# column order. (They were previously written as bare words with spaces,
# which is not valid Python.)
variables = [positive_score,               # POSITIVE SCORE
             negative_score,               # NEGATIVE SCORE
             polarity_score,               # POLARITY SCORE
             subjectivity_score,           # SUBJECTIVITY SCORE
             avg_sentence_length,          # AVG SENTENCE LENGTH
             Percentage_of_Complex_words,  # PERCENTAGE OF COMPLEX WORDS
             Fog_Index,                    # FOG INDEX
             avg_sentence_length,          # AVG NUMBER OF WORDS PER SENTENCE (same words / sentences ratio)
             # NOTE(review): this slot was labelled "AVG NUMBER OF WORDS PER SENTENCE"
             # a second time; presumably it is the complex word count, which is what
             # the print loop below shows in this position -- confirm against the template.
             complex_word_count,
             word_count,                   # WORD COUNT
             avg_syllable_word_count,      # SYLLABLE PER WORD
             pp_count,                     # PERSONAL PRONOUNS
             average_word_length]          # AVG WORD LENGTH

# One tuple of metric values per document
rows = list(zip(*variables))

# Never index past the shorter of the template and the computed metrics.
# NOTE(review): rows are matched to the template by position, which assumes
# os.listdir() returned the article files in template order -- confirm.
n_rows = min(len(output_df), len(rows))

for i in range(n_rows):
    url_id = output_df.iloc[i, 0]
    print(f"{url_id}\t{positive_score[i]}\t{negative_score[i]}\t{polarity_score[i]}\t"
          f"{subjectivity_score[i]}\t{avg_sentence_length[i]}\t{Percentage_of_Complex_words[i]}\t"
          f"{Fog_Index[i]}\t{complex_word_count[i]}\t{word_count[i]}\t{avg_syllable_word_count[i]}\t"
          f"{pp_count[i]}\t{average_word_length[i]}")

# Save the values to the output file: the metric columns start at position 2
# (positions 0 and 1 of the template are left untouched).
output_df.iloc[:n_rows, 2:2 + len(variables)] = rows[:n_rows]
output_df.to_excel('/content/gdrive/MyDrive/BlackCoffer/Output_Data.xlsx', index=False)

blackassign0001	33	6	0.6923076745562136	0.05379310337407848	8.2625	0.4084720121028744	3.4683888048411498	270	653	2.3525741029641187	6	6.28177641653905
blackassign0002	60	31	0.31868131517932624	0.08939096258409532	10.902439024390244	0.5637583892617449	4.586478965460795	504	890	2.738728323699422	3	7.21123595505618
blackassign0003	38	24	0.22580644797086374	0.07928388736664464	11.947368421052632	0.6270190895741556	5.029755004250715	427	681	2.9925373134328357	13	8.055800293685756
blackassign0004	38	75	-0.3274336254209414	0.14505776618092714	13.115384615384615	0.5909090909090909	5.482517482517483	403	681	2.8652694610778444	4	7.770925110132159
blackassign0005	22	8	0.4666666511111116	0.0659340657891559	10.292682926829269	0.48578199052132703	4.311385966940239	205	422	2.644230769230769	6	7.232227488151659
blackassign0006	87	27	0.5263157848568791	0.0853932583630013	12.813186813186814	0.5969125214408233	5.364039733851055	696	1162	2.877963125548727	6	7.791738382099828
blackassign0007	32	71	-0.37864

In [None]:
# Print every metric list next to its label, one line per metric.
labelled_metrics = [
    ("Positive Score:", positive_score),
    ("Negative Score:", negative_score),
    ("Polarity Score:", polarity_score),
    ("Subjectivity Score:", subjectivity_score),
    ("Average Sentence Length:", avg_sentence_length),
    ("Percentage of Complex Words:", Percentage_of_Complex_words),
    ("Fog Index:", Fog_Index),
    ("Complex Word Count:", complex_word_count),
    ("Word Count:", word_count),
    ("Average Syllable Word Count:", avg_syllable_word_count),
    ("Personal Pronouns Count:", pp_count),
    ("Average Word Length:", average_word_length),
]

for label, values in labelled_metrics:
  print(label, values)

Positive Score: [33, 60, 38, 38, 22, 87, 32, 30, 38, 136, 60, 81, 38, 0, 35, 35, 49, 34, 62, 0, 22, 13, 30, 23, 21, 23, 27, 35, 2, 60, 66, 70, 61, 46, 27, 0, 38, 67, 51, 37, 31, 67, 1, 26, 80, 27, 14, 3, 0, 24, 17, 59, 41, 5, 31, 25, 15, 5, 23, 9, 48, 10, 23, 26, 43, 34, 25, 42, 31, 42, 23, 66, 27, 31, 23, 40, 23, 37, 33, 36, 37, 29, 0, 1, 39, 55, 30, 17, 17, 36, 24, 1, 4, 32, 8, 28, 22, 7, 0, 1]
Negative Score: [6, 31, 24, 75, 8, 27, 71, 9, 51, 182, 20, 24, 13, 0, 27, 27, 12, 10, 6, 0, 47, 10, 16, 3, 11, 15, 26, 27, 0, 38, 38, 28, 25, 23, 12, 0, 13, 41, 68, 24, 29, 24, 0, 0, 39, 0, 6, 4, 0, 69, 15, 38, 3, 1, 9, 8, 9, 0, 37, 2, 19, 18, 37, 49, 23, 26, 7, 8, 6, 16, 74, 23, 49, 26, 82, 35, 13, 25, 53, 12, 49, 71, 0, 0, 34, 16, 34, 44, 42, 48, 27, 0, 3, 49, 25, 57, 35, 3, 0, 0]
Polarity Score: [0.6923076745562136, 0.31868131517932624, 0.22580644797086374, -0.3274336254209414, 0.4666666511111116, 0.5263157848568791, -0.3786407730229051, 0.5384615246548328, -0.14606741408913018, -0.14465408