In [1]:
import os
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import re

In [2]:
# loading all text files
location = "C:\\Users\\ADMIN\\Desktop\\BlackCoffer\\Text_Files\\"
l = []
for f in os.listdir(location):
    l.append(f)

In [3]:
#loading stop words file
with open('StopWords_Generic.txt', 'r', encoding='utf-8') as f:
    stop_words = f.read()

In [4]:
STOPWORDS = stop_words.split("\n")
STOPWORDS = [x.lower() for x in STOPWORDS]

In [5]:
#loading the master dictionary for positive and negative sentiments
master = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2021.csv')
master.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Syllables,Source
0,AARDVARK,1,354,1.55008e-08,1.4226e-08,3.815486e-06,99,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.313627e-10,8.653817e-12,9.241714e-09,1,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.940882e-10,1.169679e-10,5.290465e-08,7,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.26984e-09,6.654735e-10,1.5951e-07,28,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,8570,3.752595e-07,3.809464e-07,3.529356e-05,1108,0,0,0,0,0,0,0,3,12of12inf


In [6]:
#removing rows with missing values
master.dropna(axis=0, inplace=True)

In [7]:
#making separate list for positive and negative words
positive_words = list(master['Word'][master['Negative']==0])
positive_words = [x.lower() for x in positive_words]
negative_words = list(master['Word'][master['Negative']!=0])
negative_words = [x.lower() for x in negative_words]

In [8]:
def load_text_files(text_file):
    with open(location + text_file,'r',encoding='utf-8') as f:
        actual_text = f.read()
    
    return actual_text

In [9]:
def removing_stopwords(original_text):
    actual_text_list = original_text.split()
        
    text_without_stopwords = [word for word in actual_text_list if word.lower() not in STOPWORDS]
    result = ' '.join(text_without_stopwords)

    return result

In [10]:
def clean_text(txt_no_stopwords):
    tokens = word_tokenize(txt_no_stopwords)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    #removing words that are less than 2 alphabets 
    tokens = [word for word in tokens if len(word) > 2]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    return words

In [11]:
def Extracting_Derived_variables(text_tokenized):
    pos_score = 0
    neg_score = 0
    
    for i in range(len(text_tokenized)):
        if text_tokenized[i] in positive_words:
            pos_score+=1
        elif text_tokenized[i] in negative_words:
            neg_score-=1
    
    Pos_score = pos_score
    #negative score is negative so multiplying it by -1 to make it a positive number
    Neg_score = neg_score*(-1)
    Polarity_Score = (Pos_score - Neg_score)/ ((Pos_score + Neg_score) + 0.000001)
    Word_Count = len(text_tokenized)
    Subjectivity_Score = (Pos_score + Neg_score)/((Word_Count) + 0.000001)
    
    return Pos_score , Neg_score, Polarity_Score, Subjectivity_Score, Word_Count

In [12]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e") and (word.endswith("able")==False):
        count -= 1
    if (word.endswith("ed") and (word.endswith("eed")==False)) or (word.endswith("es") and (word.endswith("ees")==False)):
        count-=1
    if count == 0:
        count += 1
        
    
    return count

In [13]:
def complex_word_count(words_list):
    complex_words=[]
    for i in words_list:
        if syllable_count(i) > 2:
            complex_words.append(i)
    return complex_words

In [14]:
def Analysis_of_Readability(original_text):
    number_of_sentences = sent_tokenize(original_text)
    Average_Sentence_Length = word_count /len(number_of_sentences)
    
    Percentage_of_Complex_words = len(complex_words) / word_count
    
    Fog_Index = 0.4 * (Average_Sentence_Length + Percentage_of_Complex_words)
    
    return Average_Sentence_Length,Percentage_of_Complex_words, Fog_Index

In [15]:
def Personal_Pronouns(original_text):
    pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
    pronouns = pronounRegex.findall(original_text)
    
    return len(pronouns)

In [16]:
def Average_Word_Length(words_list):
    average = sum(len(word) for word in words_list) / word_count
    return average

In [17]:
URL_ID = []
POSITIVE_SCORE = []              
NEGATIVE_SCORE = []                     
POLARITY_SCORE = []                
SUBJECTIVITY_SCORE = []
AVG_SENTENCE_LENGTH = []
PERCENTAGE_OF_COMPLEX_WORDS =[]
FOG_INDEX =[]
AVG_NUMBER_OF_WORDS_PER_SENTENCE =[]
COMPLEX_WORD_COUNT =[]
WORD_COUNT = []
SYLLABLE_PER_WORD =[]
PERSONAL_PRONOUNS =[]
AVG_WORD_LENGTH =[]

In [18]:
for i in range(len(l)):
    text = load_text_files(l[i])
    text_no_stopwords = removing_stopwords(text)
    cleaned_word_list = clean_text(text_no_stopwords)
    positive_score , negative_score, polarity_score, subjectivity_score, word_count = Extracting_Derived_variables(cleaned_word_list)
    syllable_list = list(map(syllable_count, cleaned_word_list))
    syllable_per_word = sum(syllable_list)/word_count
    complex_words = complex_word_count(cleaned_word_list)
    Average_Sentence_Length,Percentage_of_Complex_words, Fog_Index = Analysis_of_Readability(text)
    personal_pronouns = Personal_Pronouns(text)
    average_word_length = Average_Word_Length(cleaned_word_list)
    URL_ID.append(l[i].strip(".txt"))
    POSITIVE_SCORE.append(positive_score)
    NEGATIVE_SCORE.append(negative_score)                     
    POLARITY_SCORE.append(round(polarity_score,2))             
    SUBJECTIVITY_SCORE.append(round(subjectivity_score,2))
    AVG_SENTENCE_LENGTH.append(round(Average_Sentence_Length,2))
    PERCENTAGE_OF_COMPLEX_WORDS.append(round(Percentage_of_Complex_words*100, 2))
    FOG_INDEX.append(round(Fog_Index,2))
    AVG_NUMBER_OF_WORDS_PER_SENTENCE.append(round(Average_Sentence_Length,2))
    COMPLEX_WORD_COUNT.append(len(complex_words))
    WORD_COUNT.append(word_count)
    SYLLABLE_PER_WORD.append(round(syllable_per_word, 2))
    PERSONAL_PRONOUNS.append(personal_pronouns)
    AVG_WORD_LENGTH.append(round(average_word_length,2))
    

In [19]:
d = {'URL_ID':URL_ID,'POSITIVE SCORE':POSITIVE_SCORE, 'NEGATIVE SCORE':NEGATIVE_SCORE, 'POLARITY SCORE':POLARITY_SCORE,\
    'SUBJECTIVITY SCORE':SUBJECTIVITY_SCORE, "AVG SENTENCE LENGTH": AVG_SENTENCE_LENGTH, 'PERCENTAGE OF COMPLEX WORDS':PERCENTAGE_OF_COMPLEX_WORDS,\
    'FOG INDEX':FOG_INDEX, 'AVG NUMBER OF WORDS PER SENTENCE':AVG_NUMBER_OF_WORDS_PER_SENTENCE, 'COMPLEX WORD COUNT':COMPLEX_WORD_COUNT,\
    'WORD COUNT':WORD_COUNT,'SYLLABLE PER WORD':SYLLABLE_PER_WORD, 'PERSONAL PRONOUNS':PERSONAL_PRONOUNS, 'AVG WORD LENGTH':AVG_WORD_LENGTH }

In [20]:
df = pd.DataFrame(d)
df.tail()

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
165,95,654,44,0.87,0.93,11.06,21.81,4.51,11.06,164,752,1.92,4,6.49
166,96,703,40,0.89,0.88,13.9,30.54,5.68,13.9,259,848,2.17,3,6.88
167,97,386,22,0.89,0.92,40.45,13.71,16.24,40.45,61,445,1.62,46,5.49
168,98,836,61,0.86,0.93,15.32,29.53,6.25,15.32,285,965,2.07,10,6.96
169,99,550,16,0.94,0.98,7.57,20.52,3.11,7.57,118,575,1.86,7,6.19


In [21]:
df_urls = pd.read_excel("Input.xlsx")
df_urls['URL_ID'] = df_urls['URL_ID'].astype(str)
df_urls['URL_ID']  = df_urls['URL_ID'].apply(lambda x : x.rstrip('0').rstrip('.'))

In [22]:
df_urls.head()

Unnamed: 0,URL_ID,URL
0,1,https://insights.blackcoffer.com/how-is-login-...
1,2,https://insights.blackcoffer.com/how-does-ai-h...
2,3,https://insights.blackcoffer.com/ai-and-its-im...
3,4,https://insights.blackcoffer.com/how-do-deep-l...
4,5,https://insights.blackcoffer.com/how-artificia...


In [23]:
output = df_urls.merge(df, on='URL_ID', how='left')

In [24]:
output.to_csv("Output Data Structure.csv", index=False)

In [25]:
output.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1,https://insights.blackcoffer.com/how-is-login-...,386,5,0.97,0.96,16.92,23.89,6.86,16.92,97,406,1.99,4,6.67
1,2,https://insights.blackcoffer.com/how-does-ai-h...,369,6,0.97,0.98,13.64,29.58,5.58,13.64,113,382,2.13,2,6.91
2,3,https://insights.blackcoffer.com/ai-and-its-im...,924,23,0.95,0.95,13.36,34.23,5.48,13.36,343,1002,2.22,13,7.19
3,4,https://insights.blackcoffer.com/how-do-deep-l...,255,1,0.99,0.98,17.33,30.77,7.06,17.33,80,260,2.17,1,7.02
4,5,https://insights.blackcoffer.com/how-artificia...,355,12,0.93,0.96,10.11,34.38,4.18,10.11,132,384,2.23,21,7.07
