In [1]:
import pandas as pd
import nltk
from nltk import sent_tokenize,WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
import requests
from bs4 import BeautifulSoup

### making list of positive and negative words using link given in problem statement and a list of stop words from nltk

In [6]:
word_dict=pd.read_csv('files/master dictionary/LoughranMcDonald_MasterDictionary_2020.csv')

In [7]:
positive_words=list(word_dict.Word[word_dict.Positive>0])
positive_words=[word.lower() for word in positive_words]

In [8]:
negative_words=list(word_dict.Word[word_dict.Negative>0])
negative_words=[word.lower() for word in negative_words]

In [9]:
stop_words=stopwords.words('english')

### reading input

In [10]:
input_data=pd.read_excel('files/Input.xlsx')

### extracting text files from articles and saving them with their respective id names in articles folder

In [11]:
def extract_text(url_id,url):
    page=requests.get(url, headers={"User-Agent": "XY"}) 
    data=page.content
    soup=BeautifulSoup(data,'html.parser')
    article_title = soup.find('title').text.strip()
    article_content=soup.find("div",attrs={'class':"td-post-content"}).text.strip()
    with open(f"articles/{url_id}.txt",'w', encoding="utf-8") as f:
        f.write(article_title+'\n')
        f.write(article_content)

In [9]:
input_data.apply(lambda row: extract_text(row['URL_ID'],row['URL']),axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
165    None
166    None
167    None
168    None
169    None
Length: 170, dtype: object

### function to preprocess text, i.e, break text into sentences and words and then removing stop words from text and lowercasing it

In [12]:
def preprocess_text(text:str):
    sentences=sent_tokenize(text)
    tokens=word_tokenize(text)
    lemma=WordNetLemmatizer()
    punctuations=['?','!',',','.']
    words=[lemma.lemmatize(word.lower()) for word in tokens if word.isalpha() and word not in stop_words and word not in punctuations]
    return sentences,tokens,words

### to count positive and negative score of a text 

In [13]:
def score(words):
    pos_score=0
    neg_score=0
    for word in words:
        if word in positive_words:
            pos_score+=1
        if word in negative_words:
            neg_score-=1
    neg_score*=-1
    return pos_score,neg_score

### to count complex words

In [14]:
def count_syllables(word:str):
    vowels='aeiou'
    endings=['es','ed']
    syllables=0
    for i in word:
        if i in vowels:
            syllables+=1
    for end in endings:
        if word.endswith(end):
            syllables-=1
            break
    return syllables
def count_complex_words(words):
    complex_word_counts=0
    total_syllables=0
    for word in words:
        if count_syllables(word)>2:
            complex_word_counts+=1
        total_syllables+=1
    return complex_word_counts,total_syllables

### to count personal prononuns 

In [15]:
def count_pronouns(words):
    pronouns=['i','we','us','ours','my','I','We','Us','Ours','My']
    pronoun_count=0
    for word in words:
        if word in pronouns:
            pronoun_count+=1
    return pronoun_count

### total number of characters in a text 

In [16]:
def count_characters(words):
    character_count=0
    for word in words:
        character_count+=len(word)
        
    return character_count

### analyse the text for respective variables

In [17]:
def analyse_text(url_id):
    with open(f'articles/{url_id}.txt','r',encoding='utf8') as file:
        text=file.read().lower()
    sentences,words,cleaned_words=preprocess_text(text)
    pos_score,neg_score=score(cleaned_words)
    
    polarity=round((pos_score-neg_score)/((pos_score+neg_score)+0.000001),6)
    
    subjectivity=round((pos_score+neg_score)/(len(cleaned_words)+0.000001),6)
    
    average_sentence_length=round(len(words)/len(sentences))
    
    complex_word_counts, syllable_count =count_complex_words(words)
    
    complex_words_percentage=round(complex_word_counts*100/len(words),ndigits=6)
    
    fog_index=round(.4*(average_sentence_length+complex_words_percentage),ndigits=6)
    
    clean_word_count=len(cleaned_words)
    
    syllable_per_word= round(syllable_count/len(words))
    
    pronoun_count=count_pronouns(words)
    
    character_count=count_characters(words)
    
    average_word_length=round(character_count/len(words))
    
    return pos_score,neg_score,polarity,subjectivity,average_sentence_length,complex_words_percentage,fog_index,average_sentence_length,complex_word_counts,clean_word_count,syllable_per_word,pronoun_count,average_word_length


In [18]:
# making list of all new columns to be added
new=['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE','SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH','PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT','SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

In [19]:
# output dataframe for storing output
output_data=input_data

In [20]:
#making dataframe of all new columns to be added for analysed test
new_data=pd.DataFrame(list(input_data.apply(lambda row: analyse_text(row['URL_ID']),axis=1)),columns=new)

In [21]:
# joining output and new data
output_data=output_data.join(new_data)

In [22]:
output_data

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,1.0,https://insights.blackcoffer.com/how-is-login-...,4,5,-0.111111,0.021378,33,21.638142,21.855257,33,177,421,1,4,5
1,2.0,https://insights.blackcoffer.com/how-does-ai-h...,8,6,0.142857,0.036842,25,24.860335,19.944134,25,178,380,1,2,5
2,3.0,https://insights.blackcoffer.com/ai-and-its-im...,32,20,0.230769,0.049952,26,26.269098,20.907639,26,533,1041,1,13,5
3,4.0,https://insights.blackcoffer.com/how-do-deep-l...,6,1,0.714286,0.026718,31,26.626016,23.050406,31,131,262,1,1,5
4,5.0,https://insights.blackcoffer.com/how-artificia...,20,16,0.111111,0.053812,24,23.131673,18.852669,24,325,669,1,27,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,167.0,https://insights.blackcoffer.com/role-big-data...,15,38,-0.433962,0.062871,25,21.846330,18.738532,25,381,843,1,15,5
166,168.0,https://insights.blackcoffer.com/sales-forecas...,21,12,0.272727,0.056410,29,25.272727,21.709091,29,278,585,1,0,5
167,169.0,https://insights.blackcoffer.com/detect-data-e...,3,49,-0.884615,0.093190,21,25.448029,18.579212,21,284,558,1,6,5
168,170.0,https://insights.blackcoffer.com/data-exfiltra...,4,10,-0.428571,0.044164,24,21.777778,18.311111,24,147,317,1,11,4


In [23]:
output_data.describe()

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
count,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0,170.0
mean,86.247059,14.8,21.882353,-0.078236,0.057684,26.776471,22.513666,19.716055,26.776471,284.664706,620.494118,1.0,6.4,4.758824
std,49.547994,9.71158,18.383754,0.492179,0.022251,14.26244,3.636588,5.865223,14.26244,127.281914,275.570548,0.0,8.13961,0.429061
min,1.0,0.0,0.0,-1.0,0.009901,15.0,11.663067,12.065589,15.0,35.0,88.0,1.0,0.0,4.0
25%,43.5,7.25,9.0,-0.441967,0.041648,22.0,20.175687,17.532314,22.0,184.0,413.0,1.0,1.0,5.0
50%,86.5,13.5,18.0,-0.142857,0.055795,25.0,22.191959,19.266744,25.0,277.0,597.0,1.0,4.0,5.0
75%,128.75,20.75,32.75,0.333333,0.070979,28.0,24.965084,21.144596,28.0,375.75,817.0,1.0,8.0,5.0
max,171.0,47.0,121.0,1.0,0.138996,191.0,32.125604,84.874043,191.0,658.0,1927.0,1.0,44.0,5.0


In [25]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   URL_ID                            170 non-null    float64
 1   URL                               170 non-null    object 
 2   POSITIVE SCORE                    170 non-null    int64  
 3   NEGATIVE SCORE                    170 non-null    int64  
 4   POLARITY SCORE                    170 non-null    float64
 5   SUBJECTIVITY SCORE                170 non-null    float64
 6   AVG SENTENCE LENGTH               170 non-null    int64  
 7   PERCENTAGE OF COMPLEX WORDS       170 non-null    float64
 8   FOG INDEX                         170 non-null    float64
 9   AVG NUMBER OF WORDS PER SENTENCE  170 non-null    int64  
 10  COMPLEX WORD COUNT                170 non-null    int64  
 11  WORD COUNT                        170 non-null    int64  
 12  SYLLABLE

In [26]:
output_data.loc[95]

URL_ID                                                                           97.0
URL                                 https://insights.blackcoffer.com/how-will-covi...
POSITIVE SCORE                                                                      7
NEGATIVE SCORE                                                                     18
POLARITY SCORE                                                                  -0.44
SUBJECTIVITY SCORE                                                           0.056818
AVG SENTENCE LENGTH                                                                77
PERCENTAGE OF COMPLEX WORDS                                                 11.663067
FOG INDEX                                                                   35.465227
AVG NUMBER OF WORDS PER SENTENCE                                                   77
COMPLEX WORD COUNT                                                                108
WORD COUNT                                            

In [27]:
output_data.to_excel(f'files/output.xlsx')