In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer


In [2]:
!pip install textstat



In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
df=pd.read_csv('Input.xlsx.csv')

In [5]:
df

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...
110,147,https://insights.blackcoffer.com/the-future-of...
111,148,https://insights.blackcoffer.com/big-data-anal...
112,149,https://insights.blackcoffer.com/business-anal...


# Data Extraction

In [6]:
import requests
from bs4 import BeautifulSoup

In [7]:
# loop through each URL and extract article text
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']


In [8]:
    # Get the HTML content of the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

In [9]:
# Find the article title and text
article_title = soup.find('h1').text.strip()
article_text = ''
article_content = soup.find('div', {'class': 'entry-content'})
if article_content is not None:
    for p in article_content.find_all('p'):
        article_text += p.text.strip() + '\n'

In [10]:
# Save the extracted article in a text file with URL_ID as its file name
with open(f'{url_id}.txt', 'w', encoding='utf-8') as f:
    f.write(article_title + '\n\n')
    f.write(article_text)

# Sentiment Analysis

In [11]:
from textblob import TextBlob


In [12]:
# Extract the URL column as a Series
url_series = df["URL"]


In [13]:
# Define a function to extract text from a URL and apply sentiment analysis to it
def get_sentiment(url):
    # Code to extract text from the URL goes here
    # For example, you could use the requests library to fetch the HTML content
    # of the URL and then use BeautifulSoup or another library to extract the text
    
    # Once you have extracted the text, apply sentiment analysis using TextBlob
    text_blob = TextBlob(url)
    return text_blob.sentiment.polarity

In [14]:
# Apply the sentiment analysis function to each row of the URL Series
sentiment_series = url_series.apply(get_sentiment)

In [15]:
# Add the sentiment column to the DataFrame
df["sentiment"] = sentiment_series

In [16]:
# Save the output data to a CSV file
df.to_csv("output.csv", index=False)

In [17]:
df

Unnamed: 0,URL_ID,URL,sentiment
0,37,https://insights.blackcoffer.com/ai-in-healthc...,0.0
1,38,https://insights.blackcoffer.com/what-if-the-c...,0.0
2,39,https://insights.blackcoffer.com/what-jobs-wil...,0.0
3,40,https://insights.blackcoffer.com/will-machine-...,0.0
4,41,https://insights.blackcoffer.com/will-ai-repla...,0.0
...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,0.0
110,147,https://insights.blackcoffer.com/the-future-of...,0.0
111,148,https://insights.blackcoffer.com/big-data-anal...,0.0
112,149,https://insights.blackcoffer.com/business-anal...,0.0


In [18]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
def clean_text(text):
    # Remove any unwanted characters, punctuation marks, and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    # Join the remaining words back into a string
    text = " ".join(words)
    return text

# 1.2 Creating dictionary of Positive and Negative words

In [20]:
from nltk.corpus import opinion_lexicon

pos_words = set(opinion_lexicon.positive())
neg_words = set(opinion_lexicon.negative())

In [21]:
pos_words

{'elatedly',
 'loyalty',
 'witty',
 'splendor',
 'affectionate',
 'convincing',
 'peach',
 'agility',
 'succes',
 'tantalizingly',
 'excitedly',
 'instantly',
 'outperforming',
 'happier',
 'envious',
 'smile',
 'undamaged',
 'competitive',
 'elegantly',
 'exceled',
 'willingness',
 'poeticize',
 'chivalrous',
 'lovably',
 'bravo',
 'wows',
 'luck',
 'saintliness',
 'amiability',
 'surmount',
 'warm',
 'tops',
 'thoughtfulness',
 'great',
 'spotless',
 'securely',
 'sexy',
 'all-around',
 'advocated',
 'strong',
 'peps',
 'enjoy',
 'continuity',
 'hotcake',
 'fastest',
 'abounds',
 'exultant',
 'favor',
 'jubilant',
 'realizable',
 'sincerity',
 'diligence',
 'adoringly',
 'favour',
 'suffice',
 'magnificence',
 'exceeding',
 'daring',
 'pampers',
 'beckoning',
 'dumbfounded',
 'seamless',
 'hooray',
 'vigilant',
 'better-known',
 'chic',
 'headway',
 'timely',
 'dependably',
 'properly',
 'thumbs-up',
 'fascinatingly',
 'grandeur',
 'progress',
 'envy',
 'spectacular',
 'congratulator

In [22]:
neg_words

{'underestimate',
 'assassinate',
 'cartoonish',
 'lewdly',
 'nitpicking',
 'wheedle',
 'unhelpful',
 'anarchistic',
 'conspiracies',
 'crueler',
 'inessential',
 'selfinterested',
 'toil',
 'anxious',
 'crashes',
 'rampant',
 'languid',
 'over-balanced',
 'alarming',
 'froze',
 'picky',
 'fatalistically',
 'dilemma',
 'pratfall',
 'afflict',
 'childish',
 'hindrance',
 'devilishly',
 'alarmed',
 'nervous',
 'helplessly',
 'undignified',
 'distrustful',
 'absurdity',
 'tentatively',
 'sorrowful',
 'biases',
 'inconsolably',
 'phony',
 'fumble',
 'gimmicky',
 'flighty',
 'volatile',
 'debilitate',
 'discriminatory',
 'criticizing',
 'frustrated',
 'instigate',
 'disturbingly',
 'envious',
 'snare',
 'stale',
 'unfortunately',
 'dehumanization',
 'argumentative',
 'betraying',
 'donside',
 'election-rigger',
 'fulminate',
 'insolence',
 'diabolically',
 'disorderly',
 'grossly',
 'marginally',
 'negativity',
 'insecure',
 'recessionary',
 'condemnation',
 'filth',
 'extraneous',
 'injudi

# 1.3 Extracting Derived variables

In [23]:
from textblob import TextBlob

def get_adjectives(text):
    blob = TextBlob(text)
    return len([word for (word, tag) in blob.tags if tag.startswith("JJ")])

def get_adverbs(text):
    blob = TextBlob(text)
    return len([word for (word, tag) in blob.tags if tag.startswith("RB")])

def get_pronouns(text):
    blob = TextBlob(text)
    return len([word for (word, tag) in blob.tags if tag == "PRP"])

# 2 Analysis of Readability

In [24]:
import textstat

def get_readability_scores(text):
    scores = {}
    scores["flesch_reading_ease"] = textstat.flesch_reading_ease(text)
    scores["smog_index"] = textstat.smog_index(text)
    scores["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(text)
    return scores

# 3 Average Number of Words Per Sentence

In [25]:


def get_avg_words_per_sentence(text):
    ts = textstatistics()
    return ts.avg_sentence_length(text)

# 4 Complex Word Count

In [26]:
def get_complex_word_count(text):
    ts = textstatistics()
    return ts.lexicon_count_complex(text)

# 5 Word Count

In [27]:
def get_word_count(text):
    ts = textstatistics()
    return ts.lexicon_count(text)

# 6 Syllable Count Per Word

In [28]:
def get_syllable_count(text):
    ts = textstatistics()
    return ts.syllable_count(text)

# 7 Personal Pronouns

In [29]:
def get_personal_pronouns(text):
    personal_pronouns = ["I", "me", "my", "mine", "myself", "you", "your", "yours", "yourself", "we", "us", "our", "ours", "ourselves", "yourselves", "they", "them", "their", "theirs", "themselves"]
    blob = TextBlob(text)
    return len([word for word in blob.words if word.lower() in personal_pronouns])

In [30]:
df.to_csv('output.csv', index=False)

In [31]:
df

Unnamed: 0,URL_ID,URL,sentiment
0,37,https://insights.blackcoffer.com/ai-in-healthc...,0.0
1,38,https://insights.blackcoffer.com/what-if-the-c...,0.0
2,39,https://insights.blackcoffer.com/what-jobs-wil...,0.0
3,40,https://insights.blackcoffer.com/will-machine-...,0.0
4,41,https://insights.blackcoffer.com/will-ai-repla...,0.0
...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,0.0
110,147,https://insights.blackcoffer.com/the-future-of...,0.0
111,148,https://insights.blackcoffer.com/big-data-anal...,0.0
112,149,https://insights.blackcoffer.com/business-anal...,0.0


In [32]:
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
from textstat import syllable_count, lexicon_count

In [33]:
# Define function to remove stopwords from text
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in stop_words])


In [34]:
# Define function to get the average number of words per sentence
def get_avg_words_per_sentence(text):
    sentences = text.split('.')
    num_words = 0
    num_sentences = len(sentences)
    for sentence in sentences:
        num_words += lexicon_count(sentence)
    if num_sentences > 0:
        return num_words / num_sentences
    else:
        return 0

In [35]:
# Define function to get the complex word count
def get_complex_word_count(text):
    words = text.split()
    num_complex_words = 0
    for word in words:
        if syllable_count(word) >= 3:
            num_complex_words += 1
    return num_complex_words

In [36]:
# Clean text using stop words
df['cleaned_text'] = df['URL'].apply(remove_stopwords)

In [37]:
# Get sentiment polarity and subjectivity
df['polarity'] = df['cleaned_text'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['subjectivity'] = df['cleaned_text'].apply(lambda text: TextBlob(text).sentiment.subjectivity)


In [38]:
# Add new columns for derived variables
df['word_count'] = df['cleaned_text'].apply(lambda text: len(text.split()))
df['avg_word_length'] = df['cleaned_text'].apply(lambda text: sum(len(word) for word in text.split()) / len(text.split()))
df['syllable_count'] = df['cleaned_text'].apply(lambda text: sum(syllable_count(word) for word in text.split()))
df['personal_pronouns'] = df['cleaned_text'].apply(lambda text: text.lower().count('i ') + text.lower().count('me ') + text.lower().count('my ') + text.lower().count('mine '))
df['avg_words_per_sentence'] = df['URL'].apply(get_avg_words_per_sentence)
df['complex_word_count'] = df['cleaned_text'].apply(get_complex_word_count)

In [39]:
# Write results to new CSV file
df.to_csv('output_data.csv', index=False)

In [40]:
df

Unnamed: 0,URL_ID,URL,sentiment,cleaned_text,polarity,subjectivity,word_count,avg_word_length,syllable_count,personal_pronouns,avg_words_per_sentence,complex_word_count
0,37,https://insights.blackcoffer.com/ai-in-healthc...,0.0,https://insights.blackcoffer.com/ai-in-healthc...,0.0,0.0,1,78.0,14,0,1.0,1
1,38,https://insights.blackcoffer.com/what-if-the-c...,0.0,https://insights.blackcoffer.com/what-if-the-c...,0.0,0.0,1,81.0,12,0,1.0,1
2,39,https://insights.blackcoffer.com/what-jobs-wil...,0.0,https://insights.blackcoffer.com/what-jobs-wil...,0.0,0.0,1,86.0,13,0,1.0,1
3,40,https://insights.blackcoffer.com/will-machine-...,0.0,https://insights.blackcoffer.com/will-machine-...,0.0,0.0,1,86.0,15,0,1.0,1
4,41,https://insights.blackcoffer.com/will-ai-repla...,0.0,https://insights.blackcoffer.com/will-ai-repla...,0.0,0.0,1,68.0,11,0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,0.0,https://insights.blackcoffer.com/blockchain-fo...,0.0,0.0,1,57.0,9,0,1.0,1
110,147,https://insights.blackcoffer.com/the-future-of...,0.0,https://insights.blackcoffer.com/the-future-of...,0.0,0.0,1,57.0,11,0,1.0,1
111,148,https://insights.blackcoffer.com/big-data-anal...,0.0,https://insights.blackcoffer.com/big-data-anal...,0.0,0.0,1,66.0,15,0,1.0,1
112,149,https://insights.blackcoffer.com/business-anal...,0.0,https://insights.blackcoffer.com/business-anal...,0.0,0.0,1,79.0,18,0,1.0,1


In [42]:
# Compute sentiment scores using TextBlob
df['sentiment'] = df['cleaned_text'].apply(lambda x: TextBlob(x).sentiment)


In [43]:
# Extract the positive and negative scores from the sentiment object
df['positive_score'] = df['sentiment'].apply(lambda x: x.polarity if x.polarity > 0 else 0)
df['negative_score'] = df['sentiment'].apply(lambda x: abs(x.polarity) if x.polarity < 0 else 0)

In [44]:
# Extract the positive and negative scores from the sentiment object
df['positive_score'] = df['sentiment'].apply(lambda x: x.polarity if x.polarity > 0 else 0)
df['negative_score'] = df['sentiment'].apply(lambda x: abs(x.polarity) if x.polarity < 0 else 0)

In [45]:
# Write the results to a new CSV file
df.to_csv('output.csv', index=False)

In [46]:
df

Unnamed: 0,URL_ID,URL,sentiment,cleaned_text,polarity,subjectivity,word_count,avg_word_length,syllable_count,personal_pronouns,avg_words_per_sentence,complex_word_count,positive_score,negative_score
0,37,https://insights.blackcoffer.com/ai-in-healthc...,"(0.0, 0.0)",https://insights.blackcoffer.com/ai-in-healthc...,0.0,0.0,1,78.0,14,0,1.0,1,0,0
1,38,https://insights.blackcoffer.com/what-if-the-c...,"(0.0, 0.0)",https://insights.blackcoffer.com/what-if-the-c...,0.0,0.0,1,81.0,12,0,1.0,1,0,0
2,39,https://insights.blackcoffer.com/what-jobs-wil...,"(0.0, 0.0)",https://insights.blackcoffer.com/what-jobs-wil...,0.0,0.0,1,86.0,13,0,1.0,1,0,0
3,40,https://insights.blackcoffer.com/will-machine-...,"(0.0, 0.0)",https://insights.blackcoffer.com/will-machine-...,0.0,0.0,1,86.0,15,0,1.0,1,0,0
4,41,https://insights.blackcoffer.com/will-ai-repla...,"(0.0, 0.0)",https://insights.blackcoffer.com/will-ai-repla...,0.0,0.0,1,68.0,11,0,1.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,"(0.0, 0.0)",https://insights.blackcoffer.com/blockchain-fo...,0.0,0.0,1,57.0,9,0,1.0,1,0,0
110,147,https://insights.blackcoffer.com/the-future-of...,"(0.0, 0.0)",https://insights.blackcoffer.com/the-future-of...,0.0,0.0,1,57.0,11,0,1.0,1,0,0
111,148,https://insights.blackcoffer.com/big-data-anal...,"(0.0, 0.0)",https://insights.blackcoffer.com/big-data-anal...,0.0,0.0,1,66.0,15,0,1.0,1,0,0
112,149,https://insights.blackcoffer.com/business-anal...,"(0.0, 0.0)",https://insights.blackcoffer.com/business-anal...,0.0,0.0,1,79.0,18,0,1.0,1,0,0


In [47]:
import pandas as pd

# Read the output.csv file
df = pd.read_csv('output.csv')

# Export to Excel format
df.to_excel('output.xlsx', index=False)

In [48]:
import pandas as pd
from IPython.display import FileLink

# Read the output.csv file
df = pd.read_csv('output.csv')

# Export to Excel format
df.to_excel('output.xlsx', index=False)

# Download the file
FileLink('output.xlsx')