## Data Prepare and Clean

In [75]:
import pandas as pd
import re

# Load the XLSX file
file_path = 'C:/Users/yufei/Programming/DeepTarget/web_scrape/scraped_data.xlsx'
data = pd.read_excel(file_path)

data.head()

Unnamed: 0,Title,URL,Summary Content,Main Content
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...","If you have cancer, a clinical trial may be an..."
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...","When your child is diagnosed with cancer, one ..."
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,Herbal remedies are plants used like a medicin...
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,A1C is a lab test that shows the average level...
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,Aarskog syndrome is a very rare disease that a...


In [76]:
from bs4 import BeautifulSoup

# Remove any rows with null values in the 'Cleaned Main Content' column
data = data.dropna(subset=['Main Content'])

# Define a function to clean the 'Main Content' column
def clean_text(text):
    # Remove HTML tags if present
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove extra spaces, tabs, and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the 'Main Content' column
data['Cleaned Main Content'] = data['Main Content'].apply(clean_text)

data.head()

Unnamed: 0,Title,URL,Summary Content,Main Content,Cleaned Main Content
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...","If you have cancer, a clinical trial may be an...","If you have cancer, a clinical trial may be an..."
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...","When your child is diagnosed with cancer, one ...","When your child is diagnosed with cancer, one ..."
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,Herbal remedies are plants used like a medicin...,Herbal remedies are plants used like a medicin...
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,A1C is a lab test that shows the average level...,A1C is a lab test that shows the average level...
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,Aarskog syndrome is a very rare disease that a...,Aarskog syndrome is a very rare disease that a...


In [88]:
# Export the cleaned data to an Excel file
# file_path = "C:/Users/yufei/Downloads/cleaned_data.xlsx"
# data.to_excel(file_path, index=False)

In [77]:
filtered_data = data[['Title','URL','Cleaned Main Content']]

## Word Count

In [78]:
# Define a function to count the number of words in each summary
def word_count(text):
    return len(str(text).split())

# Apply the word count function to the "Summary Content" column
filtered_data['Word Count'] = filtered_data['Cleaned Main Content'].apply(word_count)

filtered_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Word Count'] = filtered_data['Cleaned Main Content'].apply(word_count)


Unnamed: 0,Title,URL,Cleaned Main Content,Word Count
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...",958
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...",1049
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,837
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,773
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,594


## Count the number of sentences

In [79]:
# Define a function to count the number of sentences in each summary
def sentence_count(text):
    return len(str(text).split('. '))

# Apply the sentence count function to the "Summary Content" column
filtered_data['Sentence Count'] = filtered_data['Cleaned Main Content'].apply(sentence_count)

filtered_data.head()

Unnamed: 0,Title,URL,Cleaned Main Content,Word Count,Sentence Count
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...",958,44
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...",1049,24
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,837,50
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,773,32
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,594,25


## Count the number of nounces

In [80]:

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Tokenize and count nouns
def noun_count(text):
    words = word_tokenize(str(text))
    pos_tags = pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
    return len(nouns)

filtered_data['Noun Count'] = filtered_data['Cleaned Main Content'].apply(noun_count)

filtered_data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yufei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yufei\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,Title,URL,Cleaned Main Content,Word Count,Sentence Count,Noun Count
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...",958,44,294
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...",1049,24,363
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,837,50,290
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,773,32,303
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,594,25,306


## Sentiment Analysis

In [81]:

from textblob import TextBlob

# Define a function to get the sentiment polarity
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Apply the sentiment function to the "Summary Content" column
filtered_data['Sentiment'] = filtered_data['Cleaned Main Content'].apply(get_sentiment)

# Show the dataframe with sentiment scores
filtered_data.head()


Unnamed: 0,Title,URL,Cleaned Main Content,Word Count,Sentence Count,Noun Count,Sentiment
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...",958,44,294,0.196665
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...",1049,24,363,0.067192
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,837,50,290,0.156495
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,773,32,303,0.040591
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,594,25,306,0.087954


- -1 indicates Very Negative
- -0.5 indicates Negative
- 0 indicates Neutral
- 0.5 indicates Positive
- 1 indicates Very Positive

## Readability Score - Flesch-Kincaid readability score

In [82]:
import textstat

# Define a function to calculate the Flesch-Kincaid readability score
def calculate_readability(text):
    return textstat.flesch_reading_ease(text)

# Apply the readability function to the "Summary Content" column
filtered_data['Readability'] = filtered_data['Cleaned Main Content'].apply(calculate_readability)

# Display the first few rows with the readability scores
filtered_data.head()


Unnamed: 0,Title,URL,Cleaned Main Content,Word Count,Sentence Count,Noun Count,Sentiment,Readability
0,A guide to clinical trials for cancer,https://medlineplus.gov/ency/patientinstructio...,"If you have cancer, a clinical trial may be an...",958,44,294,0.196665,66.64
1,A guide to help children understand cancer,https://medlineplus.gov/ency/patientinstructio...,"When your child is diagnosed with cancer, one ...",1049,24,363,0.067192,71.75
2,A guide to herbal remedies,https://medlineplus.gov/ency/patientinstructio...,Herbal remedies are plants used like a medicin...,837,50,290,0.156495,70.19
3,A1C test,https://medlineplus.gov/ency/article/003640.htm,A1C is a lab test that shows the average level...,773,32,303,0.040591,66.03
4,Aarskog syndrome,https://medlineplus.gov/ency/article/001654.htm,Aarskog syndrome is a very rare disease that a...,594,25,306,0.087954,45.86


- 90-100: Very easy to read
- 60-90: Easily understood by 13-15-year-olds
- 30-60: College-level text
- 30-: Very difficult to read

## Aggregated Statistics

In [83]:
# Count the Mean
aggregation_results = filtered_data[['Word Count', 'Sentence Count', 'Noun Count', 'Sentiment', 'Readability']].agg(
    ['mean']
)

# Display the aggregated results
aggregation_results

Unnamed: 0,Word Count,Sentence Count,Noun Count,Sentiment,Readability
mean,750.153061,32.112245,322.132653,0.069072,58.153265


In [84]:
# Round the results
aggregation_results['Word Count'] = aggregation_results['Word Count'].round(0)
aggregation_results['Sentence Count'] = aggregation_results['Sentence Count'].round(0)
aggregation_results['Noun Count'] = aggregation_results['Noun Count'].round(0)
aggregation_results['Sentiment'] = aggregation_results['Sentiment'].round(4)
aggregation_results['Readability'] = aggregation_results['Readability'].round(4)

aggregation_results.head()

Unnamed: 0,Word Count,Sentence Count,Noun Count,Sentiment,Readability
mean,750.0,32.0,322.0,0.0691,58.1533
