In [2]:
import pandas as pd

# Define the file path
file_path = 'data.csv'

# Load the dataset into a pandas DataFrame
try:
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}")
    print("Please make sure 'data.csv' is in the same folder as your notebook.")

# Display the first 5 rows of the DataFrame
print("--- First 5 rows of the data ---")
print(df.head())

# Get a concise summary of the DataFrame
print("\n--- DataFrame Info ---")
df.info()

File loaded successfully!
--- First 5 rows of the data ---
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                        html_content  
0  <!doctype html><!--[if lt IE 7]> <html class="...  
1  <!doctype html><html lang="en"><head>\n    <me...  
2  <!DOCTYPE html><html data-unhead-vue-server-re...  
3  \n\n<!DOCTYPE html>\n<html lang="en" dir="ltr"...  
4                                                NaN  

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           81 non-null     object
 1   html_content  69 

In [3]:
from bs4 import BeautifulSoup

def parse_html(html_content):
    """
    Parses HTML content to extract clean, readable text.
    Handles potential errors by returning an empty string.
    """
    # Check if content is a string, return empty if not (handles non-string/NaN values)
    if not isinstance(html_content, str):
        return ""
    try:
        # Use the lxml parser for speed and efficiency
        soup = BeautifulSoup(html_content, 'lxml')

        # Remove script and style elements as they don't contain readable content
        for script_or_style in soup(["script", "style"]):
            script_or_style.decompose()

        # Get text, using a space as a separator and stripping extra whitespace
        text = soup.get_text(separator=' ', strip=True)
        return text
    except Exception as e:
        # If any other error occurs during parsing, return an empty string
        return ""

In [4]:
# Apply the function to the 'html_content' column
df['cleaned_text'] = df['html_content'].apply(parse_html)

# Display the first 5 rows of the new column to see the result
print("--- Cleaned Text from HTML ---")
print(df[['url', 'cleaned_text']].head())

--- Cleaned Text from HTML ---
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                        cleaned_text  
0  Cyber Security Blog <img  height="1" width="1"...  
1  Top 10 Cybersecurity Awareness Tips: How to St...  
2  11 Cyber Defense Tips to Stay Secure at Work a...  
3  Cybersecurity Best Practices | Cybersecurity a...  
4                                                     


In [5]:
import sys

print(f"Checking libraries in this Python environment: {sys.executable}\n")

libraries_to_check = {
    "pandas": "pandas",
    "scikit-learn": "sklearn",
    "nltk": "nltk",
    "BeautifulSoup": "bs4",
    "lxml": "lxml",
    "textstat": "textstat",
    "tqdm": "tqdm",
    "streamlit": "streamlit"
}

all_installed = True

for install_name, import_name in libraries_to_check.items():
    try:
        __import__(import_name)
        print(f"✅ {install_name.ljust(15)} ... INSTALLED")
    except ImportError:
        print(f"❌ {install_name.ljust(15)} ... NOT INSTALLED (run: pip install {install_name})")
        all_installed = False

print("\n" + "="*40)
if all_installed:
    print("✅ All required libraries are installed. You are ready to go!")
else:
    print("❌ Some libraries are missing. Please install them using the commands above.")

Checking libraries in this Python environment: c:\Users\91854\Documents\leadwalnut\venv\Scripts\python.exe

✅ pandas          ... INSTALLED
✅ scikit-learn    ... INSTALLED
✅ nltk            ... INSTALLED
✅ BeautifulSoup   ... INSTALLED
✅ lxml            ... INSTALLED
✅ textstat        ... INSTALLED
✅ tqdm            ... INSTALLED
❌ streamlit       ... NOT INSTALLED (run: pip install streamlit)

❌ Some libraries are missing. Please install them using the commands above.


In [7]:
import re
from tqdm import tqdm

print("Using a custom, NLTK-free preprocessing function to fix the error.")

def preprocess_text_simple(text):
    """
    A simple, NLTK-free text preprocessor that cannot fail.
    """
    if not isinstance(text, str):
        return []

    # A short list of common stop words
    stop_words = set(['the', 'a', 'an', 'in', 'is', 'it', 'and', 'of', 'for', 'to', 'was', 'were', 'on', 'at', 'with', 'by'])

    # 1. Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())

    # 2. Split text into words
    tokens = text.split()

    # 3. Remove stop words
    processed_tokens = [word for word in tokens if word not in stop_words]
    
    return processed_tokens

# --- Use a more stable apply method to run our new function ---
tqdm.pandas(desc="Processing Text")
df['processed_tokens'] = df['cleaned_text'].progress_apply(preprocess_text_simple)

print("\nPREPROCESSING IS FINALLY COMPLETE!")

# --- Display the Result ---
print("\n--- Text After Preprocessing ---")
print(df[['cleaned_text', 'processed_tokens']].head())

Using a custom, NLTK-free preprocessing function to fix the error.


Processing Text: 100%|██████████| 81/81 [00:00<00:00, 745.93it/s]


PREPROCESSING IS FINALLY COMPLETE!

--- Text After Preprocessing ---
                                        cleaned_text  \
0  Cyber Security Blog <img  height="1" width="1"...   
1  Top 10 Cybersecurity Awareness Tips: How to St...   
2  11 Cyber Defense Tips to Stay Secure at Work a...   
3  Cybersecurity Best Practices | Cybersecurity a...   
4                                                      

                                    processed_tokens  
0  [cyber, security, blog, img, height1, width1, ...  
1  [top, 10, cybersecurity, awareness, tips, how,...  
2  [11, cyber, defense, tips, stay, secure, work,...  
3  [cybersecurity, best, practices, cybersecurity...  
4                                                 []  





In [8]:
def calculate_text_stats(cleaned_text, tokens):
    """
    Calculates basic text statistics from the cleaned text and tokens.
    """
    word_count = len(tokens)
    char_count = len(cleaned_text)

    # Avoid division by zero if there are no words
    if word_count > 0:
        # Calculate the average word length from the tokens themselves
        avg_word_length = sum(len(word) for word in tokens) / word_count
    else:
        avg_word_length = 0

    return word_count, char_count, avg_word_length

In [9]:
# Apply the function across the rows of the DataFrame
df[['word_count', 'char_count', 'avg_word_length']] = df.apply(
    lambda row: calculate_text_stats(row['cleaned_text'], row['processed_tokens']),
    axis=1,
    result_type='expand'
)

# Display the new feature columns
print("--- Basic Text Features ---")
print(df[['url', 'word_count', 'char_count', 'avg_word_length']].head())

--- Basic Text Features ---
                                                 url  word_count  char_count  \
0     https://www.cm-alliance.com/cybersecurity-blog      2143.0     17417.0   
1    https://www.varonis.com/blog/cybersecurity-tips      2006.0     16436.0   
2  https://www.cisecurity.org/insights/blog/11-cy...      1227.0      9775.0   
3  https://www.cisa.gov/topics/cybersecurity-best...       954.0      8625.0   
4  https://www.qnbtrust.bank/Resources/Learning-C...         0.0         0.0   

   avg_word_length  
0         6.183854  
1         6.194417  
2         5.999185  
3         7.060797  
4         0.000000  


In [10]:
import textstat

def calculate_readability(text):
    """
    Calculates the Flesch Reading Ease score.
    Handles potential errors (e.g., for empty text) by returning 0.
    """
    try:
        # The text must have a minimum number of words for textstat to work
        if len(text.split()) > 100:
            return textstat.flesch_reading_ease(text)
        else:
            return 0
    except:
        return 0

In [11]:
# Apply the readability function
df['readability_score'] = df['cleaned_text'].apply(calculate_readability)

# Display the new feature along with the word count
print("--- Readability Scores ---")
print(df[['url', 'word_count', 'readability_score']].head())

--- Readability Scores ---
                                                 url  word_count  \
0     https://www.cm-alliance.com/cybersecurity-blog      2143.0   
1    https://www.varonis.com/blog/cybersecurity-tips      2006.0   
2  https://www.cisecurity.org/insights/blog/11-cy...      1227.0   
3  https://www.cisa.gov/topics/cybersecurity-best...       954.0   
4  https://www.qnbtrust.bank/Resources/Learning-C...         0.0   

   readability_score  
0          28.674270  
1          32.694481  
2          36.281772  
3          -0.732078  
4           0.000000  


In [12]:
# Join the list of tokens into a single string with spaces in between
df['processed_text'] = df['processed_tokens'].apply(' '.join)

# Display the new column to see the result
print("--- Joined Tokens for Vectorization ---")
print(df[['processed_text']].head())

--- Joined Tokens for Vectorization ---
                                      processed_text
0  cyber security blog img height1 width1 styledi...
1  top 10 cybersecurity awareness tips how stay s...
2  11 cyber defense tips stay secure work home yo...
3  cybersecurity best practices cybersecurity inf...
4                                                   


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer. We'll limit it to the 5000 most frequent words
# to keep the matrix size manageable and focus on important terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vectorizer to your text and transform the data into a matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

# Print the shape of the matrix to see what you've created
print(f"\nShape of the TF-IDF matrix: {tfidf_matrix.shape}")


Shape of the TF-IDF matrix: (81, 5000)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity between all document vectors
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Print the shape to confirm it's a square matrix (81x81)
print(f"Shape of the Cosine Similarity Matrix: {cosine_sim_matrix.shape}")

Shape of the Cosine Similarity Matrix: (81, 81)


In [15]:
import numpy as np

def find_similar_pairs(similarity_matrix, threshold=0.90):
    """
    Finds pairs of documents with a similarity score above a given threshold.
    """
    # We only need to check the upper triangle of the matrix to avoid duplicates
    upper_triangle = np.triu(similarity_matrix, k=1)
    
    # Find the indices (row, col) where similarity is above the threshold
    rows, cols = np.where(upper_triangle > threshold)
    
    similar_pairs = []
    for doc1_index, doc2_index in zip(rows, cols):
        similarity_score = similarity_matrix[doc1_index, doc2_index]
        similar_pairs.append((doc1_index, doc2_index, similarity_score))
        
    return similar_pairs

# Find pairs with similarity > 90%
similar_document_pairs = find_similar_pairs(cosine_sim_matrix, threshold=0.90)

# --- Display the results ---
print(f"\nFound {len(similar_document_pairs)} pairs with > 90% similarity.")
print("--- Highly Similar Document Pairs ---")

if not similar_document_pairs:
    print("No highly similar pairs found.")
else:
    for pair in similar_document_pairs:
        doc1_url = df.iloc[pair[0]]['url']
        doc2_url = df.iloc[pair[1]]['url']
        score = pair[2]
        print(f"\nURL 1: {doc1_url}")
        print(f"URL 2: {doc2_url}")
        print(f"Similarity Score: {score:.4f}")


Found 0 pairs with > 90% similarity.
--- Highly Similar Document Pairs ---
No highly similar pairs found.


In [16]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Scale the features to be between 0 and 1
df['scaled_readability'] = scaler.fit_transform(df[['readability_score']])
df['scaled_word_count'] = scaler.fit_transform(df[['word_count']])

# Create a composite score. We'll give readability a 60% weight and word count a 40% weight.
df['quality_score'] = (0.6 * df['scaled_readability']) + (0.4 * df['scaled_word_count'])

# Display the new scores
print("--- Composite Quality Scores ---")
print(df[['url', 'quality_score']].head())

--- Composite Quality Scores ---
                                                 url  quality_score
0     https://www.cm-alliance.com/cybersecurity-blog       0.230479
1    https://www.varonis.com/blog/cybersecurity-tips       0.251596
2  https://www.cisecurity.org/insights/blog/11-cy...       0.260940
3  https://www.cisa.gov/topics/cybersecurity-best...       0.044325
4  https://www.qnbtrust.bank/Resources/Learning-C...       0.034733


In [17]:
# Calculate the median (the middle value) of your quality scores
median_quality_score = df['quality_score'].median()

# Create the binary label: 1 if the score is above the median, 0 otherwise
df['is_high_quality'] = (df['quality_score'] > median_quality_score).astype(int)

# Check the distribution of your new label to see how many high/low quality articles you have
print("\n--- Distribution of Quality Labels ---")
print(df['is_high_quality'].value_counts())


--- Distribution of Quality Labels ---
is_high_quality
0    41
1    40
Name: count, dtype: int64


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define the feature columns you want the model to learn from
feature_columns = ['word_count', 'char_count', 'avg_word_length', 'readability_score']

# Create your feature matrix (X) and target vector (y)
X = df[feature_columns]
y = df['is_high_quality']

# Split the data into a training set (80%) and a testing set (20%)
# The test set is held back to evaluate the model's performance on unseen data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (64, 4)
Testing data shape: (17, 4)


In [19]:
# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

print("Model training complete!")

Model training complete!


In [20]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the model's accuracy (how many predictions were correct)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Print a detailed classification report showing precision, recall, and f1-score
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.94

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.90      1.00      0.95         9
           1       1.00      0.88      0.93         8

    accuracy                           0.94        17
   macro avg       0.95      0.94      0.94        17
weighted avg       0.95      0.94      0.94        17



In [21]:
import joblib

# Define filenames for your model and vectorizer
model_filename = 'content_quality_model.joblib'
vectorizer_filename = 'tfidf_vectorizer.joblib'

# Save the trained model
joblib.dump(model, model_filename)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, vectorizer_filename)

print(f"Model saved to {model_filename}")
print(f"Vectorizer saved to {vectorizer_filename}")

Model saved to content_quality_model.joblib
Vectorizer saved to tfidf_vectorizer.joblib
