Data Collection:

In [None]:
import os
import time

import pandas as pd
import requests

# Read the Guardian API key from the environment instead of committing it
# with the notebook. Export GUARDIAN_API_KEY before running this cell.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
API_KEY = os.environ.get('GUARDIAN_API_KEY', '')
if not API_KEY:
    # Warn rather than raise so the rest of the notebook can still be run
    # against an already-saved CSV.
    print('Warning: GUARDIAN_API_KEY is not set; Guardian API requests will likely be rejected')
BASE_URL = 'https://content.guardianapis.com/search'

# Query-string parameters shared by every search request. The page number is
# supplied per request by fetch_titles.
params = {
    'api-key': API_KEY,
    'from-date': '1989-01-01',
    'to-date': '2024-06-30',
    'page-size': 50,
    'show-fields': 'headline',
    'order-by': 'oldest',
}

def fetch_titles(page, timeout=10):
    """Fetch one page of search results from the Guardian content API.

    Args:
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        The list found under ``response -> results`` in the JSON payload, or
        an empty list when the request fails or the status code is not 200.
    """
    # Build a per-call copy so the shared module-level `params` dict is not
    # mutated as a side effect of fetching a page.
    query = {**params, 'page': page}
    try:
        response = requests.get(BASE_URL, params=query, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors so the
        # caller's "empty list means stop" contract still holds.
        print(f'Failed to fetch page {page}: {exc}')
        return []

    if response.status_code != 200:
        print(f'Failed to fetch page {page}: {response.status_code}')
        return []

    return response.json()['response']['results']

# Walk the result pages in order, collecting each headline, until a page
# comes back empty (either no more results or a failed request).
titles = []
page = 1
results = fetch_titles(page)
while results:
    titles.extend(item['fields']['headline'] for item in results)
    print(f'Fetched page {page}')
    page += 1
    time.sleep(0.1)  # brief pause between requests to stay under rate limits
    results = fetch_titles(page)

# Write the collected headlines to disk as a single-column CSV (no index column)
df = pd.DataFrame({'Title': titles})
df.to_csv('guardian_titles.csv', index=False)
print('Saved titles to guardian_titles.csv')


In [12]:

# Load the saved headlines from the CSV file.
# Read 'Title' as plain text and disable pandas' default NA parsing: otherwise
# blank cells and headlines such as "NA" or "null" become float NaN, which
# breaks the string-based preprocessing applied to this column later on.
data = pd.read_csv('guardian_titles.csv', dtype={'Title': str}, keep_default_na=False)





Data Preprocessing:

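Here we remove common stopwords such as "the" and "a". While the presence of these words might itself say something about how successful an article title is, we are looking for the other keywords and similarities that titles share. We also reduce words to a common base form through lemmatization. Note that the lemmatizer is called without part-of-speech tags, so it mainly normalizes nouns (e.g. plurals); verb tenses are largely left unchanged.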

In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared helpers for preprocess_title: a WordNet lemmatizer and a set of
# English stopwords (a set gives fast membership checks).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_title(title):
    """Lowercase, tokenize, drop English stopwords and lemmatize a title.

    Note: a later cell in this notebook redefines ``preprocess_title`` to
    return a list of tokens instead of a string.

    Args:
        title: Raw headline text. Missing values (``None`` or NaN, which
            pandas produces for blank CSV cells) are treated as an empty
            title; other non-string values are converted with ``str``.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or an empty
        string for a missing title.
    """
    if not isinstance(title, str):
        # Guard against NaN/None so `.lower()` is never called on a float.
        if pd.isna(title):
            return ''
        title = str(title)

    tokens = word_tokenize(title.lower())
    filtered_tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(filtered_tokens)

# Run the preprocessing over every headline and keep the result alongside the original
data['processed_title'] = data['Title'].map(preprocess_title)


Feature Extraction:

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Placeholder titles for demonstration (swap in your real data here).
# NOTE: this rebinds `data`, replacing any DataFrame assigned to that name
# earlier in the notebook. The last two entries are an empty string and a
# whitespace-only string.
sample_titles = [
    "This is a test title",
    "Another example title",
    "More data to process",
    "",
    "   ",
]
data = pd.DataFrame(sample_titles, columns=['Title'])

# Preprocessing helpers: WordNet lemmatizer plus a set of English stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_title(title):
    """Tokenize a title into lowercase, stopword-free, lemmatized tokens.

    Unlike a string-returning variant, this version returns the tokens as a
    list; callers in this notebook join them with spaces where a single
    string is needed.

    Args:
        title: Raw headline text. Missing values (``None`` or NaN) yield no
            tokens; other non-string values are converted with ``str``.

    Returns:
        A list of lemmatized tokens with English stopwords removed. The list
        is empty for a missing title.
    """
    if not isinstance(title, str):
        # Guard against NaN/None so `.lower()` is never called on a float.
        if pd.isna(title):
            return []
        title = str(title)

    tokens = word_tokenize(title.lower())
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

# Tokenize every title in `data`, then rebuild one space-separated string per row
data['processed_title'] = data['Title'].apply(preprocess_title)
data['processed_title_str'] = data['processed_title'].apply(' '.join)

# Keep only the rows that still contain text after preprocessing
has_text = data['processed_title_str'].str.strip().ne('')
data = data[has_text]

# Fit unigram + bigram TF-IDF features, provided something is left to fit on
if data.empty:
    print("No valid titles to process")
else:
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(data['processed_title_str'])
    print("TF-IDF matrix created successfully")


TF-IDF matrix created successfully


Model Training:

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_similarity(input_title):
    """Return the highest cosine similarity between a title and the corpus.

    Relies on the module-level ``vectorizer`` and ``tfidf_matrix`` created
    in the feature-extraction cell.

    Args:
        input_title: Raw title text to compare against the fitted corpus.

    Returns:
        The maximum cosine similarity between the input's TF-IDF vector and
        any row of ``tfidf_matrix``.
    """
    # preprocess_title yields tokens; the vectorizer expects one string per document.
    preprocessed_input = ' '.join(preprocess_title(input_title))

    # Project the input into the TF-IDF space learned from the corpus.
    input_vector = vectorizer.transform([preprocessed_input])

    # One row of similarities: the input against every corpus title.
    similarities = cosine_similarity(input_vector, tfidf_matrix)
    return np.max(similarities)


In [28]:
def similarity_score(input_title):
    """Express the best corpus match for ``input_title`` as a percentage-style score.

    Multiplies the value returned by ``compute_similarity`` by 100.
    """
    return compute_similarity(input_title) * 100


Example usage of the `similarity_score` function:

In [50]:
input_title = "president"
# Compute the score in this cell; previously `score` was never assigned here,
# so the print relied on a value left over from earlier kernel state.
score = similarity_score(input_title)
print(f"Similarity Score: {score}")


Similarity Score: 0.0
