In [1]:
import nltk
import gensim
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from collections import Counter


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
#File upload
from google.colab import files
files.upload()

Saving Aurora.txt to Aurora.txt
Saving ML.txt to ML.txt


 'ML.txt': b'Article 1: Introduction to Machine Learning\r\n\r\nMachine learning is a subfield of artificial intelligence that focuses on the development of algorithms and models that enable computers to learn and make predictions or decisions without explicit programming. It is based on the idea that machines can learn from data and improve their performance over time.\r\n\r\nSupervised learning is a common approach in machine learning, where models are trained on labeled data. The model learns the patterns and relationships between the input features and the corresponding output labels. Classification and regression are two types of supervised learning tasks. In classification, the goal is to predict the class or category of an input, while in regression, the goal is to predict a continuous value.\r\n\r\nUnsupervised learning, on the other hand, deals with unlabeled data. The model learns to identify patterns, structures, or relationships in the data without any prior knowledge of th

In [4]:
# Read the file
with open("ML.txt", "r") as file:
    contents = file.readlines()

In [5]:
contents

['Article 1: Introduction to Machine Learning\n',
 '\n',
 'Machine learning is a subfield of artificial intelligence that focuses on the development of algorithms and models that enable computers to learn and make predictions or decisions without explicit programming. It is based on the idea that machines can learn from data and improve their performance over time.\n',
 '\n',
 'Supervised learning is a common approach in machine learning, where models are trained on labeled data. The model learns the patterns and relationships between the input features and the corresponding output labels. Classification and regression are two types of supervised learning tasks. In classification, the goal is to predict the class or category of an input, while in regression, the goal is to predict a continuous value.\n',
 '\n',
 'Unsupervised learning, on the other hand, deals with unlabeled data. The model learns to identify patterns, structures, or relationships in the data without any prior knowledg

In [6]:
# Preprocess texts
def preprocess(text):
    # tokenize, remove punctuation and stopwords, lemmatize
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    stopwords = nltk.corpus.stopwords.words("english")
    tokens = [token for token in tokens if token not in stopwords]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

# Preprocess the contents
preprocessed_contents = [preprocess(content) for content in contents]

In [7]:
preprocessed_contents

['article introduction machine learning',
 '',
 'machine learning subfield artificial intelligence focus development algorithm model enable computer learn make prediction decision without explicit programming based idea machine learn data improve performance time',
 '',
 'supervised learning common approach machine learning model trained labeled data model learns pattern relationship input feature corresponding output label classification regression two type supervised learning task classification goal predict class category input regression goal predict continuous value',
 '',
 'unsupervised learning hand deal unlabeled data model learns identify pattern structure relationship data without prior knowledge output label clustering dimensionality reduction popular unsupervised learning technique clustering group similar data point together dimensionality reduction reduces number input feature preserving important information',
 '',
 'deep learning subset machine learning focus using arti

In [8]:
# Split the data into training, testing, and validation sets
X_train, X_test = train_test_split(preprocessed_contents, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=42)

In [9]:
# Create TF-IDF vectorizer and extract features
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(X_train)

In [10]:
features.shape

(8, 131)

In [11]:
# Define a global cache dictionary
cache = {}

def calculate_cosine_similarity(query_features, article_features):
    # Convert the sparse matrix to a hashable key
    query_key = tuple(zip(query_features.nonzero()[0], query_features.data))

    # Check if the cosine similarity is already cached
    if query_key in cache:
        similarities = cache[query_key]
    else:
        # Calculate the cosine similarity and store it in the cache
        similarities = cosine_similarity(query_features, article_features)[0]
        cache[query_key] = similarities
    return similarities

In [12]:
# Define a function to extract hot keywords
def extract_hot_keywords(query, features, terms, topn=10):
    # Vectorize the query
    query_features = vectorizer.transform([preprocess(query)])

    # Calculate or retrieve the cached cosine similarity between query and article features
    similarities = calculate_cosine_similarity(query_features, features)

    # Get the indices of the top-n most similar articles
    top_indices = similarities.argsort()[::-1][:topn]

    # Get the hot keywords from the top articles
    keywords = []
    for index in top_indices:
        article_keywords = extract_keywords_from_article(X_train[index], terms)
        keywords.extend(article_keywords)

    # Sort the keywords by their frequency
    keyword_freq = nltk.FreqDist(keywords)
    sorted_keywords = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)

    return sorted_keywords[:topn]

In [13]:
# Define a function to extract keywords from an article
def extract_keywords_from_article(article, terms):
    tokens = nltk.word_tokenize(article)
    keywords = [token for token in tokens if token in terms]
    return keywords

In [14]:
# Example usage of extract_hot_keywords
query = "machine learning"
terms = ["machine", "learning", "data", "algorithm"]  # Replace with your relevant terms
top_keywords = extract_hot_keywords(query, features, terms, topn=10)
print("Top Keywords:")
for keyword, frequency in top_keywords:
    print(keyword, ":", frequency)

Top Keywords:
learning : 12
machine : 6
data : 6
algorithm : 1


##Try feature technique word embedding

In [15]:
# Train the Word2Vec model
model = gensim.models.Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

def calculate_cosine_similarity(query_tokens, article_tokens):
    # Calculate the average word embeddings for the query and article
    query_embedding = np.mean([model.wv[token] for token in query_tokens if token in model.wv], axis=0).flatten()
    article_embedding = np.mean([model.wv[token] for token in article_tokens if token in model.wv], axis=0).flatten()

    # Calculate the cosine similarity
    similarity = 1 - cosine(query_embedding, article_embedding)
    return similarity

# Define a function to extract hot keywords
def extract_hot_keywords(query, features, terms, topn=10):
    # Preprocess the query
    query_tokens = preprocess(query)

    # Calculate the similarity between query and articles
    similarities = [calculate_cosine_similarity(query_tokens, article_tokens) for article_tokens in X_train]

    # Get the indices of the top-n most similar articles
    top_indices = np.argsort(similarities)[::-1][:topn]

    # Get the hot keywords from the top articles
    keywords = []
    for index in top_indices:
        article_keywords = extract_keywords_from_article(X_train[index])
        keywords.extend(article_keywords)

    # Filter the keywords based on the specified terms
    filtered_keywords = [keyword for keyword in keywords if keyword in terms]

    # Count the frequency of each keyword
    keyword_freq = Counter(filtered_keywords)

    # Sort the keywords by their frequency
    sorted_keywords = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)

    return sorted_keywords[:topn]

# Define a function to extract keywords from an article
def extract_keywords_from_article(article_tokens):
    keywords = [token for token in article_tokens if token in model.wv]
    return keywords

# Example usage of extract_hot_keywords
query = "machine learning"
terms = ["machine", "learning", "data", "algorithm"]  # Replace with your relevant terms
top_keywords = extract_hot_keywords(query, X_train, terms, topn=10)
print("Top Keywords:")
for keyword, frequency in top_keywords:
    print(keyword, ":", frequency)



Top Keywords:


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Compare the Feature techniques by Average_precision metric

In [26]:
from sklearn.metrics import average_precision_score

# Extract top keywords using the first approach (TF-IDF)
query_tfidf = "machine learning"
terms_tfidf = ["machine", "learning", "data", "algorithm"]  # Replace with your relevant terms
top_keywords_tfidf = extract_hot_keywords(query_tfidf, features, terms_tfidf, topn=10)

# Extract top keywords using the second approach (Word2Vec)
query_w2v = "machine learning"
terms_w2v = ["machine", "learning", "data", "algorithm"]  # Replace with your relevant terms
top_keywords_w2v = extract_hot_keywords(query_w2v, X_train, terms_w2v, topn=10)


In [21]:
# Extract the list of true relevant keywords
true_keywords = set(keyword for keyword, _ in top_keywords_tfidf)

# Calculate the average precision scores for each technique
scores_tfidf = [1 if keyword in true_keywords else 0 for keyword, _ in top_keywords_tfidf]
if not scores_tfidf:
    average_precision_tfidf = 0.0
else:
    average_precision_tfidf = average_precision_score(scores_tfidf, range(1, len(scores_tfidf) + 1))

scores_w2v = [1 if keyword in true_keywords else 0 for keyword, _ in top_keywords_w2v]
if not scores_w2v:
    average_precision_w2v = 0.0
else:
    average_precision_w2v = average_precision_score(scores_w2v, range(1, len(scores_w2v) + 1))


# Determine which technique performed better
if average_precision_tfidf > average_precision_w2v:
    print("TF-IDF performed better.")
else:
    print("Word2Vec performed better.")

Word2Vec performed better.


#Task 2
##Hot Keyword Extraction using spaCy

In [19]:
#File upload
from google.colab import files
files.upload()

Saving Aurora.txt to Aurora.txt




In [20]:
# Read the file
with open("Aurora.txt", "r") as file:
    contents = file.read()


In [21]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")


In [22]:
def extract_keywords(article_texts):
    # Initialize an empty dictionary to store keyword frequencies
    keywords = {}

    # Iterate over each article text
    for article_text in article_texts:
        # Process the article text with spaCy
        doc = nlp(article_text)

        # Iterate over each tok en in the document
        for token in doc:
            # Consider only relevant tokens (e.g., nouns, adjectives, and verbs)
            if token.pos_ in ["NOUN", "ADJ", "VERB"]:
                # Convert the token to lowercase
                keyword = token.lemma_.lower()

                # Exclude stopwords and punctuation marks
                if not token.is_stop and not token.is_punct:
                    # Update the keyword frequency dictionary
                    if keyword in keywords:
                        keywords[keyword] += 1
                    else:
                        keywords[keyword] = 1

    # Sort the keywords by frequency in descending order
    sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)

    return sorted_keywords

In [23]:
# Example usage with a single article text
keywords = extract_keywords([contents])


In [24]:
keywords

[('aurora', 37),
 ('auroral', 15),
 ('light', 14),
 ('sun', 14),
 ('space', 14),
 ('say', 12),
 ('solar', 12),
 ('night', 10),
 ('activity', 10),
 ('people', 9),
 ('magnetic', 9),
 ('year', 8),
 ('particle', 7),
 ('mile', 7),
 ('come', 7),
 ('sky', 6),
 ('day', 6),
 ('know', 6),
 ('satellite', 6),
 ('line', 6),
 ('magnetosphere', 6),
 ('northern', 6),
 ('science', 5),
 ('atmosphere', 5),
 ('earth', 5),
 ('power', 5),
 ('huge', 5),
 ('wind', 5),
 ('energy', 5),
 ('field', 5),
 ('region', 5),
 ('spirit', 5),
 ('change', 4),
 ('intense', 4),
 ('picture', 4),
 ('shape', 4),
 ('physicist', 4),
 ('current', 4),
 ('man', 4),
 ('research', 4),
 ('work', 4),
 ('system', 4),
 ('scientist', 4),
 ('high', 4),
 ('time', 4),
 ('link', 4),
 ('study', 4),
 ('sunspot', 4),
 ('produce', 4),
 ('happen', 4),
 ('japanese', 4),
 ('point', 3),
 ('place', 3),
 ('planet', 3),
 ('peak', 3),
 ('charge', 3),
 ('make', 3),
 ('glow', 3),
 ('upper', 3),
 ('display', 3),
 ('good', 3),
 ('late', 3),
 ('information', 3

In [25]:
# Print the top 5 keywords with their frequencies
for keyword, frequency in keywords[:5]:
    print(f"{keyword}: {frequency}")

aurora: 37
auroral: 15
light: 14
sun: 14
space: 14
