## Q-6 Stop-word Removal and Stemming
Write a program for pre-processing a text document, e.g. stop-word removal and stemming.

In [None]:
import io
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK resources used below (no-ops when they are already cached).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer models from the
# 'punkt_tab' package instead of 'punkt'. On older releases this download just
# reports that the package is unknown and returns False.
nltk.download('punkt_tab')

print(stopwords.words('english'))

sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(sent)

# converts the words in word_tokens to lower case and then checks whether they are present in stop_words or not
# (the stop-word list is all lower case, so a case-sensitive test would let
# capitalised stop words such as "This" slip through)
filtered_sent = [w for w in word_tokens if w.lower() not in stop_words]
print(word_tokens)
print(filtered_sent)

# Read the whole input document and split it into whitespace-separated words.
with open("input.txt") as f1:
    line = f1.read()
words = line.split()
print("Before Removing Stopwords:",words)
print("Length:",len(words))

# Drop the stop words. The comparison is done on the lower-cased word because
# the stop-word set is lower case; the original spelling is what gets written.
kept_words = [r for r in words if r.lower() not in stop_words]

# Write the result in a single pass. The file is opened in 'w' mode so that
# re-running the script does not pile the new words on top of a previous run.
with open('output.txt', 'w') as outfile:
    outfile.write("".join(" " + r for r in kept_words))

# Read the filtered text back to show the effect of the filtering.
with open("output.txt") as f2:
    line2 = f2.read()
words = line2.split()
print("After Removing Stopwords:",words)
print("Length:",len(words))

# Porter stemming demo: first on a hand-picked word family, then on the
# tokens of a full sentence.
ps = PorterStemmer()

sample = ['program', 'programmer', 'programming', 'programs', 'programmers']
for word in sample:
    stem = ps.stem(word)
    print(f"{word} : {stem}")

sent = "Programmers program with different programming languages"
words = word_tokenize(sent)
print(words)

# Show each token of the sentence next to its stem.
for token in words:
    stem = ps.stem(token)
    print(f"{token} : {stem}")

## Q-7 Inverted File
Implement a program for retrieval of documents using inverted files.

In [None]:
import string

doc1 = "The quick brown fox jumped over the lazy dog."
doc2 = "The lazy dog slept in the sun."


def tokenize_text(text):
    """Return the lower-cased words of *text* without surrounding punctuation.

    Splitting on whitespace alone would keep "dog." and "dog" as two different
    terms, so leading/trailing punctuation is stripped from every word and
    words that consist only of punctuation are dropped.
    """
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return [word for word in stripped if word]


# Convert each document to lowercase and split it into words
token1 = tokenize_text(doc1)
token2 = tokenize_text(doc2)
print("Token1:",token1)
print("Token2 :",token2)

# Combine the tokens into a list of unique terms (sorted so that the output is
# the same on every run)
terms = sorted(set(token1 + token2))
print("Terms :",terms)

# Map every term to the list of documents that contain it. The per-document
# token sets make each membership test O(1).
doc_tokens = {"Doc1": set(token1), "Doc2": set(token2)}
inverted_index = {}
for term in terms:
    inverted_index[term] = [name for name, tokens in doc_tokens.items() if term in tokens]
print("Inverted Index Dictionary :", inverted_index)

# Print inverted index as "<term> -> <documents>"
for term, doc in inverted_index.items():
    print(term, "->", ','.join(doc))


def retrieve(query):
    """Return the sorted names of the documents containing every term of *query*.

    The query is tokenized exactly like the documents. An empty query, or a
    query containing a term that is not in the index, matches no document.
    """
    query_terms = tokenize_text(query)
    if not query_terms:
        return []
    matches = set(inverted_index.get(query_terms[0], []))
    for term in query_terms[1:]:
        matches &= set(inverted_index.get(term, []))
    return sorted(matches)

## Q-8 Bayesian Network
Write a program to construct a Bayesian network considering medical data. Use this model to
demonstrate the diagnosis of heart patients using the standard Heart Disease Data Set. (You can use
Java/Python ML library classes/APIs.)

In [None]:
!pip install pgmpy==0.1.16

import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Load the heart-disease records.
df = pd.read_csv('heart.csv')
df.head()

# The dataset has some values recorded as '?'; turn them into NaN so they are
# treated as missing (they are replaced, not dropped).
df = df.replace('?', np.nan)

# Structure of the Bayesian network as directed (parent, child) edges.
edges = [
    ('age', 'trestbps'),
    ('age', 'fbs'),
    ('sex', 'trestbps'),
    ('exang', 'trestbps'),
    ('trestbps', 'heartdisease'),
    ('fbs', 'heartdisease'),
    ('heartdisease', 'restecg'),
    ('heartdisease', 'thalach'),
    ('heartdisease', 'chol'),
]
model = BayesianNetwork(edges)

# Estimate the conditional probability tables from the data.
print("Learning CPD using Maximum likelihood estimators")
model.fit(df, estimator=MaximumLikelihoodEstimator)

# Exact inference on the fitted network.
print('Inferencing with Bayesian Network:')
HeartDisease_infer = VariableElimination(model)

# Each entry is (heading to print, evidence to condition 'heartdisease' on).
queries = [
    ("1. Probability of HeartDisease given Age=35", {'age': 35}),
    ("2. Probability of HeartDisease given cholesterol=230", {'chol': 230}),
]
for heading, evidence in queries:
    print(heading)
    p = HeartDisease_infer.query(variables=['heartdisease'], evidence=evidence)
    print(p)

## Q-9 Agglomerative clustering
Implement Agglomerative hierarchical clustering algorithm using appropriate dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as shc

# Load the credit-card customer data and take a first look at it.
df = pd.read_csv('CC_GENERAL.csv')
df.head()

df.info()

# Dropping the CUST_ID column from the data
df = df.drop('CUST_ID', axis=1)

df.isnull().sum()

# Handling the missing values.
# The filled column is assigned back instead of calling
# df[col].fillna(..., inplace=True): that form operates on an intermediate
# Series, raises a FutureWarning in pandas 2.x and does not modify df once
# copy-on-write is enabled.
df['CREDIT_LIMIT'] = df['CREDIT_LIMIT'].fillna(df['CREDIT_LIMIT'].mean())
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].mode()[0])

df.isnull().sum()

# Scaling the data so that all the features become comparable
# (zero mean and unit variance per column)
sc = StandardScaler()
df_scaled = sc.fit_transform(df)

# Normalizing the data: every row (sample) is rescaled to unit L2 norm.
# This does not change the shape of the feature distributions.
df_normalized = normalize(df_scaled)

# Converting the numpy array into a pandas DataFrame
data_normalized = pd.DataFrame(df_normalized)

# Reducing the dimensionality of the data to two principal components
pca = PCA(n_components=2)
df_principal = pca.fit_transform(df_normalized)
df_principal = pd.DataFrame(df_principal)
df_principal.columns = ['P1', 'P2']

# Visualizing the hierarchy with a dendrogram.
# The tree shows the order in which Ward linkage merges the points into
# clusters, which helps when choosing how many clusters to cut it into.
plt.figure(figsize=(8, 8))
plt.title('Visualising the data')
ward_linkage = shc.linkage(df_principal, method='ward')
Dendrogram = shc.dendrogram(ward_linkage)

# Building, visualizing and evaluating the clustering models for different
# numbers of clusters.
k = [2, 3, 4, 5, 6]

# Each model is fitted exactly once; the resulting labels are reused for both
# the scatter plot and the silhouette score (fitting agglomerative clustering
# is the expensive step, so it should not be repeated).
models = {}
cluster_labels = {}
for n_clusters in k:
    models[n_clusters] = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels[n_clusters] = models[n_clusters].fit_predict(df_principal)

    plt.figure(figsize=(6, 6))
    plt.scatter(df_principal['P1'], df_principal['P2'], c=cluster_labels[n_clusters])
    plt.show()

# Individual model names kept for code that refers to them directly.
ac2, ac3, ac4, ac5, ac6 = (models[n_clusters] for n_clusters in k)

# Silhouette score of every model, in the same order as k.
silhouette_scores = [
    silhouette_score(df_principal, cluster_labels[n_clusters]) for n_clusters in k
]
print(silhouette_scores)

# Plotting a bar graph to compare the results
plt.bar(k, silhouette_scores)
plt.xlabel('Number of Clusters')
plt.ylabel('S(i)')
plt.show()

## Q-10 Page Rank
Implement the PageRank algorithm. (Use Python or Beautiful Soup for the implementation.)

In [None]:
def pagerank(G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-6, nstart=None, weight='weight', dangling=None):
    """Return the PageRank of every node of ``G``, computed by power iteration.

    Relies on the module-level ``import networkx as nx``.

    Args:
        G: A networkx graph. Undirected graphs are converted to directed ones.
        alpha: Damping factor.
        personalization: Optional dict node -> weight used as the teleport
            distribution; it must contain every node of ``G``. It is
            normalized to sum to 1. Uniform when omitted.
        max_iter: Maximum number of power-iteration steps.
        tol: Per-node error tolerance; iteration stops when the L1 change of
            the rank vector drops below ``N * tol`` (``N`` = number of nodes).
        nstart: Optional dict node -> starting rank; normalized to sum to 1.
            Uniform when omitted.
        weight: Name of the edge attribute used as edge weight.
        dangling: Optional dict node -> weight describing how the rank of
            nodes without out-edges is redistributed; it must contain every
            node of ``G``. Defaults to the personalization vector.

    Returns:
        A dict mapping each node to its PageRank value; ``{}`` for an empty
        graph.

    Raises:
        nx.NetworkXError: If ``personalization`` or ``dangling`` misses a
            node, or if the iteration does not converge within ``max_iter``
            steps.
    """
    if len(G) == 0:
        return {}
    if not G.is_directed():
        D = G.to_directed()
    else:
        D = G

    # Create a copy in (right) stochastic form
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Choose fixed starting vector if not given
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        s = float(sum(nstart.values()))  # Normalized nstart vector
        x = {k: v / s for k, v in nstart.items()}

    if personalization is None:
        p = dict.fromkeys(W, 1.0 / N)  # Assign uniform personalization vector if not given
    else:
        missing = set(G) - set(personalization)
        if missing:
            raise nx.NetworkXError('Personalization dictionary must have a value for every node. Missing nodes %s' % missing)
        s = float(sum(personalization.values()))
        p = {k: v / s for k, v in personalization.items()}

    if dangling is None:
        dangling_weights = p  # Use personalization vector if dangling vector not specified
    else:
        missing = set(G) - set(dangling)
        if missing:
            raise nx.NetworkXError('Dangling node dictionary must have a value for every node. Missing nodes %s' % missing)
        s = float(sum(dangling.values()))
        dangling_weights = {k: v / s for k, v in dangling.items()}

    # Nodes without out-edges: their rank is redistributed on every step.
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            # this matrix multiply looks odd because it is doing a left multiply x^T=xlast^T*W
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
            x[n] += danglesum * dangling_weights[n] + (1.0 - alpha) * p[n]

        # check convergence, l1 norm
        err = sum(abs(x[n] - xlast[n]) for n in x)
        if err < N * tol:
            return x
    raise nx.NetworkXError('Pagerank: power iteration failed to converge in %d iterations.' % max_iter)



import networkx as nx
# Demo on a random Barabasi-Albert graph (60 nodes, 41 edges per new node).
G = nx.barabasi_albert_graph(60, 41)
# Use the pagerank() implementation defined in this file rather than
# networkx's built-in one, so that the hand-written algorithm is what runs.
pr = pagerank(G, 0.4)
print(pr)


### Alternative implementation: PageRank over a link graph scraped with Beautiful Soup


import networkx as nx
from bs4 import BeautifulSoup
import requests


class PageRank:
    """Crawl web pages into a directed link graph and rank them with PageRank.

    Nodes of ``self.graph`` are URLs; an edge ``a -> b`` means that page ``a``
    contains a link to ``b``.
    """

    def __init__(self, damping_factor=0.85, max_iter=100, tol=1.0e-6, timeout=10.0):
        """Store the PageRank parameters and start with an empty graph.

        Args:
            damping_factor: Passed to ``nx.pagerank`` as ``alpha``.
            max_iter: Maximum number of power-iteration steps.
            tol: Convergence tolerance of the power iteration.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.damping_factor = damping_factor
        self.max_iter = max_iter
        self.tol = tol
        self.timeout = timeout
        self.graph = nx.DiGraph()

    def add_edge(self, from_node, to_node):
        """Record a link from ``from_node`` to ``to_node``."""
        self.graph.add_edge(from_node, to_node)

    def compute_pagerank(self):
        """Return a dict mapping every node of the graph to its PageRank."""
        return nx.pagerank(
            self.graph,
            alpha=self.damping_factor,
            max_iter=self.max_iter,
            tol=self.tol,
        )

    @staticmethod
    def _normalize_link(base_url, href):
        """Turn ``href`` into an absolute, fragment-free HTTP(S) URL.

        Relative links are resolved against ``base_url`` and the ``#fragment``
        is removed so that different anchors of one page map to one node.
        Returns ``None`` for links that cannot be crawled (``mailto:``,
        ``javascript:``, malformed URLs, ...).
        """
        from urllib.parse import urldefrag, urljoin, urlparse

        try:
            absolute = urldefrag(urljoin(base_url, href.strip())).url
            scheme = urlparse(absolute).scheme
        except ValueError:
            return None
        return absolute if scheme in ("http", "https") else None

    def scrape_links(self, url):
        """Return the set of absolute URLs linked from the page at ``url``.

        Request failures (including HTTP error statuses and timeouts) are
        reported on stdout and yield an empty set, so one bad page does not
        abort a crawl. Links back to the page itself are skipped.
        """
        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping {url}: {e}")
            return set()

        soup = BeautifulSoup(response.text, 'html.parser')
        page_url = self._normalize_link(url, "")
        links = set()
        for anchor in soup.find_all('a', href=True):
            # Resolve against the final URL so relative links stay correct
            # after a redirect.
            target = self._normalize_link(response.url or url, anchor['href'])
            if target is not None and target != page_url:
                links.add(target)
        return links

    def build_graph_from_urls(self, seed_url, depth=1):
        """Crawl breadth-first from ``seed_url`` and add the links to the graph.

        Args:
            seed_url: Page to start from.
            depth: Number of link levels to follow; with ``depth=1`` only the
                seed page itself is fetched.
        """
        urls_to_visit = [seed_url]
        visited_urls = set()

        for _ in range(depth):
            new_urls = []
            for url in urls_to_visit:
                if url in visited_urls:
                    continue
                visited_urls.add(url)
                # Sorted so that the crawl order is the same on every run.
                for link in sorted(self.scrape_links(url)):
                    self.add_edge(url, link)  # Create edges in the graph
                    new_urls.append(link)
            # Drop duplicates but keep the discovery order.
            urls_to_visit = list(dict.fromkeys(new_urls))




# The crawler instance is deliberately not named `pagerank`, so that it does
# not shadow the pagerank() function defined in this file.
page_ranker = PageRank()
# Example: Build graph from URLs (seeds)
# https://en.wikipedia.org/wiki/Web_scraping
# https://medium.com/@arti.singh280/list/the-quantum-world-6126d55e1882
seed_url = "https://docs.quantum.ibm.com/api/qiskit/release-notes/0.44#misc-deprecations"  # Change this URL as needed
page_ranker.build_graph_from_urls(seed_url, depth=2)

# Compute PageRank
ranks = page_ranker.compute_pagerank()

# Print the PageRank scores
print("PageRank Scores:")
for url, score in ranks.items():
    print(f"{url}: {score:.4f}")