In [13]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load spaCy's small English pipeline once; the entity-extraction cell reuses it.
nlp = spacy.load("en_core_web_sm")

# Read the article corpus (decoded as Latin-1) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="articles.csv", encoding="latin-1")
print(data.head(n=5))

                                             Article  \
0  Data analysis is the process of inspecting and...   
1  The performance of a machine learning algorith...   
2  You must have seen the news divided into categ...   
3  When there are only two classes in a classific...   
4  The Multinomial Naive Bayes is one of the vari...   

                                               Title  
0                  Best Books to Learn Data Analysis  
1         Assumptions of Machine Learning Algorithms  
2          News Classification with Machine Learning  
3  Multiclass Classification Algorithms in Machin...  
4        Multinomial Naive Bayes in Machine Learning  


In [14]:
# Build one text blob from every title. Missing titles (NaN after read_csv)
# are dropped and the rest coerced to str, because str.join raises a
# TypeError on any non-string item.
titles_text = ' '.join(data['Title'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(titles_text)

# Plot the Word Cloud. Hand px.imshow the rendered image array explicitly
# instead of relying on an implicit WordCloud -> ndarray conversion.
fig = px.imshow(wordcloud.to_array(), title='Word Cloud of Titles')
fig.update_layout(showlegend=False)
# NOTE: rendering the figure inline with fig.show() requires nbformat>=4.2.0
# to be installed in the kernel's environment.
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
def _article_polarity(text):
    """Return TextBlob's polarity score for ``text``, or NaN if it is not a string."""
    if not isinstance(text, str):
        # Empty CSV cells load as float NaN, and TextBlob only accepts strings.
        # Returning NaN keeps the row but leaves it out of the histogram.
        return float('nan')
    return TextBlob(text).sentiment.polarity


data['Sentiment'] = data['Article'].apply(_article_polarity)

# Plot the distribution of sentiments
fig = px.histogram(data, x='Sentiment', title='Distribution of Sentiments')
fig.show()

In [None]:
def extract_named_entities(text):
    """Group the named entities spaCy finds in ``text`` by entity label.

    Args:
        text: Raw article text. Non-string values (for example the float NaN
            pandas uses for empty CSV cells) and blank strings are treated as
            containing no entities.

    Returns:
        dict mapping each entity label (``ent.label_``) to the list of entity
        texts carrying that label, in the order spaCy yields them. Empty when
        nothing is found.
    """
    # Guard first: the pipeline only accepts strings, and blank text cannot
    # contain entities, so skip the model call entirely in both cases.
    if not isinstance(text, str) or not text.strip():
        return {}

    entities = defaultdict(list)
    for ent in nlp(text).ents:
        entities[ent.label_].append(ent.text)
    # Return a plain dict so later lookups of a missing label raise instead of
    # silently inserting an empty list.
    return dict(entities)

data['Entities'] = data['Article'].apply(extract_named_entities)

# Tally every entity mention under its label. Iterating a per-article dict
# directly yields only its keys, which would count each label at most once per
# article and ignore the mention lists collected above.
entity_counts = Counter()
for entities in data['Entities']:
    for label, mentions in entities.items():
        entity_counts[label] += len(mentions)

# most_common() yields (label, count) pairs in descending-count order, so the
# bars match the "Most Common" title. Explicit column names keep the frame
# valid even when no entities were found at all.
entity_df = pd.DataFrame(entity_counts.most_common(), columns=['Entity', 'Count'])

fig = px.bar(entity_df, x='Entity', y='Count', title='Most Common Entities')
fig.show()

In [None]:
# Bag-of-words counts: ignore English stop words, terms appearing in more than
# 95% of articles or in fewer than 2 articles, and keep at most 1000 terms.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = vectorizer.fit_transform(data['Article'])

# Fit a 5-topic LDA model; random_state is fixed so reruns give the same topics.
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topic_matrix = lda_model.fit_transform(tf)

# Label every article with the topic that has the highest weight in its row.
topic_names = [f"Topic {i}" for i in range(lda_model.n_components)]
data['Dominant_Topic'] = [topic_names[i] for i in lda_topic_matrix.argmax(axis=1)]

# Name the index and the value column explicitly: a bare
# value_counts().reset_index() only produces a 'count' column on pandas >= 2.0
# (older releases yield 'index' / 'Dominant_Topic'), which would break y='count'.
topic_counts = (
    data['Dominant_Topic']
    .value_counts()
    .rename_axis('Dominant_Topic')
    .reset_index(name='count')
)

fig = px.bar(topic_counts, x='Dominant_Topic', y='count', title='Topic Distribution')
fig.show()

In [None]:
# Number of highest-weighted words to list for each topic.
N_TOP_WORDS = 5

# Fetch the vocabulary once; it does not change between topics or words, so
# there is no reason to rebuild it inside the loop.
feature_names = vectorizer.get_feature_names_out()

topic_words = []
for topic in lda_model.components_:
    # argsort is ascending, so reverse it to get the heaviest words first.
    word_idx = topic.argsort()[::-1][:N_TOP_WORDS]
    topic_words.append([feature_names[i] for i in word_idx])

# Derive the column labels from the real row width so a vocabulary smaller
# than N_TOP_WORDS does not cause a column-count mismatch.
n_columns = min(N_TOP_WORDS, len(feature_names))
word_columns = [f'Word {rank}' for rank in range(1, n_columns + 1)]

topic_df = pd.DataFrame(topic_words, columns=word_columns)
topic_df.index.name = 'Topic'

print(topic_df)

In [None]:
# Vocabulary of the fitted vectorizer and one display label per LDA topic.
words = vectorizer.get_feature_names_out()
topics = [f"Topic {idx}" for idx, _ in enumerate(lda_model.components_)]

# Rank words by their total weight across all topics and keep the 15 heaviest,
# ordered from heaviest to lightest.
word_sums = lda_model.components_.sum(axis=0)
top_word_indices = word_sums.argsort()[::-1][:15]
top_words = words[top_word_indices]

# Topic-by-word weights restricted to those top words (rows follow `topics`).
top_word_associations = lda_model.components_[:, top_word_indices]

fig = px.imshow(
    top_word_associations,
    x=top_words,
    y=topics,
    labels={"x": "Top Words", "y": "Topics", "color": "Association"},
    color_continuous_scale='Viridis',
    title='LDA Topic Matrix',
)

# Keep the first topic at the top of the heatmap.
fig.update_yaxes(autorange="reversed")
fig.show()