In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK data used below: the English stop-word list and the
# tokenizer models that word_tokenize() loads.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource in word_tokenize()
# instead of the pickled 'punkt' models; fetching both keeps the script
# working across NLTK versions.
nltk.download('punkt_tab')

# Load the articles from a CSV file in the current working directory.
try:
    df = pd.read_csv('Olacabs_articles.csv')
except FileNotFoundError:
    print("The specified CSV file was not found.")
    # exit() is an interactive helper injected by the `site` module and
    # terminates with status 0; raise SystemExit with a non-zero status so
    # callers (shell, CI) can see that the run failed.
    raise SystemExit(1)

# Everything downstream reads the article text from the 'content' column.
if 'content' not in df.columns:
    print("The input CSV file must contain a 'content' column.")
    raise SystemExit(1)

# English stop words as a set for O(1) membership tests (can be customized).
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Normalise one document for topic modelling.

    Steps, in order: strip truncation markers of the form ``[+123 chars]``,
    lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, and remove tokens found in the
    module-level ``stop_words`` set.

    Args:
        text: Raw document text. Non-string values (for example the float
            NaN that pandas produces for an empty CSV cell, or ``None``)
            are treated as an empty document.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left or the input was not a string.
    """
    # Missing cells arrive as float NaN; re.sub() would raise TypeError.
    if not isinstance(text, str):
        return ''

    # Remove the "[+N chars]" truncation marker before any other cleaning,
    # while its brackets and digits are still present to match on.
    text = re.sub(r'\[\+\d+\s*chars\]', '', text)

    # Lowercase so stop-word matching is case-insensitive.
    text = text.lower()

    # Remove special characters and digits (keeping only alphabets).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, drop stop words, and rejoin into a single string.
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every article body; the cleaned text is stored next to the raw text.
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Show raw and cleaned text side by side as a quick sanity check.
print(df[['content', 'cleaned_content']].head())

# Pre-trained sentence-embedding model handed to BERTopic.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Dimensionality reduction: cosine metric on the embeddings, with a small
# neighbourhood (n_neighbors=10) and min_dist=0.1.
umap_model = UMAP(n_neighbors=10, min_dist=0.1, metric='cosine')

# Topic representation: unigrams and bigrams to capture short context,
# with scikit-learn's English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Create the BERTopic model. nr_topics=10 asks BERTopic to reduce the
# discovered topics to that number (it does not pick the count by itself).
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
)

# BERTopic's constructor takes no min_cluster_size / min_samples arguments,
# so the clustering parameters are set on its HDBSCAN estimator before fit.
topic_model.hdbscan_model.min_cluster_size = 10  # Adjust this to your preference
topic_model.hdbscan_model.min_samples = 3

# Fit the model on the cleaned documents.
topics, _ = topic_model.fit_transform(df['cleaned_content'].tolist())

# Compute the topic overview once, then show it and save it.
# NOTE: topic_model.visualize_topics() gives an interactive view if needed.
topic_info = topic_model.get_topic_info()
print(topic_info)

# Save the topic information to a CSV file
topic_info.to_csv('olaarticlestopics_output_news.csv', index=False)

def filter_topic_terms(topic_model, score_threshold, *, include_outliers=False):
    """Return each topic's terms whose score reaches ``score_threshold``.

    The topic ids are taken from ``topic_model.get_topics()`` itself rather
    than assumed to be ``0..n-1``, so no topic is skipped when ids are not
    contiguous and no non-existent id is probed.

    Args:
        topic_model: Object whose ``get_topics()`` returns a mapping of
            topic id to a sequence of ``(term, score)`` pairs (for example
            a fitted BERTopic model).
        score_threshold: Minimum score (inclusive) a term needs to be kept.
        include_outliers: When true, also include topic ``-1``. It is
            skipped by default, matching the previous behaviour.

    Returns:
        dict mapping topic id to a list of ``(term, score)`` tuples, in
        ascending topic-id order. Topics with no terms at all are omitted;
        topics whose terms all fall below the threshold map to ``[]``.
    """
    all_topics = topic_model.get_topics()
    filtered_topics = {}
    for topic_num in sorted(all_topics):
        if topic_num == -1 and not include_outliers:
            continue
        terms = all_topics[topic_num]
        if not terms:  # Skip topics that have no terms
            continue
        filtered_topics[topic_num] = [
            (term, score) for term, score in terms if score >= score_threshold
        ]
    return filtered_topics

# Terms scoring below this value are dropped from each topic.
score_threshold = 0.1

# Filter topics using the threshold
filtered_topics = filter_topic_terms(topic_model, score_threshold)

# Build the report text once so the file and the console output cannot
# drift apart.
report_lines = []
for topic_num, terms in filtered_topics.items():
    report_lines.append(f"Topic {topic_num}:\n")
    report_lines.extend(f"  {term}: {score:.4f}\n" for term, score in terms)
    report_lines.append("\n")
report = "".join(report_lines)

# Save the filtered topic info to a text file. The encoding is explicit so
# the output does not depend on the platform's default code page.
with open("olacabsarticles_topics_output.txt", "w", encoding="utf-8") as f:
    f.write(report)

# Display the filtered topics (the report already ends with a newline).
print(report, end="")
# Save the final DataFrame to a CSV file
# df.to_csv("filtered_tweet_topics_output.csv", index=False)

# print("Output has been saved to 'filtered_tweet_topics_output.csv' and 'filtered_topics_output.txt'")


[nltk_data] Downloading package stopwords to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             content  \
0  To compound the terrifying experience, the pas...   
1  On Wednesday, Ola CEO Bhavish Aggarwal announc...   
2  A man recently took to social media to share h...   
3  Ola will soon offer food and beverages through...   
4  Welcome to a new edition of ETtech Unwrapped –...   

                                     cleaned_content  
0  compound terrifying experience passenger attem...  
1  wednesday ola ceo bhavish aggarwal announced c...  
2  man recently took social media share experienc...  
3  ola soon offer food beverages throughout india...  
4  welcome new edition ettech unwrapped weekend n...  
   Topic  Count                                     Name  \
0     -1     41          -1_ola_company_ridehailing_cabs   
1      0     13               0_maps_google_ola maps_ola   
2      1     20          1_chief_hemant_officer_ola cabs   
3      2     14       2_electric_ola electric_ola_shares   
4      3     33               3_u