In [27]:
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load the JSON data from a file
try:
    with open('ola_combined.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except UnicodeDecodeError as e:
    print(f"Error decoding the JSON file: {e}")
    exit()
except FileNotFoundError:
    print("The specified JSON file was not found.")
    exit()

# Convert the loaded data to a DataFrame
df = pd.json_normalize(data)

# Define stopwords list (can be customized)
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters and digits (keeping only alphabets)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Rejoin tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the tweet content
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Display the cleaned content for verification
print(df[['content', 'cleaned_content']].head())

# Use a pre-trained model for sentence embeddings
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Create and fit the BERTopic model
topic_model = BERTopic(
    embedding_model=model,       
    nr_topics=5         
)
topic_model.hdbscan_model.min_cluster_size = 3  # Adjust this to your preference
topic_model.hdbscan_model.min_samples = 2 

topics, _ = topic_model.fit_transform(df['cleaned_content'].tolist())

# View the topics
print(topic_model.get_topic_info())

# Visualize the topics (Optional: You can comment this out if you want to just save results)
# topic_model.visualize_topics()

# Get the top words in each topic and save them to a text file
with open("topics_output.txt", "w") as f:
    for topic_num in range(len(topic_model.get_topics())):
        f.write(f"Topic {topic_num}: {topic_model.get_topic(topic_num)}\n\n")

# Add the topics to the DataFrame
df['topic'] = topics

# Display the DataFrame with topics assigned
print(df[['content', 'topic']].head())

# Save the final DataFrame to a CSV file
df.to_csv("tweet_topics_output.csv", index=False)

print("Output has been saved to 'tweet_topics_output.csv' and 'topics_output.txt'")


[nltk_data] Downloading package stopwords to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             content  \
0  India, now get tasty meals delivered right at ...   
1  Inaugurating record-breaking 4,000 stores on C...   
2  DASH brings your favorite meals to your doorst...   
3  Yep, taking our commitment to to the next leve...   
4  Were not stopping at cities. 4,000 Ola Stores ...   

                                     cleaned_content  
0  india get tasty meals delivered right doorstep...  
1  inaugurating recordbreaking stores christmas t...  
2  dash brings favorite meals doorstep minutes li...  
3  yep taking commitment next level scaling food ...  
4  stopping cities ola stores service centres rol...  
   Topic  Count                                       Name  \
0     -1     24  -1_future_electric_savingswalascooter_ola   
1      0    104   0_savingswalascooter_ola_electric_future   
2      1     22          1_olacoins_rewards_ride_discounts   
3      2      9                2_diwali_happy_weekend_food   
4      3      4        