In [6]:
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Load the JSON data from a file
try:
    with open('Ola.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except UnicodeDecodeError as e:
    print(f"Error decoding the JSON file: {e}")
    exit()
except FileNotFoundError:
    print("The specified JSON file was not found.")
    exit()

# Convert the loaded data to a DataFrame
df = pd.json_normalize(data)

# Define stopwords list (can be customized)
stop_words = set(stopwords.words('english'))

# Function to preprocess tweet text
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions (e.g., @username)
    # text = re.sub(r'@\w+', '', text)
    # Remove hashtags (optional, can be kept if needed for analysis)
    # text = re.sub(r'#\w+', '', text)
    # Remove non-ASCII characters (e.g., emojis)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove special characters and digits, keep only alphabets and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Rejoin tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the tweet content
df['cleaned_content'] = df['content'].apply(preprocess_text)

# Display the cleaned content for verification
print(df[['content', 'cleaned_content']].head())

# Use a pre-trained model for sentence embeddings
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Set the hyperparameters for BERTopic
umap_model = UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')  # Adjusting UMAP parameters for better clustering
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))  # Use bigrams to capture short context

# Create and fit the BERTopic model
topic_model = BERTopic(
    embedding_model=model,        
    umap_model=umap_model,       
    vectorizer_model=vectorizer_model,
    nr_topics=10,              # Let BERTopic decide the number of topics
    # min_cluster_size=5,          # Minimum cluster size to capture meaningful topics
    # min_samples=3                # Minimum samples per topic to avoid too small topics
)
topic_model.hdbscan_model.min_cluster_size = 10  # Adjust this to your preference
topic_model.hdbscan_model.min_samples = 3 
# Fit the model
topics, _ = topic_model.fit_transform(df['cleaned_content'].tolist())

# View the topics
print(topic_model.get_topic_info())

# Visualize the topics
# topic_model.visualize_topics()
topic_info = topic_model.get_topic_info()

# Save the topic information to a CSV file
topic_info.to_csv('OLAtopic_output.csv', index=False)
# Get the top words in each topic and save them to a text file
with open("olatopics_output.txt", "w") as f:
    for topic_num in range(len(topic_model.get_topics())):
        f.write(f"Topic {topic_num}: {topic_model.get_topic(topic_num)}\n\n")

# Add the topics to the DataFrame
df['topic'] = topics

# Display the DataFrame with topics assigned
print(df[['content', 'topic']].head())

# Save the final DataFrame to a CSV file
df.to_csv("OLAtweet_topics_output.csv", index=False)

print("Output has been saved to 'tweet_topics_output.csv' and 'topics_output.txt'")


[nltk_data] Downloading package stopwords to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             content  \
0  Yet again fucking driver accepted the booking ...   
1  More than 1 hour and the food is still not her...   
2  No one is constantly as motherfucking assholes...   
3  Freelance content writers needed. Fully remote...   
4  , Are you guys even concerned about what type ...   

                                     cleaned_content  
0  yet fucking driver accepted booking came cance...  
1  hour food still olafoods handles delivery cust...  
2  one constantly motherfucking assholes driverst...  
3  freelance content writers needed fully remote ...  
4  guys even concerned type people hiring booked ...  
   Topic  Count                                               Name  \
0     -1    763                        -1_ola_share_service_launch   
1      0    889                        0_hiring_ola_service_driver   
2      1    829                      1_ola_launch_electric_scooter   
3      2    245                     2_layoff_layoffs_