In [14]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import json

# Fetch the NLTK resources this script relies on: the stop-word lists and the
# 'punkt' tokenizer data used by word_tokenize.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Small English spaCy pipeline, used for named-entity extraction.
nlp = spacy.load("en_core_web_sm")

# Load the CSV data from a file
# try:
#     # Replace 'input.csv' with the path to your CSV file
#     df = pd.read_csv('Olacabs_articles.csv')
# except FileNotFoundError:
#     print("The specified CSV file was not found.")
#     exit()

# # Ensure the CSV has a 'content' column
# if 'content' not in df.columns:
#     print("The input CSV file must contain a 'content' column.")
#     exit()
# Load the raw records from the JSON export. Every failure mode prints a short
# explanation and stops the script with a non-zero exit status (SystemExit is
# raised directly instead of relying on the site-provided `exit()` helper).
try:
    with open('Ola.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
except UnicodeDecodeError as e:
    print(f"Error decoding the JSON file: {e}")
    raise SystemExit(1)
except json.JSONDecodeError as e:
    # The file was readable as UTF-8 but its contents are not valid JSON.
    print(f"Error parsing the JSON file: {e}")
    raise SystemExit(1)
except FileNotFoundError:
    print("The specified JSON file was not found.")
    raise SystemExit(1)

# Flatten the loaded records into a DataFrame.
df = pd.json_normalize(data)

# Everything downstream reads df['content'], so fail early with a clear
# message rather than a KeyError further down.
if 'content' not in df.columns:
    print("The input JSON file must contain a 'content' field.")
    raise SystemExit(1)

# English stop words as a set, for fast membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a raw text into a space-joined string of content tokens.

    Steps, in order:
      1. strip truncation markers of the form ``[+123 chars]``;
      2. lowercase;
      3. drop every character that is not an ASCII letter or whitespace;
      4. tokenize with NLTK's ``word_tokenize``;
      5. remove tokens present in the module-level ``stop_words`` set.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or NaN coming
            from records without content) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        remains or when ``text`` is not a string.
    """
    # Missing values reach this function as None/NaN when it is applied to a
    # DataFrame column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'\[\+\d+\s*chars\]', '', text)
    text = text.lower()

    # Digits and punctuation are removed outright (not replaced by a space).
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(tokens)

def extract_entities(text):
    """Return the named entities spaCy finds in ``text`` as one string.

    Args:
        text: The raw (un-normalised) text to analyse. Non-string values
            (e.g. ``None`` or NaN from records without content) are treated
            as empty text.

    Returns:
        The text of every entity in ``doc.ents``, in document order, joined
        by single spaces; ``''`` when no entity is found or ``text`` is not a
        string.
    """
    # The spaCy pipeline only accepts strings; missing values would raise.
    if not isinstance(text, str):
        return ''

    doc = nlp(text)
    return ' '.join(ent.text for ent in doc.ents)

# Derive two text views of every record from the raw 'content' column: the
# normalised token string and the space-joined named entities.
for column, transform in (
    ('cleaned_content', preprocess_text),
    ('entities', extract_entities),
):
    df[column] = df['content'].apply(transform)

# The topic model is fitted on the cleaned text with the entities appended.
df['combined_content'] = df['cleaned_content'] + ' ' + df['entities']

preview_columns = ['content', 'cleaned_content', 'entities', 'combined_content']
print(df[preview_columns].head())

# Sentence-embedding model handed to BERTopic.
model = SentenceTransformer("paraphrase-xlm-r-multilingual-v1")

# Dimensionality reduction settings.
umap_model = UMAP(n_neighbors=10, min_dist=0.1, metric='cosine')

# Unigrams and bigrams, so topic terms can capture short context.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    nr_topics=10,
)

# No clustering model was passed to the constructor, so the parameters are
# adjusted on the instance BERTopic exposes, before fitting.
clusterer = topic_model.hdbscan_model
clusterer.min_cluster_size = 10
clusterer.min_samples = 3

documents = df['combined_content'].tolist()
topics, _ = topic_model.fit_transform(documents)

# Show the per-topic summary table and persist the same table to disk.
topic_info = topic_model.get_topic_info()
print(topic_info)
topic_info.to_csv('olaJSon_with_entities.csv', index=False)


def filter_topic_terms(topic_model, score_threshold):
    """Collect, per topic, the terms whose score reaches ``score_threshold``.

    The topic ids are taken from the keys of ``topic_model.get_topics()``
    rather than assumed to be ``0..n-1``: that mapping is keyed by the real
    topic ids, which include ``-1`` for the outlier topic and need not be
    contiguous. The outlier topic ``-1`` is excluded from the result.

    Args:
        topic_model: Object exposing ``get_topics()`` (mapping of topic id to
            terms) and ``get_topic(topic_id)`` (sequence of ``(term, score)``
            pairs, or a falsy value when the topic has no terms).
        score_threshold: Minimum score (inclusive) a term needs to be kept.

    Returns:
        dict mapping topic id -> list of ``(term, score)`` tuples, in
        ascending topic-id order. A topic with terms but none above the
        threshold maps to an empty list; a topic without terms is omitted.
    """
    filtered_topics = {}
    for topic_num in sorted(topic_model.get_topics()):
        if topic_num == -1:
            # Outlier bucket, not a real topic.
            continue
        terms = topic_model.get_topic(topic_num)
        if not terms:
            continue
        filtered_topics[topic_num] = [
            (term, score) for term, score in terms if score >= score_threshold
        ]
    return filtered_topics


# Terms scoring below this value are left out of the topic report.
score_threshold = 0.05

filtered_topics = filter_topic_terms(topic_model, score_threshold)

# Output locations, named once so the final message cannot drift from the
# files that are actually written.
topics_path = "olaJSonwith_entities.txt"
articles_path = "Jsonentities_output.csv"

# Render the report once so the file and the console show the same text.
report_lines = []
for topic_num, terms in filtered_topics.items():
    report_lines.append(f"Topic {topic_num}:")
    report_lines.extend(f"  {term}: {score:.4f}" for term, score in terms)
    report_lines.append("")  # blank line between topics

# Explicit UTF-8: the fitted text includes entities taken from the raw
# content, so terms may contain characters the platform default encoding
# cannot write.
with open(topics_path, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in report_lines)

for line in report_lines:
    print(line)

# Persist the articles together with the derived text columns.
df.to_csv(articles_path, index=False)

print(f"Output has been saved to '{articles_path}' and '{topics_path}'")


[nltk_data] Downloading package stopwords to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Vikrant
[nltk_data]     Yadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                             content  \
0  Yet again fucking driver accepted the booking ...   
1  More than 1 hour and the food is still not her...   
2  No one is constantly as motherfucking assholes...   
3  Freelance content writers needed. Fully remote...   
4  , Are you guys even concerned about what type ...   

                                     cleaned_content            entities  \
0  yet fucking driver accepted booking came cance...                       
1  hour food still olafoods handles delivery cust...  More than 1 hour #   
2  one constantly motherfucking assholes driverst...          20 minutes   
3  freelance content writers needed fully remote ...                 two   
4  guys even concerned type people hiring booked ...               today   

                                    combined_content  
0  yet fucking driver accepted booking came cance...  
1  hour food still olafoods handles delivery cust...  
2  one constantly motherfucking assholes 