# In [None]:
#Report and wordcloud generation
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import os
import re

# Define attribute mapping
# Maps each raw attribute label found in the CSVs to a canonical category.
# Labels that are not listed here are left untouched by the caller.
attribute_mapping = {
    "Art": "Art",
    "Art and Craft": "Art",
    "Art and Crafts": "Art",
    "Assam Tea": "Cuisine",
    "Ayurveda in Kerala": "Medicine",
    "Costume": "Costume",
    "Costume of Madhya Pradesh": "Costume",
    "Costumes": "Costume",
    "Costumes of Bengal": "Costume",
    "Costumes of Jammu Kashmir": "Costume",
    "Costumes of Maharashtra": "Costume",
    "Costumes of Uttar Pradesh": "Costume",
    "Crafts": "Art",
    "Cuisine": "Cuisine",
    "Cuisines": "Cuisine",
    "Cuisines in Bengal": "Cuisine",
    "Cuisines in Jammu Kashmir": "Cuisine",
    "Cuisines in Maharashtra": "Cuisine",
    "Cuisines in Uttar Pradesh": "Cuisine",
    "Cuisines of Meghalaya": "Cuisine",
    "Culture": "Rituals and Ceremonies",
    "Culture and Tradition": "Rituals and Ceremonies",
    "Culture and Tradition of Jammu Kashmir": "Rituals and Ceremonies",
    "Culture and Tradition of Jharkhand": "Rituals and Ceremonies",
    "Culture and Tradition of Uttar Pradesh": "Rituals and Ceremonies",
    "Culture of Madhya Pradesh": "Rituals and Ceremonies",
    "Culture of Maharashtra": "Rituals and Ceremonies",
    "Culture of Meghalaya": "Rituals and Ceremonies",
    "Dadra and Nagar Haveli and Daman and Diu Language": "Language",
    "Dance": "Music",
    "Dance and Music": "Music",
    "Dressing": "Costume",
    "Fair and Festival": "Festivals",
    "Fair and Festivals": "Festivals",
    "Fairs and Festival": "Festivals",
    "Fairs and Festivals": "Festivals",
    "Famous People": "Personalities",
    "Famous Tourist Attractions": "Tourism",
    "Famous personalities": "Personalities",
    "Famous tourist attraction": "Tourism",
    "Famous tourist attractions": "Tourism",
    "Festival of Kerala": "Festivals",
    "Festivals": "Festivals",
    "Festivals of Bengal": "Festivals",
    "Gujarat Traditions": "Rituals and Ceremonies",
    "Gujarati Costumes": "Costume",
    "Gujarati Language": "Language",
    "Gujarati Music and dance": "Music",
    "Handicraft": "Art",
    "Haryana Costumes": "Costume",
    "Haryana Cuisines": "Cuisine",
    "Haryana Music and dance": "Music",
    "Haryana Traditions": "Rituals and Ceremonies",
    "Haryanavi Language": "Language",
    "History": "History",
    "History of Dadra and Nagar Haveli and Daman and Diu": "History",
    "Language": "Language",
    "Languages": "Language",
    # Was mapped to "Transport", which filed plain "Music" rows under the
    # wrong category; every other music label here maps to "Music".
    "Music": "Music",
    "Music and Dance": "Music",
    "Music and Dances of Bengal": "Music",
    "Music and Dances of Jammu Kashmir": "Music",
    "Music and Dances of Maharashtra": "Music",
    "Music and Dances of Uttar Pradesh": "Music",
    "Nightlife in Goa": "Nightlife",
    "Odisha Costume": "Costume",
    "Odisha Cuisines": "Cuisine",
    "Odisha Custom and Traditions": "Rituals and Ceremonies",
    "Odisha Language": "Language",
    "Odisha Music and Dance": "Music",
    "Odisha Wedding tradition": "Rituals and Ceremonies",
    "Origin": "History",
    "Overview": "Cultural Common Sense",
    "Paintings": "Art",
    "Punjabi Attire": "Costume",
    "Punjabi Cuisine": "Cuisine",
    "Punjabi Dance": "Music",
    "Punjabi Music": "Music",
    "Punjabi Wedding Traditions": "Rituals and Ceremonies",
    "Rajasthani Art and craft": "Art",
    "Rajasthani Costume": "Costume",
    "Rajasthani Cuisines": "Cuisine",
    "Rajasthani Music and Dance": "Music",
    "Rajasthani Tradition": "Rituals and Ceremonies",
    "Religion": "Religion",
    "Sports": "Sports",
    "Tamil Nadu Wedding Tradition": "Rituals and Ceremonies",
    "Tourism": "Tourism",
    "Tourism in Goa": "Tourism",
    "Tourism of Madhya Pradesh": "Tourism",
    "Tradition of Bengal": "Rituals and Ceremonies",
    "Tradition of Maharashtra": "Rituals and Ceremonies",
    "Transport": "Transport",
    "Wedding": "Rituals and Ceremonies",
}

# Input CSVs: one manually checked file per question type.
# Replace with actual file paths
csv_files = [
    r"C:\Users\Arijit Maji\Git_Repositories\CulturalRepo\ManuallyCheckedAndFilteredDataset\mcqs_all_states_AssociationPred_corrected_Manually.csv",
    r"C:\Users\Arijit Maji\Git_Repositories\CulturalRepo\ManuallyCheckedAndFilteredDataset\mcqs_all_states_CountryPred_corrected_Manually.csv",
    r"C:\Users\Arijit Maji\Git_Repositories\CulturalRepo\ManuallyCheckedAndFilteredDataset\mcqs_all_states_GKPred_corrected_Manually.csv",
    r"C:\Users\Arijit Maji\Git_Repositories\CulturalRepo\ManuallyCheckedAndFilteredDataset\mcqs_all_states_StatePred_corrected_Manually.csv",
]

# The question type of a file is looked up by its base name.
question_type_mapping = {
    "mcqs_all_states_AssociationPred_corrected_Manually.csv": "Association",
    "mcqs_all_states_CountryPred_corrected_Manually.csv": "Country",
    "mcqs_all_states_GKPred_corrected_Manually.csv": "General Knowledge",
    "mcqs_all_states_StatePred_corrected_Manually.csv": "State",
}

# Read every file and tag its rows with the question type before collecting it.
dataframes = []
for file in csv_files:
    df = pd.read_csv(file, encoding='ISO-8859-1')
    file_name = os.path.basename(file)
    df['Question_Type'] = question_type_mapping[file_name]
    dataframes.append(df)

# One frame holding the rows of all files, re-indexed from zero.
combined_df = pd.concat(dataframes, ignore_index=True)

# Collapse raw attribute labels into canonical categories; labels that are
# missing from the mapping pass through unchanged.
combined_df['Attribute'] = combined_df['Attribute'].apply(
    lambda x: attribute_mapping.get(x, x)
)

# Statistics
# attribute_counts = combined_df['Attribute'].value_counts()
state_question_type_counts = (
    combined_df
    .groupby(['State', 'Question_Type'])
    .size()
    .reset_index(name='Count')
)
state_counts = combined_df['State'].value_counts()
# question_type_counts = combined_df['Question_Type'].value_counts()

import nltk
# Fetch the NLTK stopwords corpus so that stopwords.words(...) can be used below.
nltk.download('stopwords')

def preprocess_data(data):
    """
    Filters a word-frequency mapping before it is rendered as a word cloud.

    A term is dropped when it starts with a question label such as "Q12" or
    "Q12:", or when its lowercase form is an English stopword.

    Args:
        data (dict): Mapping of term -> frequency.

    Returns:
        dict: The surviving terms with their original frequencies, in the
        same order as the input.
    """
    english_stopwords = set(stopwords.words('english'))
    question_label = re.compile(r'Q\d+\s*:?')

    def keep(term):
        # The label check runs first, then the stopword check.
        if question_label.match(term):
            return False
        return term.lower() not in english_stopwords

    return {term: freq for term, freq in data.items() if keep(term)}

# Generate word clouds
def generate_word_cloud(data, title, output_path):
    """
    Renders a word cloud from word frequencies and saves it as an image.

    Args:
        data (dict): Mapping of word -> frequency. It is filtered through
            preprocess_data before rendering.
        title (str): Title drawn above the word cloud.
        output_path (str): Destination image path handed to plt.savefig.
            Missing parent folders are created.

    Returns:
        None. When no words survive the cleaning step, a message is printed
        and no image is written.
    """
    cleaned_data = preprocess_data(data)
    if not cleaned_data:
        # WordCloud raises ValueError when given zero words; skip this image
        # instead of aborting the whole run.
        print(f"Skipping word cloud '{title}': no words left after cleaning.")
        return

    # savefig does not create missing folders (e.g. "Wordcloud/").
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(cleaned_data)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

# Per-attribute word frequencies (image generation is currently disabled).
attribute_word_counts = Counter(combined_df['Attribute'])
for attribute, count in attribute_word_counts.items():
    attribute_mask = combined_df['Attribute'] == attribute
    attribute_data = combined_df.loc[attribute_mask, 'Question'].str.cat(sep=' ')
    if not attribute_data:
        continue
    word_freq = Counter(attribute_data.split())
    # generate_word_cloud(word_freq, f"Word Cloud for {attribute}", f"Wordcloud/{attribute}_wordcloud.png")

# Per-state word frequencies (image generation is currently disabled).
state_word_counts = Counter(combined_df['State'])
for state, count in state_word_counts.items():
    state_mask = combined_df['State'] == state
    state_data = combined_df.loc[state_mask, 'Question'].str.cat(sep=' ')
    if not state_data:
        continue
    word_freq = Counter(state_data.split())
    # generate_word_cloud(word_freq, f"Word Cloud for {state}", f"Wordcloud/{state}_wordcloud.png")

# Word cloud for the entire dataset.
# Every question is used here. The previous filter on `attribute` reused the
# variable left over from the attribute loop, so only the questions of the
# last attribute ended up in the "India" word cloud.
dataset_words = combined_df['Question'].str.cat(sep=' ')
word_freq_entire_dataset = Counter(dataset_words.split())
generate_word_cloud(word_freq_entire_dataset, "Word Cloud for Sanskriti", "Wordcloud/India_wordcloud.png")

# Save statistics to CSV
# attribute_counts.to_csv("attribute_counts.csv", header=True)
state_question_type_counts.to_csv("state_counts_questionWise.csv", header=True)
# question_type_counts.to_csv("question_type_counts.csv", header=True)

# print("Processing complete. Statistics and word clouds saved.")
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import os
import re
import spacy

# Load the spaCy model for named entity recognition
# NOTE(review): "en_core_web_sm" looks like spaCy's small English pipeline,
# while the CSVs processed further down cover many languages — confirm that
# running it on non-English text is intended.
nlp = spacy.load("en_core_web_sm")

# Define attribute mapping
# Raw attribute labels grouped by the canonical category they collapse into.
# Misspelled and differently-cased labels are listed on purpose: they occur
# in the data and must be normalized too.
_ATTRIBUTE_GROUPS = {
    "Art": (
        "Art",
        "Art and Craft",
        "Art and Crafts",
        "Crafts",
        "Handicraft",
        "Paintings",
        "Rajasthani Art and craft",
        "Literature",
    ),
    "Cuisine": (
        "Assam Tea",
        "Cuisine",
        "Cuisines",
        "Cuisines in Bengal",
        "Cuisines in Jammu Kashmir",
        "Cuisines in Maharashtra",
        "Cuisines in Uttar Pradesh",
        "Cuisines of Meghalaya",
        "Haryana Cuisines",
        "Odisha Cuisines",
        "Punjabi Cuisine",
        "Rajasthani Cuisines",
        "Agriculture",
        "Cousinie",
    ),
    "Medicine": (
        "Ayurveda in Kerala",
        "Healthcare",
    ),
    "Costume": (
        "Costume",
        "Costume of Madhya Pradesh",
        "Costumes",
        "Costumes of Bengal",
        "Costumes of Jammu Kashmir",
        "Costumes of Maharashtra",
        "Costumes of Uttar Pradesh",
        "Dressing",
        "Gujarati Costumes",
        "Haryana Costumes",
        "Odisha Costume",
        "Punjabi Attire",
        "Rajasthani Costume",
        "Clothing",
    ),
    "Rituals and Ceremonies": (
        "Culture",
        "Culture and Tradition",
        "Culture and Tradition of Jammu Kashmir",
        "Culture and Tradition of Jharkhand",
        "Culture and Tradition of Uttar Pradesh",
        "Culture of Madhya Pradesh",
        "Culture of Maharashtra",
        "Culture of Meghalaya",
        "Gujarat Traditions",
        "Haryana Traditions",
        "Odisha Custom and Traditions",
        "Odisha Wedding tradition",
        "Punjabi Wedding Traditions",
        "Rajasthani Tradition",
        "Tamil Nadu Wedding Tradition",
        "Tradition of Bengal",
        "Tradition of Maharashtra",
        "Wedding",
        "Rituals",
        "Tradition",
        "culture and tradition",
    ),
    "Language": (
        "Dadra and Nagar Haveli and Daman and Diu Language",
        "Gujarati Language",
        "Haryanavi Language",
        "Language",
        "Languages",
        "Odisha Language",
        "Lanuages",
        "language",
        "languages",
    ),
    "Dance and Music": (
        "Dance",
        "Dance and Music",
        "Gujarati Music and dance",
        "Haryana Music and dance",
        "Music",
        "Music and Dance",
        "Music and Dances of Bengal",
        "Music and Dances of Jammu Kashmir",
        "Music and Dances of Maharashtra",
        "Music and Dances of Uttar Pradesh",
        "Odisha Music and Dance",
        "Punjabi Dance",
        "Punjabi Music",
        "Rajasthani Music and Dance",
        "Dance and song",
        "Dance ans Music",
        "Folk Dance",
    ),
    "Festivals": (
        "Fair and Festival",
        "Fair and Festivals",
        "Fairs and Festival",
        "Fairs and Festivals",
        "Festival of Kerala",
        "Festivals",
        "Festivals of Bengal",
        "Festival",
        "festivals",
    ),
    "Personalities": (
        "Famous People",
        "Famous personalities",
    ),
    "Tourism": (
        "Famous Tourist Attractions",
        "Famous tourist attraction",
        "Famous tourist attractions",
        "Tourism",
        "Tourism in Goa",
        "Tourism of Madhya Pradesh",
        "Environment",
        "Fort",
        "Place",
        "Urban Planning",
        "Wildlife",
    ),
    "History": (
        "History",
        "History of Dadra and Nagar Haveli and Daman and Diu",
        "Origin",
        "Architecture",
    ),
    "Nightlife": (
        "Nightlife in Goa",
    ),
    "Cultural Common Sense": (
        "Overview",
        "Cultural",
        "Cultural Commonsense",
        "Cultural Common sense",
        "Economy",
        "Education",
        "Governance",
        "Maritime",
        "Technology",
        "Traditional Greetings",
        "Traditional Knowledge",
        "cultural Common Sense",
    ),
    "Religion": (
        "Religion",
    ),
    "Sports": (
        "Sports",
    ),
    "Transport": (
        "Transport",
    ),
}

# Flat lookup: raw label -> canonical category.
attribute_mapping = {
    raw_label: category
    for category, raw_labels in _ATTRIBUTE_GROUPS.items()
    for raw_label in raw_labels
}

# Languages of the final corrected dataset, in processing order.
_DATASET_LANGUAGES = (
    "English",
    "Hindi",
    "Gujarati",
    "Malayalam",
    "Tamil",
    "Telugu",
    "Bengali",
    "Marathi",
    "Punjabi",
    "Odia",
    "Assamese",
    "Urdu",
    "Kannada",
    "Konkani",
    "Sindhi",
)

# Language name -> path of that language's CSV (relative, one directory up).
CSV_PATHS = {
    language: f"../Corrected_Questions_Final_Dataset_{language}.csv"
    for language in _DATASET_LANGUAGES
}

# Load stopwords
import nltk
# Fetch the NLTK stopwords corpus so that stopwords.words(...) can be used.
nltk.download('stopwords')
# Default (English) stopword set. preprocess_data reads this module-level name
# at call time, and the per-language loop further down rebinds it.
stop_words = set(stopwords.words('english'))

def extract_named_entities(text):
    """
    Runs the loaded spaCy pipeline over the text and collects entity strings.

    Args:
        text (str): Text to analyse.

    Returns:
        list: The surface text of every entity spaCy found, in document
        order. Repeated entities appear once per occurrence.
    """
    entity_texts = []
    for entity in nlp(text).ents:
        entity_texts.append(entity.text)
    return entity_texts

def preprocess_data(data):
    """
    Filters a term-frequency mapping before it is rendered as a word cloud.

    A term is dropped when any of the following holds:
      * it starts with a question label such as "Q12" or "Q12:";
      * its lowercase form is "french";
      * its lowercase form is in the module-level stop_words set;
      * it is at most one character long.

    Args:
        data (dict): Mapping of term -> frequency.

    Returns:
        dict: The surviving terms with their original frequencies, in the
        same order as the input.
    """
    kept = {}
    for term, count in data.items():
        if re.match(r'Q\d+\s*:?', term):
            continue
        lowered = term.lower()
        if lowered == 'french' or lowered in stop_words:
            continue
        if len(term) <= 1:
            continue
        kept[term] = count
    return kept

# Generate word clouds
def generate_word_cloud(data, title, output_path):
    """
    Renders a word cloud from term frequencies and saves it as an image.

    Args:
        data (dict): Mapping of term -> frequency. It is filtered through
            preprocess_data before rendering.
        title (str): Title drawn above the word cloud.
        output_path (str): Destination image path handed to plt.savefig.
            Missing parent folders are created.

    Returns:
        None. When no terms survive the cleaning step, a message is printed
        and no image is written.
    """
    cleaned_data = preprocess_data(data)
    if not cleaned_data:
        # WordCloud raises ValueError when given zero words. One file with no
        # usable entities should not abort the remaining languages.
        print(f"Skipping word cloud '{title}': no terms left after cleaning.")
        return

    # savefig does not create missing folders, so create them up front.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(cleaned_data)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    # Close the figure so one figure per language does not pile up in memory.
    plt.close()

# Process each file independently and collect combined DataFrame

# Languages for which the installed NLTK stopwords corpus ships a word list.
# stopwords.words(<other language>) raises OSError, which used to abort the
# run on the first unsupported language.
available_stopword_langs = set(stopwords.fileids())

dataframes = []
for lang, file in CSV_PATHS.items():
    # Load the file
    # NOTE(review): ISO-8859-1 never fails to decode, but a UTF-8 file with
    # non-Latin text would be read as mojibake — confirm the files' encoding.
    df = pd.read_csv(file, encoding='ISO-8859-1')

    # Rebind the module-level stop_words used by preprocess_data. Fall back to
    # the English list when NLTK has no list for this language.
    stopword_lang = lang.strip().lower()
    if stopword_lang not in available_stopword_langs:
        stopword_lang = 'english'
    stop_words = set(stopwords.words(stopword_lang))

    # Base name of the file (currently unused; the question-type lookup is disabled)
    file_name = os.path.basename(file)
    # question_type = question_type_mapping[file_name]

    # Replace attributes using the mapping; unmapped labels are kept as they are
    df['Attribute'] = df['Attribute'].apply(lambda x: attribute_mapping.get(x, x))

    # Extract named entities from the 'Cultural Artifact' column of this file
    dataset_text = df['Cultural Artifact'].str.cat(sep=' ')
    named_entities = extract_named_entities(dataset_text)
    word_freq = Counter(named_entities)

    # Generate word cloud for named entities
    output_path = f"{lang}_named_entities_wordcloud_ca.png"
    generate_word_cloud(word_freq, f"Word Cloud for {lang} (Named Entities)", output_path)

    dataframes.append(df)

# Combine all data into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Generate a word cloud for all named entities in the combined dataset
# dataset_text = combined_df['Question'].str.cat(sep=' ')
# named_entities = extract_named_entities(dataset_text)
# word_freq = Counter(named_entities)
# output_path = "Wordcloud/India_named_entities_wordcloud.png"
# generate_word_cloud(word_freq, "Word Cloud for India (Named Entities)", output_path)

print("Processing complete. Word clouds for question types and combined dataset generated.")