# In [None]:
# Persian Sentiment Analysis and Visualization


# Importing essential libraries for text processing, sentiment analysis, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud_fa import WordCloudFa
from hazm import Normalizer, WordTokenizer, stopwords_list
from textblob import TextBlob  # For rule-based sentiment analysis
import warnings
warnings.filterwarnings('ignore')

# Apply one consistent look to every figure produced below
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("muted")
plt.rcParams.update({
    'figure.figsize': (12, 6),  # default canvas size, in inches
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'font.family': 'Arial',
})

# --- 1. Data Loading and Initial Inspection ---
# Read the raw comments; the comment text is in the 'nazaar' column
df = pd.read_csv('nazar.csv')

# Print a quick summary: dimensions first, then a sample of the comments
print("Dataset Overview:", f"Shape: {df.shape}", "\nFirst 5 Comments:", sep="\n")
print(df['nazaar'].head())

# --- 2. Text Preprocessing ---
# Initialize Hazm tools for Persian text processing
normalizer = Normalizer()
tokenizer = WordTokenizer()
stopwords = set(stopwords_list())  # Start from Hazm's built-in stopword list

# Extend the stopword set with project-specific entries, one word per line.
# 'utf-8-sig' reads plain UTF-8 as well, but drops a leading byte-order mark
# (written by some editors) so the first entry is not silently corrupted.
with open('stopwords.txt', 'r', encoding='utf-8-sig') as f:
    stripped_lines = [line.strip() for line in f.read().splitlines()]
# Skip blank lines so the empty string never ends up in the stopword set;
# stripping above keeps padded entries matchable against tokens.
custom_stopwords = {word for word in stripped_lines if word}
stopwords.update(custom_stopwords)

def preprocess_text(text):
    """Clean one Persian comment for downstream analysis.

    Non-string input (for example a missing value) yields an empty string.
    Otherwise the text is run through ``normalizer``, split into tokens by
    ``tokenizer``, and re-joined with single spaces after dropping every
    token that appears in ``stopwords``.
    """
    if isinstance(text, str):
        normalized = normalizer.normalize(text)
        kept_tokens = filter(
            lambda word: word not in stopwords,
            tokenizer.tokenize(normalized),
        )
        return ' '.join(kept_tokens)
    # Anything that is not a string is treated as an empty comment
    return ''

# Derive a cleaned version of every comment and store it next to the original
raw_comments = df['nazaar']
df['cleaned_text'] = raw_comments.apply(preprocess_text)

# --- 3. Sentiment Analysis ---
def get_sentiment(text, *, threshold=0.1):
    """Classify text as 'Positive', 'Negative' or 'Neutral' via TextBlob polarity.

    Args:
        text: The text to score. Empty or non-string values (e.g. NaN from a
            missing comment) are classified as 'Neutral' without being scored.
        threshold: Polarity strictly above ``threshold`` is 'Positive' and
            strictly below ``-threshold`` is 'Negative'; everything else is
            'Neutral'. Defaults to 0.1.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    # NOTE(review): the text is handed to TextBlob as-is; no translation to
    # English happens here. TextBlob's default analyzer is presumably an
    # English lexicon, so Persian tokens would score 0.0 and land in
    # 'Neutral' -- confirm, and consider a Persian-specific model such as
    # ParsBERT for meaningful labels.
    if not isinstance(text, str) or not text:
        # Mirrors preprocess_text, which also tolerates non-string input
        return 'Neutral'
    polarity = TextBlob(text).sentiment.polarity
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Label every cleaned comment, then report how many fall into each class
sentiment_labels = df['cleaned_text'].apply(get_sentiment)
df['sentiment'] = sentiment_labels

print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

# --- 4. Exploratory Data Analysis (EDA) ---
# Bar chart of how many comments received each sentiment label
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df,
    x='sentiment',
    order=['Positive', 'Neutral', 'Negative'],
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Sentiment Distribution of Mentorship Comments', pad=20)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Comments')
plt.show()

# Merge every cleaned comment into a single string for the word cloud
all_text = ' '.join(df['cleaned_text'])

# Word cloud settings, gathered in one place
wordcloud_options = {
    'persian_normalize': True,
    'include_numbers': False,
    'background_color': 'white',
    'width': 1400,
    'height': 800,
    'stopwords': stopwords,
    # Placeholder: point this at a real Persian-capable .ttf font file
    'font_path': 'path/to/persian_font.ttf',
}
wordcloud = WordCloudFa(**wordcloud_options)
wc = wordcloud.generate(all_text)

# Show the generated cloud without axis ticks or frame
plt.figure(figsize=(14, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Mentorship Comments', pad=20)
plt.show()

# Write the rendered cloud to disk as an image file
wc.to_file('wordcloud.png')

# Word frequency analysis: count every token and keep the ten most common
all_tokens = ' '.join(df['cleaned_text']).split()
word_freq = pd.Series(all_tokens).value_counts().head(10)

# Horizontal bars: words on the y-axis, their counts on the x-axis
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=word_freq.values, y=word_freq.index, palette='viridis', ax=ax)
ax.set_title('Top 10 Most Frequent Words in Comments', pad=20)
ax.set_xlabel('Frequency')
ax.set_ylabel('Word')
plt.show()



# --- Save Results ---
# Persist the DataFrame, including the derived columns, without the index
output_path = 'processed_nazar.csv'
df.to_csv(output_path, encoding='utf-8', index=False)
print("\nProcessed dataset saved as 'processed_nazar.csv'.")