# In [None]:
# Jupyter Notebook: main_analysis.ipynb
# Ensure all dependencies are installed using requirements.txt prior to running this notebook.

# Step 1: Setup and Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.text_preprocessing import TextPreprocessor
from scripts.data_exploration import plot_feature_distribution
from scripts.sentiment_analysis import classify_sentiment, generate_wordcloud, ngram_analysis
from scripts.sentiment_correlation import plot_correlation_heatmap
from scripts.topic_modeling import train_lda_model, train_bertopic_model
from scripts.sentiment_prediction import train_sentiment_model

# Step 2: Load Data
# The CSV path is relative to the directory the notebook is run from.
DATA_PATH = 'data/senti_df.csv'

print("Loading data...")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print("Data loaded successfully!")

# Preview the first five rows; `display` is not imported here, so this relies
# on the notebook runtime providing it.
display(df.head(n=5))

# Step 3: Text Preprocessing
# Each entry of the 'text' column is passed through
# TextPreprocessor.clean_text; the result is stored alongside the original
# in a new 'cleaned_text' column.
print("Starting text preprocessing...")
preprocessor = TextPreprocessor()
df['cleaned_text'] = df['text'].apply(func=preprocessor.clean_text)
print("Text preprocessing completed!")

# Show the raw and cleaned text side by side for the first rows.
display(df.loc[:, ['text', 'cleaned_text']].head())

# Step 4: Initial Data Exploration
# Plot the distribution of each candidate feature; features that are not
# present as columns in the loaded data are silently skipped.
print("Exploring data distributions...")
for feature in ('Speech_date', 'year', 'time', 'gender', 'party_group'):
    if feature not in df.columns:
        continue
    plot_feature_distribution(df, feature)
print("Data exploration completed!")

# Step 5: Sentiment Analysis and Classification
# Name of the score column handed to classify_sentiment. This value is a
# placeholder: replace it with the score column that actually exists in your
# dataset.
SCORE_COLUMN = 'sentiment_score'

print("Classifying sentiment and analyzing word frequencies...")

# Fail fast with an actionable message when the placeholder was never
# replaced, instead of surfacing an opaque error from inside the helper.
if SCORE_COLUMN not in df.columns:
    raise KeyError(
        "Score column {!r} not found in the data; set SCORE_COLUMN to one of "
        "the available columns: {}".format(SCORE_COLUMN, list(df.columns))
    )

# classify_sentiment returns the DataFrame used by all later steps.
df = classify_sentiment(df, SCORE_COLUMN)
generate_wordcloud(df, 'positive')
generate_wordcloud(df, 'negative')

print("Running n-gram analysis for positive and negative speeches...")
ngram_analysis(df, 'positive', 2)  # Bi-gram analysis for positive speeches
ngram_analysis(df, 'negative', 3)  # Tri-gram analysis for negative speeches

print("Sentiment analysis completed!")

# Step 6: Correlation Analysis
print("Calculating and plotting correlation heatmap...")
sentiment_columns = [
    'afinn_sentiment',
    'jockers_sentiment',
    'nrc_sentiment',
    'huliu_sentiment',
    'rheault_sentiment',
]
# The heatmap is all-or-nothing: it is drawn only when every listed column is
# present; a single missing column skips the plot.
if any(col not in df.columns for col in sentiment_columns):
    print("Some sentiment columns are missing; skipping correlation heatmap.")
else:
    plot_correlation_heatmap(df, sentiment_columns)
print("Correlation analysis completed!")

# Step 7: Topic Modeling (LDA and BERTopic)
# Both trainers are pointed at the 'cleaned_text' column produced in Step 3.

# LDA: the trainer's result is unpacked into three names kept for later use.
print("Training LDA model...")
lda_outputs = train_lda_model(df, 'cleaned_text')
lda_model, dictionary, corpus = lda_outputs
print("LDA model training completed!")

# BERTopic: the trainer's single return value is kept as-is.
print("Training BERTopic model...")
bertopic_model = train_bertopic_model(df, 'cleaned_text')
print("BERTopic model training completed!")

# Step 8: Sentiment Prediction Model
# The trainer receives the DataFrame followed by two column names, passed
# positionally: 'cleaned_text' first, then 'sentiment'. Its return value, if
# any, is not kept.
print("Training sentiment prediction model...")
prediction_columns = ('cleaned_text', 'sentiment')
train_sentiment_model(df, *prediction_columns)
print("Sentiment prediction model training completed!")

# Step 9: Conclusion
# Emit the two closing messages, one per line.
print(
    "All tasks have been completed successfully!",
    "Make sure to review the visualizations and outputs for any further analysis.",
    sep="\n",
)
