In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure necessary NLTK resources are downloaded
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
file_path = 'all_sentiment_analysis.csv'
data = pd.read_csv(file_path)

# Sentiment Analysis
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to determine sentiment
def get_sentiment(text):
    sentiment_scores = sia.polarity_scores(text)
    if sentiment_scores['compound'] >= 0.05:
        return 'Positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to the dataset
data['Sentiment'] = data['Text'].apply(get_sentiment)

# Split data into training and test sets for evaluation
train_data, test_data = train_test_split(data[['Text', 'Sentiment']], test_size=0.2, random_state=42)

# Topic Modeling
stop_words = set(stopwords.words('english'))
texts_processed = [
    [word for word in word_tokenize(doc.lower()) if word.isalpha() and word not in stop_words]
    for doc in data['Preprocessed_Text'].dropna()
]

# Create Dictionary and Corpus
dictionary = Dictionary(texts_processed)
corpus = [dictionary.doc2bow(text) for text in texts_processed]

# Build LDA Model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10)

# Evaluate the model with Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=texts_processed, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

# Display Results
print("Sample Sentiment Analysis Results:")
print(train_data.head(10))

print("\nTopic Modeling Results:")
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx}: {topic}")

print(f"\nCoherence Score: {coherence_score}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: 'float' object has no attribute 'encode'

In [3]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure necessary NLTK resources are downloaded
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
file_path = 'all_sentiment_analysis.csv'
data = pd.read_csv(file_path)

# Drop rows with missing or invalid text
data = data.dropna(subset=['Text'])
data['Text'] = data['Text'].astype(str)

# Sentiment Analysis
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to determine sentiment
def get_sentiment(text):
    sentiment_scores = sia.polarity_scores(text)
    if sentiment_scores['compound'] >= 0.05:
        return 'Positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to the dataset
data['Sentiment'] = data['Text'].apply(get_sentiment)

# Split data into training and test sets for evaluation
train_data, test_data = train_test_split(data[['Text', 'Sentiment']], test_size=0.2, random_state=42)

# Topic Modeling
stop_words = set(stopwords.words('english'))
texts_processed = [
    [word for word in word_tokenize(doc.lower()) if word.isalpha() and word not in stop_words]
    for doc in data['Preprocessed_Text'].dropna()
]

# Create Dictionary and Corpus
dictionary = Dictionary(texts_processed)
corpus = [dictionary.doc2bow(text) for text in texts_processed]

# Build LDA Model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10)

# Evaluate the model with Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=texts_processed, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

# Display Results
print("Sample Sentiment Analysis Results:")
print(train_data.head(10))

print("\nTopic Modeling Results:")
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx}: {topic}")

print(f"\nCoherence Score: {coherence_score}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmedatout/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sample Sentiment Analysis Results:
                                                  Text Sentiment
57   Because if like you don't go basically it's li...  Positive
228  I remember because it was around Thanksgiving ...   Neutral
281  please talk about the impact of stuttering on ...   Neutral
486  After the group was over, I gave her some advi...  Positive
221                                        go for it .   Neutral
448  is there anything you or what else would you w...  Positive
422                              as best as possible .  Positive
376  so with family (.) they don't really know that...  Negative
25                I would graduate some better school.  Positive
230  so she specialized in stuttering but she wasn'...  Negative

Topic Modeling Results:
Topic 0: 0.062*"l" + 0.031*"thing" + 0.015*"kind" + 0.014*"work" + 0.012*"n" + 0.012*"important" + 0.009*"xxx" + 0.009*"hand" + 0.009*"acceptance" + 0.009*"go"
Topic 1: 0.069*"stutter" + 0.042*"like" + 0.039*"people" + 0.030*