<a href="https://colab.research.google.com/github/Collins-nnaji/Data_Science/blob/main/Project_2_DrugReview_SentimentAnalysis_TopicModel_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Location of the tab-separated drug-review file that the loading cell reads.
# NOTE(review): assumes Google Drive is already mounted at /content/drive —
# no mount call is visible in this notebook; confirm before Run All.
path = "/content/drive/MyDrive/Colab Notebooks/Drug_Review_Train.tsv"

In [None]:
!pip install wordcloud scikit-learn
!pip install textblob
!pip install gensim nltk pandas
!python -m spacy download en_core_web_sm
!pip install spacy
!pip install gensim sklearn pandas
!pip install pyLDAvis
!pip install matplotlib
%matplotlib inline
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import gensim
import seaborn as sns
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from wordcloud import WordCloud
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [None]:
#1. READ FILE: parse the tab-separated text file into a DataFrame

df = pd.read_csv(filepath_or_buffer=path, sep='\t')
# Preview the first rows to confirm the columns were parsed as expected.
df.head(n=5)

In [None]:
#2. SENTIMENT ANALYSIS USING TEXTBLOB

def get_sentiment(text):
    """Label a piece of text by the sign of its TextBlob polarity.

    The input is converted with ``str()`` before analysis, so non-string
    values are accepted.

    Returns:
        A ``(label, polarity)`` tuple. ``label`` is ``'positive'`` when the
        polarity is above zero, ``'neutral'`` when it equals zero, and
        ``'negative'`` otherwise.
    """
    polarity = TextBlob(str(text)).sentiment.polarity

    if polarity == 0:
        return 'neutral', polarity

    label = 'positive' if polarity > 0 else 'negative'
    return label, polarity

# Score every review column: each call yields (label, polarity) pairs, which
# zip(*...) splits into one tuple of labels and one tuple of scores.
df['benefitsSentiment'], df['benefitsSentimentScore'] = zip(
    *(get_sentiment(review) for review in df['benefitsReview'])
)
df['sideEffectsSentiment'], df['sideEffectsSentimentScore'] = zip(
    *(get_sentiment(review) for review in df['sideEffectsReview'])
)
df['commentsSentiment'], df['commentsSentimentScore'] = zip(
    *(get_sentiment(review) for review in df['commentsReview'])
)

In [None]:
# 4. VIEW DATA IN DIFFERENT COLUMN COMBINATIONS
# First 20 rows, now including the sentiment label and score columns.
df.head(n=20)

In [None]:
# Benefits review text next to its sentiment label and polarity score.
benefits_data = df[[
    'benefitsReview',
    'benefitsSentiment',
    'benefitsSentimentScore',
]]
benefits_data

In [None]:
# Side-effects review text next to its sentiment label and polarity score.
side_effects_data = df[[
    'sideEffectsReview',
    'sideEffectsSentiment',
    'sideEffectsSentimentScore',
]]
side_effects_data

In [None]:
# Free-form comments text next to its sentiment label and polarity score.
comments_data = df[[
    'commentsReview',
    'commentsSentiment',
    'commentsSentimentScore',
]]
comments_data

In [None]:
# Side-effects sentiment together with the rating, effectiveness and
# sideEffects columns, for side-by-side comparison.
side_effects_data2 = df[[
    'sideEffectsReview',
    'sideEffectsSentiment',
    'sideEffectsSentimentScore',
    'rating',
    'effectiveness',
    'sideEffects',
]]
side_effects_data2

In [None]:
# Comments sentiment together with the rating, effectiveness and
# sideEffects columns, for side-by-side comparison.
comments_data2 = df[[
    'commentsReview',
    'commentsSentiment',
    'commentsSentimentScore',
    'rating',
    'effectiveness',
    'sideEffects',
]]
comments_data2

In [None]:
# Benefits sentiment together with the rating, effectiveness and
# sideEffects columns, for side-by-side comparison.
benefits_data2 = df[[
    'benefitsReview',
    'benefitsSentiment',
    'benefitsSentimentScore',
    'rating',
    'effectiveness',
    'sideEffects',
]]
benefits_data2

In [None]:
# Distinct values present in the effectiveness column.
pd.unique(df['effectiveness'])

In [None]:
# Distinct values present in the urlDrugName column.
pd.unique(df['urlDrugName'])

In [None]:
# Distinct values present in the sideEffects column.
pd.unique(df['sideEffects'])

In [None]:
#6. TOPIC MODELLING FOR CONCATENATED REVIEWS

# 1. Preprocessing
# English stop-word list, used by preprocess() to drop function words.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# WordNet data backs the lemmatizer used by preprocess().
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Tokenize a review into lowercase, lemmatized content words for LDA.

    Args:
        text: The review text to tokenize.

    Returns:
        A list of lemmatized, lowercase tokens. English stop words and tokens
        that contain no letters (punctuation, bare numbers) are removed.
    """
    # Lowercase every token so "Pain" and "pain" share one dictionary entry.
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return [
        lemmatizer.lemmatize(token)
        for token in lowered
        # word_tokenize emits punctuation as tokens; without the letter check
        # marks like "." and "," would end up in the topic vocabulary.
        if token not in stop_words and any(ch.isalpha() for ch in token)
    ]
# Aggregating the textual data columns into one document per row.
# Missing reviews are replaced with '' first: casting NaN straight to str
# would add the literal word "nan" to the document.
df['agg_review'] = (
    df['benefitsReview'].fillna('').astype(str) + ' '
    + df['sideEffectsReview'].fillna('').astype(str) + ' '
    + df['commentsReview'].fillna('').astype(str)
)
# Applying preprocessing on the aggregated reviews
df['tokens'] = df['agg_review'].apply(preprocess)

# 2. Data Aggregation (using each review as a separate document here)
texts = df['tokens'].tolist()

# 3. & 4. Model Selection & Determine Topic Count
# Create a dictionary (token -> id) and a bag-of-words corpus for LDA
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Set the number of topics
num_topics = 10

# 5. Topic Modeling using LDA
# A fixed random_state makes the topics and the coherence score below
# reproducible across kernel restarts.
lda_model = LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# 6. Model Evaluation using Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)


In [None]:
    # Prepare the pyLDAvis data for the fitted LDA model and render it as the
    # cell output.
    # NOTE(review): this cell body is indented without an enclosing block —
    # consider dedenting it.
    vis = pyLDAvis.gensim_models.prepare(
        lda_model,
        corpus,
        dictionary,
        mds='mmds',
        n_jobs=1,
    )
    pyLDAvis.display(vis)

In [None]:
# 7. DATA VISUALIZATIONS

def _join_reviews(reviews):
    """Concatenate the non-missing entries of a review column into one string."""
    # Drop NaN first: astype(str) alone would turn each one into the word "nan",
    # which would then be counted by the word cloud.
    return ' '.join(reviews.dropna().astype(str))


def _show_wordcloud(text, title):
    """Draw ``text`` as a titled word cloud and return the WordCloud object."""
    cloud = WordCloud(stopwords=ENGLISH_STOP_WORDS, background_color="white", max_words=100, width=800, height=400).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


# 1. Word Cloud for benefitsReview
all_reviews1 = _join_reviews(df['benefitsReview'])
wordcloud = _show_wordcloud(all_reviews1, 'Word Cloud for Benefits Review')

# 2. Word Cloud for sideEffectsReview
all_reviews2 = _join_reviews(df['sideEffectsReview'])
wordcloud = _show_wordcloud(all_reviews2, 'Word Cloud for sideEffectsReview')

# 3. Word Cloud for commentsReview
all_reviews3 = _join_reviews(df['commentsReview'])
wordcloud = _show_wordcloud(all_reviews3, 'Word Cloud for commentsReview')

In [None]:
# Bar chart of the number of reviews per rating value.
sns.countplot(data=df, x='rating').set(
    title='Distribution of Ratings',
    xlabel='Rating',
    ylabel='Count',
)
plt.show()


In [None]:
# Horizontal bar chart of effectiveness labels, most frequent first.
sns.countplot(
    data=df,
    y='effectiveness',
    order=df['effectiveness'].value_counts().index,
).set(
    title='Distribution of Effectiveness Ratings',
    xlabel='Count',
    ylabel='Effectiveness',
)
plt.show()


In [None]:
# Horizontal bar chart of reported side-effect levels, most frequent first.
sns.countplot(
    data=df,
    y='sideEffects',
    order=df['sideEffects'].value_counts().index,
).set(
    title='Distribution of Reported Side Effects',
    xlabel='Count',
    ylabel='Side Effects',
)
plt.show()


In [None]:
# 8. TEXT CLASSIFICATION
# Train a Naive Bayes classifier that predicts a two-class effectiveness label
# from the TF-IDF representation of benefitsReview.

# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['benefitsReview'] selection: that chained in-place call is not guaranteed
# to update df (it never does under pandas Copy-on-Write).
df['benefitsReview'] = df['benefitsReview'].fillna('')

# Grouping the effectiveness into two classes
df['effectiveness'] = df['effectiveness'].replace({
    'Highly Effective': 'Effective',
    'Considerably Effective': 'Effective',
    'Moderately Effective': 'Effective',
    'Marginally Effective': 'Ineffective',
    'Ineffective': 'Ineffective'
})

# Splitting data; stratify so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df['benefitsReview'],
    df['effectiveness'],
    test_size=0.25,
    random_state=42,
    stratify=df['effectiveness'],
)

# Vectorizing benefitsReview (vocabulary is fitted on the training split only)
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Label encoding effectiveness
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# Encoded ids in the same order as label_encoder.classes_. Passing them
# explicitly keeps report rows and matrix axes aligned with the class names
# even if a class is missing from the test labels or the predictions.
class_ids = np.arange(len(label_encoder.classes_))

# Classifier
clf = MultinomialNB().fit(X_train_tf, y_train_encoded)
predictions = clf.predict(X_test_tf)

# Print Accuracy
print("Accuracy:", accuracy_score(y_test_encoded, predictions))

# Classification Report (zero_division=0 reports 0 for a class that is never
# predicted instead of emitting an undefined-metric warning)
print("\nClassification Report:\n", classification_report(
    y_test_encoded,
    predictions,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

# Confusion Matrix
cm = confusion_matrix(y_test_encoded, predictions, labels=class_ids)

# Visualizing Confusion Matrix
plt.figure(figsize=(7,5))
sns.heatmap(cm, annot=True, cmap="YlGnBu", fmt='g', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()

NameError: ignored