In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob  # For sentiment analysis example

In [2]:
# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the dataset (replace with your actual dataset path)
df = pd.read_csv('/content/drive/MyDrive/reviews.csv')

In [4]:
# Function for preprocessing text data
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]  # Remove stopwords
    lemmatizer = WordNetLemmatizer()  # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [5]:
# Apply preprocessing to 'content' column
df['clean_text'] = df['content'].apply(preprocess_text)

In [6]:
# Sentiment analysis using TextBlob as an example
def analyze_sentiment(text):
    analysis = TextBlob(text)
    sentiment = analysis.sentiment.polarity
    if sentiment > 0:
        return 'positive'
    elif sentiment == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply sentiment analysis
df['predicted_sentiment'] = df['clean_text'].apply(analyze_sentiment)

In [7]:
# Convert ratings to sentiment categories for evaluation
def convert_rating_to_sentiment(score):
    if score >= 4:
        return 'positive'
    elif score == 3:
        return 'neutral'
    else:
        return 'negative'

# Apply conversion
df['true_sentiment'] = df['score'].apply(convert_rating_to_sentiment)

# Example Evaluation and Recommendations
# Evaluation metrics example: Sentiment analysis accuracy
accuracy = accuracy_score(df['true_sentiment'], df['predicted_sentiment'])
print(f"Accuracy: {accuracy}")
print(classification_report(df['true_sentiment'], df['predicted_sentiment']))

Accuracy: 0.5130852340936375
              precision    recall  f1-score   support

    negative       0.73      0.28      0.41      4850
     neutral       0.15      0.19      0.17      1991
    positive       0.57      0.82      0.68      5654

    accuracy                           0.51     12495
   macro avg       0.49      0.43      0.42     12495
weighted avg       0.57      0.51      0.49     12495



In [8]:
# Recommendations based on analysis
positive_reviews = df[df['predicted_sentiment'] == 'positive']
common_issues = df[df['predicted_sentiment'] == 'negative']['clean_text'].value_counts().head(5)

# Print recommendations
print("Recommendations based on analysis:")
print(f"Positive reviews count: {len(positive_reviews)}")
print(f"Common negative issues:")
print(common_issues)

Recommendations based on analysis:
Positive reviews count: 8144
Common negative issues:
clean_text
bad            26
complicated    11
worst app      10
useless         8
confusing       7
Name: count, dtype: int64


In [9]:
# Topic Modeling using LDA
# Vectorize text data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['clean_text'])

# Apply LDA
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(dtm)

# Display top words per topic
print("Top words per topic:")
for index, topic in enumerate(lda_model.components_):
    print(f"Topic {index + 1}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])
    print()

Top words per topic:
Topic 1:
['day', 'need', 'work', 'use', 'sync', 'event', 'google', 'good', 'calendar', 'app']

Topic 2:
['new', 'pay', 'update', 'pro', 'feature', 'ad', 'free', 'premium', 'version', 'app']

Topic 3:
['reminder', 'dont', 'working', 'account', 'doesnt', 'phone', 'time', 'work', 'notification', 'app']

Topic 4:
['feature', 'like', 'reminder', 'option', 'time', 'day', 'add', 'app', 'list', 'task']

Topic 5:
['good', 'best', 'like', 'easy', 'really', 'love', 'habit', 'use', 'great', 'app']

