In [5]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"

# Read the reviews CSV (path is relative to the notebook's working directory)
# and preview the first rows.
reviews_path = "chatgpt_reviews.csv"
df = pd.read_csv(filepath_or_buffer=reviews_path)
df.head(5)

Unnamed: 0,Review Id,Review,Ratings,Review Date
0,6fb93778-651a-4ad1-b5ed-67dd0bd35aac,good,5,2024-08-23 19:30:05
1,81caeefd-3a28-4601-a898-72897ac906f5,good,5,2024-08-23 19:28:18
2,452af49e-1d8b-4b68-b1ac-a94c64cb1dd5,nice app,5,2024-08-23 19:22:59
3,372a4096-ee6a-4b94-b046-cef0b646c965,"nice, ig",5,2024-08-23 19:20:50
4,b0d66a4b-9bde-4b7c-8b11-66ed6ccdd7da,"this is a great app, the bot is so accurate to...",5,2024-08-23 19:20:39


In [6]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Review Id      0
Review         6
Ratings        0
Review Date    0
dtype: int64

In [7]:
# Replace missing review text with an empty string so every row holds a str.
df['Review'] = df['Review'].fillna(value='')

In [11]:
!pip install textblob
from textblob import TextBlob
def get_sentiment(review):
    """Classify a review as 'Positive', 'Negative' or 'Neutral'.

    The label is the sign of the polarity score TextBlob assigns to the text.

    Parameters
    ----------
    review : str
        Review text. Non-string values (e.g. None or NaN from missing data)
        and blank strings are treated as carrying no sentiment.

    Returns
    -------
    str
        'Positive' if polarity > 0, 'Negative' if polarity < 0,
        otherwise 'Neutral'.
    """
    # Guard against missing values so this function does not depend on the
    # column having been cleaned beforehand; blank text has nothing to score.
    if not isinstance(review, str) or not review.strip():
        return 'Neutral'

    polarity = TextBlob(review).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'
    
# Label every review, then tally how many reviews fall into each class.
df['Sentiment'] = df['Review'].map(get_sentiment)
sentiment_distribution = df['Sentiment'].value_counts()


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [13]:
# Colour each bar by its sentiment label rather than by position:
# value_counts() orders the index by frequency, so a fixed positional colour
# list can paint a bar with another class's colour.
sentiment_colors = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}
bar_colors = [sentiment_colors.get(label, 'lightgrey')
              for label in sentiment_distribution.index]

fig = go.Figure(data=[go.Bar(x=sentiment_distribution.index,
                             y=sentiment_distribution.values,
                             marker_color=bar_colors)])
fig.update_layout(
    title= "Sentiment Distribution of ChatGPT reviews",
    xaxis_title='Sentiment',
    yaxis_title='Number of Reviews',
    width=800,
    height=600
)
fig.show()

**Analyzing What Users Like About ChatGPT**

In [17]:
# Text of the reviews that were labelled Positive.
positive_reviews = df.loc[df['Sentiment'] == 'Positive', 'Review']

# Count 2- and 3-word phrases (English stop words removed), keeping the
# 100 most frequent ones as the vocabulary.
vectorizer = CountVectorizer(
    ngram_range=(2, 3),
    stop_words="english",
    max_features=100,
)
X = vectorizer.fit_transform(positive_reviews)

# Column sums give the total occurrences of each phrase across all reviews.
phases = vectorizer.get_feature_names_out()
phase_counts = X.sum(axis=0)

# Pair every phrase with its total, most frequent first.
phase_freq = [(phrase, phase_counts[0, idx]) for idx, phrase in enumerate(phases)]
phase_freq.sort(key=lambda pair: pair[1], reverse=True)

phase_df = pd.DataFrame(phase_freq, columns=['Phrase', 'Frequency'])

fig = px.bar(
    phase_df,
    x='Frequency',
    y='Phrase',
    orientation='h',
    title='Top Common Phrases in Positive Reviews',
    width=1000,
    height=600,
)
# Order the bars by their value so the longest bar ends up at the top.
fig.update_layout(
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    yaxis_categoryorder='total ascending',
)
fig.show()

**Analyzing What Users Don’t Like About ChatGPT**

In [19]:
# Text of the reviews that were labelled Negative.
negative_reviews = df[df['Sentiment']=='Negative']['Review']

# Fit a dedicated vectorizer on the negative reviews. Calling transform() on
# a vectorizer that was fitted on another corpus only counts that corpus's
# vocabulary, so phrases that appear mainly in negative reviews would never
# be counted. A separate object is used so any existing `vectorizer` in the
# notebook is left untouched.
vectorizer_neg = CountVectorizer(ngram_range=(2, 3), stop_words="english", max_features=100)
X_neg = vectorizer_neg.fit_transform(negative_reviews)

# Column sums give the total occurrences of each phrase across all reviews.
phrase_counts_neg = X_neg.sum(axis=0)
phrases_neg = vectorizer_neg.get_feature_names_out()
phrase_freq_neg = [(phrases_neg[i], phrase_counts_neg[0, i]) for i in range(len(phrases_neg))]

# Most frequent phrases first.
phrase_freq_neg = sorted(phrase_freq_neg, key=lambda x: x[1], reverse=True)

phrase_neg_df = pd.DataFrame(phrase_freq_neg, columns=['Phrase', 'Frequency'])

fig = px.bar(phrase_neg_df,
             x='Frequency',
             y='Phrase',
             orientation='h',
             title='Top Common Phrases in Negative Reviews',
             labels={'Phrase': 'Phrase', 'Frequency': 'Frequency'},
             width=1000,
             height=600)

fig.update_layout(
    xaxis_title='Frequency',
    yaxis_title='Phrase',
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

**Common Problems Faced by Users in ChatGPT**

In [21]:
# grouping similar phrases into broader problem categories
problem_keywords = {
    'Incorrect Answers': ['wrong answer', 'gives wrong', 'incorrect', 'inaccurate', 'wrong'],
    'App Performance': ['slow', 'lag', 'crash', 'bug', 'freeze', 'loading', 'glitch', 'worst app', 'bad app', 'horrible', 'terrible'],
    'User Interface': ['interface', 'UI', 'difficult to use', 'confusing', 'layout'],
    'Features Missing/Not Working': ['feature missing', 'not working', 'missing', 'broken', 'not available'],
    'Quality of Responses': ['bad response', 'useless', 'poor quality', 'irrelevant', 'nonsense']
}


def _mentions(phrase, keyword):
    """Return True if `keyword` occurs in `phrase` starting at the beginning of a word.

    Matching is case-insensitive, so an upper-case keyword such as 'UI' can
    match phrases that have been lower-cased. Anchoring the keyword to the
    start of a word (by prefixing both strings with a space) stops short
    keywords from matching inside unrelated words, e.g. 'lag' in 'flag',
    while still matching inflected forms such as 'lagging'.
    """
    return f' {keyword.lower()}' in f' {phrase.lower()}'


# initialize a dictionary to count problems
problem_counts = dict.fromkeys(problem_keywords, 0)

# count occurrences of problem-related phrases in negative reviews
for phrase, count in phrase_freq_neg:
    for problem, keywords in problem_keywords.items():
        if any(_mentions(phrase, keyword) for keyword in keywords):
            problem_counts[problem] += count
            # The first matching category wins, so a phrase is counted once.
            break

problem_df = pd.DataFrame(list(problem_counts.items()), columns=['Problem', 'Frequency'])

fig = px.bar(problem_df,
             x='Frequency',
             y='Problem',
             orientation='h', 
             title='Common Problems Faced by Users in ChatGPT',
             labels={'Problem': 'Problem', 'Frequency': 'Frequency'},
             width=1000,
             height=600)

fig.update_layout(
    plot_bgcolor='white',  
    paper_bgcolor='white', 
    xaxis_title='Frequency',
    yaxis_title='Problem',
    yaxis={'categoryorder':'total ascending'}  
)

fig.show()

**Analyzing How Reviews Changed Over Time**

In [22]:
# Parse the review timestamps so they can be bucketed by calendar month.
df['Review Date'] = pd.to_datetime(df['Review Date'])

# Line colour for each sentiment class, in the order the traces are drawn.
trend_colors = {'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}

# Reviews per (month, sentiment), pivoted to one column per sentiment.
# reindex() guarantees all three columns exist (filled with 0) even if a
# class never occurs in the data, so the column lookups below cannot raise
# a KeyError.
sentiment_over_time = (
    df.groupby([df['Review Date'].dt.to_period('M'), 'Sentiment'])
      .size()
      .unstack(fill_value=0)
      .reindex(columns=sorted(trend_colors), fill_value=0)
)

# Convert the monthly periods to timestamps for the date axis.
sentiment_over_time.index = sentiment_over_time.index.to_timestamp()

fig = go.Figure()

for sentiment, color in trend_colors.items():
    fig.add_trace(go.Scatter(x=sentiment_over_time.index, y=sentiment_over_time[sentiment],
                             mode='lines', name=sentiment, line=dict(color=color)))

fig.update_layout(
    title='Sentiment Trends Over Time',
    xaxis_title='Date',
    yaxis_title='Number of Reviews',
    plot_bgcolor='white',  
    paper_bgcolor='white',  
    legend_title_text='Sentiment',
    xaxis=dict(showgrid=True, gridcolor='lightgray'), 
    yaxis=dict(showgrid=True, gridcolor='lightgray')
)

fig.show()

**Analyzing How Often Users Promote ChatGPT**

Promoters: Respondents who provide a score of 9 or 10.
Passives: Respondents who provide a score of 7 or 8.
Detractors: Respondents who provide a score between 0 and 6.


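NPS = %Promoters − %Detractors

Note: the code below applies these buckets to the `Ratings` column by treating a rating of 5 as a Promoter, a rating of 4 as a Passive, and any other rating as a Detractor.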

In [23]:
def _nps_category(rating):
    """Map a rating to its NPS bucket: 5 -> Promoter, 4 -> Passive, otherwise Detractor."""
    if rating == 5:
        return 'Promoter'
    if rating == 4:
        return 'Passive'
    return 'Detractor'


df['NPS Category'] = df['Ratings'].apply(_nps_category)

# Share of reviews in each bucket, expressed as a percentage.
nps_counts = df['NPS Category'].value_counts(normalize=True).mul(100)

# NPS = % promoters minus % detractors; a bucket with no reviews counts as 0.
promoter_pct = nps_counts.get('Promoter', 0)
detractor_pct = nps_counts.get('Detractor', 0)
nps_score = promoter_pct - detractor_pct

nps_score

64.35313912172705