In [1]:
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import plotly.express as px
import plotly.io as pio
import webbrowser

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Step 1: Load the Dataset
apps_df = pd.read_csv("C:/Users/DELL/Downloads/Play Store Data.csv")
reviews_df = pd.read_csv("C:/Users/DELL/Downloads/User Reviews.csv")

In [4]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
print(type(reviews_df))
print(reviews_df.shape)
print(reviews_df.head())


<class 'pandas.core.frame.DataFrame'>
(64295, 5)
                     App                                  Translated_Review  \
0  10 Best Foods for You  I like eat delicious food. That's I'm cooking ...   
1  10 Best Foods for You    This help eating healthy exercise regular basis   
2  10 Best Foods for You                                                NaN   
3  10 Best Foods for You         Works great especially going grocery store   
4  10 Best Foods for You                                       Best idea us   

  Sentiment  Sentiment_Polarity  Sentiment_Subjectivity  
0  Positive                1.00                0.533333  
1  Positive                0.25                0.288462  
2       NaN                 NaN                     NaN  
3  Positive                0.40                0.875000  
4  Positive                1.00                0.300000  


In [6]:
print(reviews_df.head())


                     App                                  Translated_Review  \
0  10 Best Foods for You  I like eat delicious food. That's I'm cooking ...   
1  10 Best Foods for You    This help eating healthy exercise regular basis   
2  10 Best Foods for You                                                NaN   
3  10 Best Foods for You         Works great especially going grocery store   
4  10 Best Foods for You                                       Best idea us   

  Sentiment  Sentiment_Polarity  Sentiment_Subjectivity  
0  Positive                1.00                0.533333  
1  Positive                0.25                0.288462  
2       NaN                 NaN                     NaN  
3  Positive                0.40                0.875000  
4  Positive                1.00                0.300000  


In [7]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [8]:
# Step 2: Data Cleaning
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df = apps_df[apps_df['Rating'] <= 5]
reviews_df.dropna(subset=['Translated_Review'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[column].fillna(apps_df[column].mode()[0], inplace=True)


In [9]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [10]:
# Convert 'Installs' to string first, then remove ',' and '+'
apps_df['Installs'] = apps_df['Installs'].astype(str).str.replace(',', '', regex=False).str.replace('+', '', regex=False).astype(int)

# Convert 'Price' to string first, then remove '$'
apps_df['Price'] = apps_df['Price'].astype(str).str.replace('$', '', regex=False).astype(float)


In [11]:
apps_df['Installs'] = pd.to_numeric(apps_df['Installs'].astype(str).str.replace(',', '', regex=False).str.replace('+', '', regex=False), errors='coerce')
apps_df['Price'] = pd.to_numeric(apps_df['Price'].astype(str).str.replace('$', '', regex=False), errors='coerce')


In [12]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int64
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [13]:
# Merge datasets on 'App' and handle non-matching apps
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

In [14]:
merged_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.0,0.0
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.5,0.6
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.8,0.9


In [15]:
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M', ''))
    elif 'k' in size:
        return float(size.replace('k', '')) / 1024
    else:
        return np.nan

In [16]:
apps_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10834,FR Calculator,FAMILY,4.0,7,2.6M,500,Free,0.0,Everyone,Education,"June 18, 2017",1.0.0,4.1 and up
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,5000,Free,0.0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100,Free,0.0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,1000,Free,0.0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


In [17]:
# Logarithmic transformation for 'Installs'
apps_df['Log_Installs'] = np.log(apps_df['Installs'])

In [18]:
# Convert 'Reviews' to integer type
apps_df['Reviews'] = apps_df['Reviews'].astype(int)

In [19]:
# Logarithmic transformation for 'Reviews'
apps_df['Log_Reviews'] = np.log(apps_df['Reviews'])

In [20]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs            int64
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
dtype: object

In [21]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [22]:
# Add Revenue column
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

In [23]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download VADER lexicon (only needed once)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
# Example 1: Positive review
review = "This app is amazing! I love the new features."
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.8516}


In [25]:
# Example 2: Negative review
review = "This app is very bad! I hate the new features."
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.535, 'neu': 0.465, 'pos': 0.0, 'compound': -0.8427}


In [26]:
# Example 3: Neutral review
review = "This app is okay."
sentiment_score = sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'compound': 0.2263}


In [27]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

In [28]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


In [29]:
# Extract year from 'Last Updated' and create 'Year' column
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [30]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,Top rated,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,Top rated,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,Top rated,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,Top rated,0.0,2018


In [31]:
import os

In [32]:
# Directory to save HTML files
html_files_path = "./"
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

In [33]:
plot_containers = ""

In [34]:
# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Plot configuration settings
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

In [35]:
# Figure 1
category_counts = apps_df['Category'].value_counts().nlargest(10)

fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=400,
    height=300
)

fig1.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

# fig1.update_traces(marker=dict(pattern=dict(line=dict(color='white', width=1))))
save_plot_as_html(
    fig1,
    "Category Graph 1.html",
    "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps"
)


In [36]:
#  figure 2 
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width,
    height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

In [37]:
# figure 3 
fig3 = px.histogram(
    apps_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

In [38]:
# figure 4
sentiment_counts = reviews_df['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig4.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

In [39]:
# figure 5
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig5.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")



In [40]:
# 
updates_per_year = apps_df['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

In [41]:
# figure 7
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig7.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")


In [42]:
# figure 8
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig8.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

In [43]:
# figure 9
fig9 = px.scatter(
    apps_df,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

In [44]:
# figure 10
fig10 = px.box(
    apps_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

In [45]:
# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient

In [46]:
# HTML template for the dashboard
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""

In [47]:
# Define the plot container
sentiment_plot_div = f"""
<div class="plot-container">
    <iframe src="sentiment_distribution.html"></iframe>
    <div class="insights">
        Insight: Apps with higher ratings show more positive sentiment.
    </div>
</div>
"""

# Assign to plots
plots = sentiment_plot_div  # Add more divs like + another_plot_div if needed

# Fill the HTML template
final_html = dashboard_html.format(plots=plots, plot_width=500, plot_height=400)

# Write to file
with open("dashboard.html", "w", encoding="utf-8") as f:
    f.write(final_html)


In [48]:
# Use these containers to fill in your dashboard HTML
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)


In [49]:
# Save the final dashboard to an HTML file
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [50]:
# Automatically open the generated HTML file in a web browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True

1. Visualize the sentiment distribution (positive, neutral, negative) of user reviews using a stacked bar chart, segmented by rating groups (e.g., 1-2 stars, 3-4 stars, 4-5 stars). Include only apps with more than 1,000 reviews and group by the top 5 categories.

In [51]:
import pandas as pd
import plotly.express as px

# Load the datasets
playstore_df = pd.read_csv("C:/Users/DELL/Downloads/Play Store Data.csv")
reviews_df = pd.read_csv("C:/Users/DELL/Downloads/User Reviews.csv")

# Merge on App name
merged_df = pd.merge(playstore_df, reviews_df, on='App')

# Clean and convert columns
merged_df['Reviews'] = pd.to_numeric(merged_df['Reviews'], errors='coerce')
merged_df['Rating'] = pd.to_numeric(merged_df['Rating'], errors='coerce')

# Filter apps with more than 1000 reviews
filtered = merged_df[merged_df['Reviews'] > 1000]

# Find top 5 categories based on number of qualifying apps
top_categories = filtered['Category'].value_counts().head(5).index.tolist()
filtered = filtered[filtered['Category'].isin(top_categories)]

# Create rating group labels
def rating_group(r):
    if 1 <= r < 2:
        return '1-2'
    elif 2 <= r < 3:
        return '2-3'
    elif 3 <= r < 4:
        return '3-4'
    elif 4 <= r <= 5:
        return '4-5'
    else:
        return 'Unknown'

filtered['Rating_Group'] = filtered['Rating'].apply(rating_group)

# Group by Category, Rating Group, and Sentiment
sentiment_dist = filtered.groupby(['Category', 'Rating_Group', 'Sentiment']).size().reset_index(name='Count')

# Plot using Plotly
fig = px.bar(
    sentiment_dist,
    x='Rating_Group',
    y='Count',
    color='Sentiment',
    facet_col='Category',
    title='sentiment bar chart by Rating Group (Top 5 Categories, >1000 Reviews)',
    barmode='stack'
)

fig.write_html("sentiment_bar_chart.html")



2. Create an interactive Choropleth map using Plotly to visualize global installs by Category. Apply filters to show data for only the top 5 app categories and highlight category where the number of installs exceeds 1 million. The app category should not start with the characters “A,” “C,” “G,” or “S.” This graph should work only between 6 PM IST and 8 PM IST; apart from that time, we should not show it in the dashboard itself.


In [52]:
import pandas as pd
import plotly.express as px
from datetime import datetime
import pytz
import numpy as np

# Load the data
playstore_df = pd.read_csv("C:/Users/DELL/Downloads/Play Store Data.csv")

# Clean the 'Installs' column
playstore_df['Installs'] = playstore_df['Installs'].str.replace('[+,]', '', regex=True)
playstore_df['Installs'] = pd.to_numeric(playstore_df['Installs'], errors='coerce')

# Filter out categories starting with 'A', 'C', 'G', or 'S'
filtered_df = playstore_df[~playstore_df['Category'].str.startswith(('A', 'C', 'G', 'S'))]

# Group by Category and calculate total installs
category_installs = filtered_df.groupby('Category')['Installs'].sum().reset_index()

# Get the top 5 categories by total installs
top_5_categories = category_installs.sort_values(by='Installs', ascending=False).head(5)

# Add mock countries (required for choropleth – replace with real data if available)
top_5_categories['Country'] = ['US', 'IN', 'BR', 'RU', 'ID']  # Example: US, India, Brazil, Russia, Indonesia

# Add a flag for highlighting categories with installs > 1 million
top_5_categories['Highlight'] = top_5_categories['Installs'] > 1_000_000

# Get current time in IST
ist_now = datetime.now(pytz.timezone('Asia/Kolkata'))
hour = ist_now.hour

# Plot only between 6 PM and 8 PM IST
if 18 <= hour < 20:
    fig = px.choropleth(
        top_5_categories,
        locations="Country",
        locationmode="country names",
        color="Installs",
        hover_name="Category",
        title="Top 5 App Categories by Installs (Filtered) - Highlighting >1M Installs",
        color_continuous_scale="Viridis"
    )
    fig.write_html("Choropleth_map.html")

else:
    print("Choropleth map is not displayed. This visualization is only available between 6 PM and 8 PM IST.")


Choropleth map is not displayed. This visualization is only available between 6 PM and 8 PM IST.


 3 Plot a bubble chart to analyze the relationship between app size (in MB) and average rating, with the bubble size representing the number of installs. Include a filter to show only apps with a rating higher than 3.5 and that belong to the Game, Beauty ,business , commics , commication , Dating , Entertainment , social and event categories. Reviews should be greater than 500 and the app name should not contain letter "S" and sentiment subjectivity should be more than 0.5 and highlight the Game Category chart in Pink color. We have to translate the Beauty category in Hindi and Business category in Tamil and Dating category in German while showing it on Graphs. Installs should be more than 50k as well as this graph should work only between 5 PM IST to 7 PM IST apart from that time we should not show this graph in dashboard itself

In [53]:
import pandas as pd
import plotly.express as px
from datetime import datetime
import pytz

# Load the datasets
playstore_df = pd.read_csv("C:/Users/DELL/Downloads/Play Store Data.csv")
reviews_df = pd.read_csv("C:/Users/DELL/Downloads/User Reviews.csv")

# Merge both datasets on App name
merged_df = pd.merge(playstore_df, reviews_df, on='App')

# Clean Installs column
merged_df['Installs'] = merged_df['Installs'].str.replace('[+,]', '', regex=True)
merged_df['Installs'] = pd.to_numeric(merged_df['Installs'], errors='coerce')

# Clean Reviews column
merged_df['Reviews'] = pd.to_numeric(merged_df['Reviews'], errors='coerce')

# Convert Size to MB
def convert_size(size_str):
    if isinstance(size_str, str):
        if 'M' in size_str:
            return float(size_str.replace('M', ''))
        elif 'k' in size_str:
            return float(size_str.replace('k', '')) / 1024
    return None

merged_df['Size_MB'] = merged_df['Size'].apply(convert_size)

# Define categories and translation
selected_categories = [
    'GAME', 'BEAUTY', 'BUSINESS', 'COMICS',
    'COMMUNICATION', 'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENT'
]

category_translations = {
    'BEAUTY': 'सौंदर्य',        # Hindi
    'BUSINESS': 'வணிகம்',      # Tamil
    'DATING': 'Partnersuche'    # German
}

# Apply filters
filtered = merged_df[
    (merged_df['Category'].isin(selected_categories)) &
    (merged_df['Rating'] > 3.5) &
    (merged_df['Reviews'] > 500) &
    (~merged_df['App'].str.contains('S', case=False, na=False)) &
    (merged_df['Sentiment_Subjectivity'] > 0.5) &
    (merged_df['Installs'] > 50000)
].copy()

# Translate categories
filtered['Category_Translated'] = filtered['Category'].replace(category_translations)

# Time condition: show only between 5 PM and 7 PM IST
ist_now = datetime.now(pytz.timezone('Asia/Kolkata'))
hour = ist_now.hour

if 17 <= hour < 19:
    fig = px.scatter(
        filtered,
        x='Size_MB',
        y='Rating',
        size='Installs',
        color='Category_Translated',
        hover_name='App',
        title='App Size vs. Average Rating (Filtered)',
        color_discrete_map={'GAME': 'pink'}  # Highlight 'GAME' category in pink
    )
    fig.write_html("bubble_chart.html")

else:
    print("Bubble chart is not displayed. This visualization is only available between 5 PM and 7 PM IST.")


Bubble chart is not displayed. This visualization is only available between 5 PM and 7 PM IST.


In [54]:
import os
import webbrowser

# HTML template
dashboard_html = """
<!DOCTYPE html>
<html>
<head>
    <title>App Analytics Dashboard</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            padding: 40px;
            background: #f9f9f9;
        }}
        h1 {{
            text-align: center;
            color: #333;
        }}
        h2 {{
            color: #444;
        }}
        .dashboard-row {{
            display: flex;
            flex-wrap: wrap;
            justify-content: space-around;
            gap: 20px;
            margin-bottom: 40px;
        }}
        .plot-container {{
            flex: 1;
            min-width: 45%;
            max-width: 48%;
        }}
        iframe {{
            border: none;
            width: 100%;
            height: {plot_height}px;
        }}
        .insights {{
            margin-top: 10px;
            font-style: italic;
            color: #555;
        }}
    </style>
</head>
<body>

    <h1>📊 App Analytics Dashboard</h1>

    <div class="dashboard-row">
        <div class="plot-container">
            <h2>1. Bubble Chart: App Size vs Rating</h2>
            <iframe src="bubble_chart.html"></iframe>
            <div class="insights">Larger apps tend to have higher ratings.</div>
        </div>

        <div class="plot-container">
            <h2>2. Choropleth Map: Global Installs by Category</h2>
            <iframe src="choropleth_map.html"></iframe>
            <div class="insights">Install patterns vary greatly by region and category.</div>
        </div>
    </div>

    <div class="dashboard-row">
        <div class="plot-container">
            <h2>3. Stacked Bar Chart: sentiment bar chart by Rating Group</h2>
            <iframe src="sentiment_bar_chart.html"></iframe>
            <div class="insights">Apps with higher ratings show more positive sentiment.</div>
        </div>
    </div>

</body>
</html>
"""

# Fill HTML
final_html = dashboard_html.format(plot_height=400)

# Save HTML file
dashboard_path = "task_dashboard.html"
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Open in new browser tab
webbrowser.open("file://" + os.path.realpath(dashboard_path), new=2)


True