In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import webbrowser
import os
import plotly.graph_objects as go
import json

In [2]:
apps=pd.read_csv('Play Store Data.csv')
reviews=pd.read_csv('User Reviews.csv')

In [3]:
#data cleaning
apps = apps.dropna(subset=['Rating'])
for column in apps.columns :
    apps[column].fillna(apps[column].mode()[0],inplace=True)
apps.drop_duplicates(inplace=True)
apps=apps=apps[apps['Rating']<=5]
reviews.dropna(subset=['Translated_Review'],inplace=True)

In [4]:
#Convert the Installs columns to numeric by removing commas and +
apps['Installs'] = apps['Installs'].astype(str).str.replace(',', '').str.replace('+', '').astype(int)

In [5]:
#Convert Price column to numeric after removing $
apps['Price'] = apps['Price'].astype(str).str.replace('$', '').astype(float)

In [6]:
apps['Reviews']=apps['Reviews'].astype(int)

In [7]:
def safe_convert_size(size):
	if isinstance(size, str):
		if 'M' in size:
			return float(size.replace('M',''))
		elif 'k' in size:
			return float(size.replace('k',''))/1024
		else:
			return np.nan
	else:
		return size

In [8]:
apps['Size'] = apps['Size'].apply(safe_convert_size)

In [9]:
apps['Log_Installs']=np.log(apps['Installs'])
apps['Reviews']=apps['Reviews'].astype(int)
apps['Log_Reviews']=np.log(apps['Reviews'])

In [10]:
apps = apps.dropna(subset=['Size'])

In [11]:
apps['Revenue'] = apps['Price'] * apps['Installs']

In [12]:
apps['Last Updated'] = pd.to_datetime(apps['Last Updated'])
apps['Month'] = apps['Last Updated'].dt.month
apps['Year'] = apps['Last Updated'].dt.year

In [13]:
apps['Android Ver_Major'] = apps['Android Ver'].apply(
    lambda x: float(str(x).split('.')[0]) if 'and up' in str(x) else np.nan
)

In [14]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37427 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     37427 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37427 non-null  object 
 3   Sentiment_Polarity      37427 non-null  float64
 4   Sentiment_Subjectivity  37427 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.7+ MB


In [15]:
reviews = reviews.dropna(subset=['Translated_Review', 'Sentiment_Subjectivity'])

In [16]:
merged_df = pd.merge(apps, reviews, on='App', how='inner')

In [17]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37800 entries, 0 to 37799
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   App                     37800 non-null  object        
 1   Category                37800 non-null  object        
 2   Rating                  37800 non-null  float64       
 3   Reviews                 37800 non-null  int32         
 4   Size                    37800 non-null  float64       
 5   Installs                37800 non-null  int32         
 6   Type                    37800 non-null  object        
 7   Price                   37800 non-null  float64       
 8   Content Rating          37800 non-null  object        
 9   Genres                  37800 non-null  object        
 10  Last Updated            37800 non-null  datetime64[ns]
 11  Current Ver             37800 non-null  object        
 12  Android Ver             37800 non-null  object

In [18]:
html_files_path="./"
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

In [19]:
plot_containers=""

In [20]:
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')
plot_height=500,
plot_width=800,
plot_bgcolor='blue'

In [21]:
print("Working on Grouped Bar chart")

filtered_df = apps[
    (apps['Rating'] >= 4.0) &
    (apps['Size'] < 10) &
    (apps['Last Updated'].dt.month == 1)
]

top_10_installs_df = filtered_df.groupby('Category').agg(
    Average_Rating=('Rating', 'mean'),
    Total_Reviews=('Reviews', 'sum')
).sort_values(by='Total_Reviews', ascending=False).head(10).reset_index()

fig1= go.Figure()
fig1.add_trace(go.Bar(
    x=top_10_installs_df['Category'],
    y=top_10_installs_df['Average_Rating'],
    name='Average Rating',
    marker_color='rgb(102, 194, 165)'
))
fig1.add_trace(go.Bar(
    x=top_10_installs_df['Category'],
    y=top_10_installs_df['Total_Reviews'],
    name='Total Reviews',
    marker_color='rgb(252, 141, 98)',
    yaxis='y2'
))

# Layout settings
fig1.update_layout(
      legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="right",
        x=1.5,
        font=dict(size=16)
    ),
    margin=dict(t=100, b=70, l=70, r=70),
    width=800,
    height=400,
    title='Average Rating and Total Reviews for Top 10 App Categories',
    xaxis_title='Category',
    yaxis=dict(
        title=dict(
            text='Average Rating',
            font=dict(color='rgb(102, 194, 165)')
        ),
        tickfont=dict(color='rgb(102, 194, 165)')
    ),
    yaxis2=dict(
        title=dict(
            text='Total Reviews',
            font=dict(color='rgb(252, 141, 98)')
        ),
        overlaying='y',
        side='right',
        tickfont=dict(color='rgb(252, 141, 98)')
    ),
    barmode='group',
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_font_size=16,
)


save_plot_as_html(
    fig=fig1,
    filename="Grouped_bar_chart.html",
    insight="This grouped bar chart shows the average rating and total reviews for the top 10 app categories with high ratings and small size updated in January."
)
    

Working on Grouped Bar chart


In [22]:
print("Working on Choropleth Map")

# Filtered dataframe
filtered_df = apps[
    (apps['Installs'] > 800000) &
    (apps['Category'].str.startswith(('A', 'C', 'G', 'S')))
]

# Top 5 categories
top_5_categories = filtered_df.groupby('Category')['Installs'].sum().nlargest(5).index
filtered_df = filtered_df[filtered_df['Category'].isin(top_5_categories)]

# Random country assignment
countries = ['United States', 'India', 'Brazil', 'Germany', 'Japan', 'South Korea', 'United Kingdom', 'France']
filtered_df['Country'] = np.random.choice(
    countries,
    size=len(filtered_df),
    p=[0.25, 0.25, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05]
)

# Grouping for choropleth
installs_by_country_category = filtered_df.groupby(['Country', 'Category'])['Installs'].sum().reset_index()

# Build choropleth figure
fig2 = px.choropleth(
    installs_by_country_category,
    locations='Country',
    locationmode='country names',
    color='Installs',
    hover_name='Country',
    animation_frame='Category',
    color_continuous_scale=px.colors.sequential.Teal,
    title='Global Installs by App Category',
)

# Layout updates
fig2.update_layout(

    width=800,
    height=400,
    title_font_size=16
)
save_plot_as_html(
    fig=fig2,
    filename="Choropleth_Map.html",
    insight="This choropleth map visualizes the global installs by app category, highlighting the top categories across different countries."
)


Working on Choropleth Map


In [23]:
print("Working on Dual Axis-chart")

# Filtered dataframe
filtered_df = apps[
    (apps['Installs'] >= 10000) &
    (apps['Revenue'] >= 10000) &
    (apps['Android Ver_Major'] > 4.0) &
    (apps['Size'] > 15) &
    (apps['Content Rating'] == 'Everyone') &
    (apps['App'].str.len() <= 30)
]

# Top 3 categories
top_3_categories = filtered_df.groupby('Category')['Installs'].sum().nlargest(3).index
filtered_df = filtered_df[filtered_df['Category'].isin(top_3_categories)]

# Grouped dataframe
grouped_df = filtered_df.groupby(['Category', 'Type']).agg(
    Average_Installs=('Installs', 'mean'),
    Average_Revenue=('Revenue', 'mean')
).reset_index()

# Build figure
fig3= go.Figure()
fig3.add_trace(go.Bar(
    x=grouped_df['Category'][grouped_df['Type'] == 'Free'],
    y=grouped_df['Average_Installs'][grouped_df['Type'] == 'Free'],
    name='Average Installs (Free)',
    marker_color='mediumseagreen'
))
fig3.add_trace(go.Bar(
    x=grouped_df['Category'][grouped_df['Type'] == 'Paid'],
    y=grouped_df['Average_Installs'][grouped_df['Type'] == 'Paid'],
    name='Average Installs (Paid)',
    marker_color='cadetblue'
))
fig3.add_trace(go.Scatter(
    x=grouped_df['Category'],
    y=grouped_df['Average_Revenue'],
    mode='lines+markers',
    name='Average Revenue',
    line=dict(color='firebrick', width=4),
    yaxis='y2'
))

# Layout settings
fig3.update_layout(
      legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.5,
        font=dict(size=16)
    ),
    margin=dict(t=100, b=70, l=50, r=70),
    width=800,
    height=400,
    title='Average Installs & Revenue for Top 3 Categories',
    xaxis_title='Category',
    yaxis=dict(title='Average Installs'),
    yaxis2=dict(
        title='Average Revenue',
        overlaying='y',
        side='right'
    ),

    barmode='group',
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_font_size=16,
)
save_plot_as_html(
    fig=fig3,
    filename="Dual_Axis_Chart.html",
    insight="This dual-axis chart compares the average installs and revenue for the top 3 app categories."
)



Working on Dual Axis-chart


In [24]:
print("Working on Time Series Line Chart")

# Filtered dataframe
filtered_df = apps[
    (apps['App'].str.lower().str.startswith(('x', 'y', 'z'))) &
    (apps['Category'].str.startswith(('E', 'C', 'B'))) &
    (apps['Reviews'] > 500)
]

# Category translation
category_translation = {
    'BEAUTY': 'सुंदरता',
    'BUSINESS': 'வணிகம்',
    'DATING': 'Dating'
}
filtered_df['Category_Translated'] = filtered_df['Category'].map(category_translation).fillna(filtered_df['Category'])

# Group by month + category
grouped_df = filtered_df.groupby(
    [filtered_df['Last Updated'].dt.to_period('M'), 'Category_Translated']
)['Installs'].sum().reset_index()
grouped_df['Last Updated Month'] = grouped_df['Last Updated'].dt.to_timestamp()

# Build line chart
fig4= px.line(
    grouped_df,
    x='Last Updated Month',
    y='Installs',
    color='Category_Translated',
    title='Trend of Total Installs Over Time',
    labels={
        'Last Updated Month': 'Date',
        'Installs': 'Total Installs',
        'Category_Translated': 'Category'
    }
)

# Highlight periods of >20% growth
for category in grouped_df['Category_Translated'].unique():
    category_df = grouped_df[grouped_df['Category_Translated'] == category].sort_values('Last Updated Month')
    category_df['Installs_Prev'] = category_df['Installs'].shift(1)
    category_df['Growth'] = (category_df['Installs'] - category_df['Installs_Prev']) / category_df['Installs_Prev'] * 100

    significant_growth_periods = category_df[category_df['Growth'] > 20]

    for _, row in significant_growth_periods.iterrows():
        fig4.add_vrect(
            x0=row['Last Updated Month'],
            x1=row['Last Updated Month'] + pd.DateOffset(months=1),
            fillcolor='green',
            opacity=0.1,
            layer='below',
            line_width=0,
            annotation_text=">20% Growth",
            annotation_position="top left",
            annotation_font_size=10,
            annotation_font_color='green'
        )

fig4.update_layout(
    legend=dict(
        orientation="v",
        y=1,
        x=1.5,
        xanchor="right",
        font=dict(size=15)
    ),
    margin=dict(t=85, b=70, l=70, r=70),
    width=800,
    height=400,
    plot_bgcolor='white',
    paper_bgcolor='white',
    title_font_size=16,
    # Add grid settings for both axes
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgray'
    ),
    yaxis=dict(
        showgrid=True,
        gridcolor='lightgray'
    )
)
save_plot_as_html(
    fig=fig4,
    filename="Time_Series_Line_Chart.html",
    insight="This time series line chart shows the trend of total installs over time for different app categories."
)



Working on Time Series Line Chart




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [25]:
print("Working on Bubble Chart")

# Merge with reviews
df_with_reviews = pd.merge(apps, reviews, on='App', how='inner')
df_with_reviews = df_with_reviews.dropna(subset=['Sentiment_Subjectivity'])

# Apply filters
filtered_df = df_with_reviews[
    (df_with_reviews['Rating'] > 3.5) &
    (df_with_reviews['Category'].isin([
        'GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION',
        'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS'
    ])) &
    (df_with_reviews['Reviews'] > 500) &
    (~df_with_reviews['App'].str.lower().str.contains('s')) &
    (df_with_reviews['Sentiment_Subjectivity'] > 0.5) &
    (df_with_reviews['Installs'] > 50000)
]

# Group by App
grouped_df = filtered_df.groupby('App').agg(
    Size=('Size', 'mean'),
    Rating=('Rating', 'mean'),
    Installs=('Installs', 'mean'),
    Category=('Category', 'first')
).reset_index()

# Category translation
category_translation = {
    'BEAUTY': 'सुंदरता',
    'BUSINESS': 'வணிகம்',
    'DATING': 'Dating'
}
grouped_df['Category_Translated'] = grouped_df['Category'].map(category_translation).fillna(grouped_df['Category'])

# Color map
color_map = {
    'GAME': 'rgb(255, 105, 180)',
    'सुंदरता': 'rgb(63, 191, 191)',
    'வணிகம்': 'rgb(191, 63, 191)',
    'Dating': 'rgb(63, 63, 191)',
    'COMICS': 'rgb(191, 191, 63)',
    'COMMUNICATION': 'rgb(191, 63, 63)',
    'ENTERTAINMENT': 'rgb(63, 191, 63)',
    'SOCIAL': 'rgb(191, 63, 127)',
    'EVENTS': 'rgb(127, 63, 191)'
}

# Build scatter (bubble) chart
fig5 = px.scatter( grouped_df, 
x='Size', 
y='Rating', 
size='Installs', 
color='Category_Translated', 
hover_name='App', 
title='App Size vs. Average Rating (Installs as Bubble Size)', 
labels={'Size': 'Size (MB)', 'Rating': 'Average Rating', 'Installs': 'Number of Installs'}, 
color_discrete_map=color_map )

fig5.update_layout( 
    legend=dict( orientation="v",
     y=1, x=1.5, 
     xanchor="center",
      font=dict(size=15) ), 
      margin=dict(t=80, b=90, l=80, r=90), 
      width=800, 
      height=400, 
      plot_bgcolor='white', 
      paper_bgcolor='white', 
      title_font_size=16, 
      xaxis=dict(showgrid=True, gridcolor='black'), 
      yaxis=dict(showgrid=True, gridcolor='black') )

save_plot_as_html(
    fig=fig5,
    filename="Bubble_Chart.html",
    insight="This bubble chart visualizes the relationship between app size, average rating, and number of installs, with categories represented by different colors."
)

Working on Bubble Chart


In [26]:
plot_containers_split=plot_containers.split('</div>')

In [27]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [28]:
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title> Google Play Store Analytics Dashboard</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <script src="https://cdn.plot.ly/plotly-2.35.2.min.js"></script>
    <style>
        body {{
            font-family: 'Inter', sans-serif;
            background-color: #f3f4f6;
            color: #1f2937;
            padding: 2rem;
        }}
        .dashboard-container {{
            display: flex;
            flex-direction: column;
            gap: 2rem;
            align-items: center;
        }}
        .plot-card {{
            background-color: white;
            border-radius: 1rem;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            padding: 1.5rem;
            transition: transform 0.2s ease-in-out;
            display: none;
            width: 250%;
            max-width: 1000px;
            overflow: auto;
            height: 50%;
        }}
        .plot-card:hover {{
            transform: translateY(-5px);
            box-shadow: 0 0 20px 5px rgba(59, 130, 246, 0.4);
        }}
        .plot-title {{
            font-size: 1.25rem;
            font-weight: bold;
            text-align: center;
            margin-bottom: 1rem;
        }}
        #no-charts-message {{
            display: none;
            text-align: center;
            font-size: 1.5rem;
            color: #6b7280;
            margin-top: 4rem;
        }}
    </style>
</head>
<body>
    <div class="flex flex-col items-center">
        <h1 class="text-3xl font-bold mb-8 text-gray-800">Google Play Store Analytics Dashboard</h1>
        <div class="dashboard-container">

            <div class="plot-card" id="plot1-container" onclick="redirectTo('Grouped_bar_chart.html')">
                <div class="plot-title">Grouped Bar Chart</div>
                {Grouped_bar_chart}
                <button onclick="downloadPlot('plot1-container', 'grouped_bar_chart.png'); event.stopPropagation();"
                        class="mt-4 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 transition-colors">
                        Download Plot
                </button>
            </div>

            <div class="plot-card" id="plot2-container" onclick="redirectTo('Choropleth_map.html')">
                <div class="plot-title">Choropleth Map</div>
                {Choropleth_map}
                <button onclick="downloadPlot('plot2-container', 'choropleth_map.png'); event.stopPropagation();"
                        class="mt-4 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 transition-colors">
                        Download Plot
                </button>
            </div>

            <div class="plot-card" id="plot3-container" onclick="redirectTo('Dual_Axis_Chart.html')">
                <div class="plot-title">Dual-Axis Chart</div>
                {Dual_Axis_Chart}
                <button onclick="downloadPlot('plot3-container', 'dual_axis_chart.png'); event.stopPropagation();"
                        class="mt-4 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 transition-colors">
                        Download Plot
                </button>
            </div>

            <div class="plot-card" id="plot4-container" onclick="redirectTo('Time_Series_Line_Chart.html')">
                <div class="plot-title">Time Series Line Chart</div>
                {Time_Series_Line_Chart}
                <button onclick="downloadPlot('plot4-container', 'time_series_chart.png'); event.stopPropagation();"
                        class="mt-4 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 transition-colors">
                        Download Plot
                </button>
            </div>

            <div class="plot-card" id="plot5-container" onclick="redirectTo('Bubble_Chart.html')">
                <div class="plot-title">Bubble Chart</div>
                {Bubble_Chart}
                <button onclick="downloadPlot('plot5-container', 'bubble_chart.png'); event.stopPropagation();"
                        class="mt-4 px-4 py-2 bg-blue-500 text-white rounded-lg hover:bg-blue-600 transition-colors">
                        Download Plot
                </button>
            </div>

        </div>
        <div id="no-charts-message">
            <h2>No charts are available at this time. Please check back later.</h2>
        </div>
    </div>

    <script>
        function renderPlotsBasedOnTime() {{
            const now = new Date();
            const hour = now.getHours();
            let visiblePlotsCount = 0;

            const timeWindows = {{
                'plot1-container': {{start: 15, end: 17}},
                'plot2-container': {{start: 18, end: 20}},
                'plot3-container': {{start: 13, end: 14}},
                'plot4-container': {{start: 18, end: 21}},
                'plot5-container': {{start: 17, end: 19}}
            }};

            const noChartsMessage = document.getElementById('no-charts-message');

            for (const [containerId, window] of Object.entries(timeWindows)) {{
                const container = document.getElementById(containerId);
                if (hour >= window.start && hour <= window.end) {{
                    container.style.display = 'block';
                    visiblePlotsCount++;
                }} else {{
                    container.style.display = 'none';
                }}
            }}

            if (visiblePlotsCount === 0) {{
                noChartsMessage.style.display = 'block';
            }} else {{
                noChartsMessage.style.display = 'none';
            }}
        }}

        function downloadPlot(containerId, fileName) {{
            const plotDiv = document.getElementById(containerId).getElementsByClassName("plotly-graph-div")[0];
            if (plotDiv) {{
                Plotly.downloadImage(plotDiv, {{format: 'png', filename: fileName}});
            }}
        }}

        function redirectTo(page) {{
            window.location.href = page;
        }}

        renderPlotsBasedOnTime();
        setInterval(renderPlotsBasedOnTime, 60000);
    </script>
</body>
</html>
"""

In [29]:
with open("Grouped_bar_chart.html", "r", encoding="utf-8") as f:
    grouped_html = f.read()

with open("Choropleth_map.html", "r", encoding="utf-8") as f:
    choropleth_html = f.read()

with open("Dual_Axis_Chart.html", "r", encoding="utf-8") as f:
    dual_axis_html = f.read()

with open("Time_Series_Line_Chart.html", "r", encoding="utf-8") as f:
    time_series_html = f.read()

with open("Bubble_Chart.html", "r", encoding="utf-8") as f:
    bubble_html = f.read()

final_html = dashboard_html.format(
    Grouped_bar_chart=grouped_html,
    Choropleth_map=choropleth_html,
    Dual_Axis_Chart=dual_axis_html,
    Time_Series_Line_Chart=time_series_html,
    Bubble_Chart=dual_axis_html,
    plot_width=plot_width,plot_height=plot_height
)

In [30]:
dashboard_path=os.path.join(html_files_path,"index.html")

In [31]:
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [32]:
webbrowser.open('file://'+os.path.realpath(dashboard_path))

True