In [39]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

In [40]:
#reading the dataset
apps=pd.read_csv('Play Store Data.csv')
reviews=pd.read_csv('User Reviews.csv')

In [41]:
#data cleaning
apps = apps.dropna(subset=['Rating'])
for column in apps.columns :
    apps[column].fillna(apps[column].mode()[0],inplace=True)
apps.drop_duplicates(inplace=True)
apps=apps=apps[apps['Rating']<=5]
reviews.dropna(subset=['Translated_Review'],inplace=True)

In [42]:
#Convert the Installs columns to numeric by removing commas and +
apps['Installs'] = apps['Installs'].astype(str).str.replace(',', '').str.replace('+', '').astype(int)

In [43]:
#Convert Price column to numeric after removing $
apps['Price'] = apps['Price'].astype(str).str.replace('$', '').astype(float)

In [44]:
apps.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8892 non-null   object 
 1   Category        8892 non-null   object 
 2   Rating          8892 non-null   float64
 3   Reviews         8892 non-null   object 
 4   Size            8892 non-null   object 
 5   Installs        8892 non-null   int32  
 6   Type            8892 non-null   object 
 7   Price           8892 non-null   float64
 8   Content Rating  8892 non-null   object 
 9   Genres          8892 non-null   object 
 10  Last Updated    8892 non-null   object 
 11  Current Ver     8892 non-null   object 
 12  Android Ver     8892 non-null   object 
dtypes: float64(2), int32(1), object(10)
memory usage: 937.8+ KB


In [45]:
#merging the datasets
merged_df=pd.merge(apps,reviews,on='App',how='inner')

In [46]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59124 entries, 0 to 59123
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     59124 non-null  object 
 1   Category                59124 non-null  object 
 2   Rating                  59124 non-null  float64
 3   Reviews                 59124 non-null  object 
 4   Size                    59124 non-null  object 
 5   Installs                59124 non-null  int32  
 6   Type                    59124 non-null  object 
 7   Price                   59124 non-null  float64
 8   Content Rating          59124 non-null  object 
 9   Genres                  59124 non-null  object 
 10  Last Updated            59124 non-null  object 
 11  Current Ver             59124 non-null  object 
 12  Android Ver             59124 non-null  object 
 13  Translated_Review       59124 non-null  object 
 14  Sentiment               59124 non-null

In [47]:
#data transformation
def convert_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps['Size']=apps['Size'].apply(convert_size)

In [48]:
apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [49]:
apps['Log_Installs']=np.log(apps['Installs'])
apps['Reviews']=apps['Reviews'].astype(int)
apps['Log_Reviews']=np.log(apps['Reviews'])

In [50]:
def rating_group(rating):
    if rating >= 4:
        return 'Top rated app'
    elif rating >=3:
        return 'Above average'
    elif rating >=2:
        return 'Average'
    else:
        return 'Below Average'
apps['Rating_Group']=apps['Rating'].apply(rating_group)

In [51]:
apps['Revenue'] = apps['Price'] * apps['Installs']
apps['Last Updated'] = pd.to_datetime(apps['Last Updated'], errors='coerce')
apps['Year'] = apps['Last Updated'].dt.year

In [52]:
apps.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8892 entries, 0 to 10840
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             8892 non-null   object        
 1   Category        8892 non-null   object        
 2   Rating          8892 non-null   float64       
 3   Reviews         8892 non-null   int32         
 4   Size            7424 non-null   float64       
 5   Installs        8892 non-null   int32         
 6   Type            8892 non-null   object        
 7   Price           8892 non-null   float64       
 8   Content Rating  8892 non-null   object        
 9   Genres          8892 non-null   object        
 10  Last Updated    8892 non-null   datetime64[ns]
 11  Current Ver     8892 non-null   object        
 12  Android Ver     8892 non-null   object        
 13  Log_Installs    8892 non-null   float64       
 14  Log_Reviews     8892 non-null   float64       
 15  Rating_G

In [53]:
#sentiment analysis
model = SentimentIntensityAnalyzer()
def get_sentiment_score(review):
    return model.polarity_scores(str(review))['compound']
reviews['Sentiment_Score'] = reviews['Translated_Review'].apply(get_sentiment_score)

In [54]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37427 entries, 0 to 64230
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     37427 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37427 non-null  object 
 3   Sentiment_Polarity      37427 non-null  float64
 4   Sentiment_Subjectivity  37427 non-null  float64
 5   Sentiment_Score         37427 non-null  float64
dtypes: float64(3), object(3)
memory usage: 2.0+ MB


In [55]:
html_files_path="./"
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

In [56]:
plot_containers=""

In [57]:
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')
plot_width=500
plot_height=400
plot_bg_color='black'
text_color='white'
title_font={'size':16}
axis_font={'size':12}

In [58]:
custom_colors = [
    '#a7b195',  # Pistachio
    '#8e9761',  # Citron
    '#8b914a',  # Conifer
    '#4a693f',  # Meadow
    '#3e5a3a',  # Fern
    '#4c6347',  # Willow
    '#75805a',  # Moss
    '#888e63',  # Nori
    '#898b5e',  # Zucchini
]
category_counts=apps['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=custom_colors,
    width=400,
    height=300
)

# Update layout for dark theme
fig1.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 0.2)',  
    paper_bgcolor='black', 
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)


save_plot_as_html(
    fig=fig1,
    filename="Category Graph 1.html",
    insight="Tools, entertainment, and productivity apps are the most popular types of apps on the Play Store."
)

In [59]:

type_color_palette = ['#8b914a', '#3e5a3a']  # Conifer & Fern

type_counts = apps['Type'].value_counts()

fig2 = px.pie(
    values=type_counts.values,
    names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=type_color_palette,
    width=400,
    height=300
)

fig2.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 0.2)',   # soft glassy white
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    margin=dict(l=10, r=10, t=30, b=10)
)

save_plot_as_html(
    fig2,
    "Type Graph 2.html",
    "Most apps on the Playstore are free, indicating a strategy to attract users first and monetize through ads or in-app purchases."
)


In [60]:
rating_color = ['#8e9761']  # Citron

fig3 = px.histogram(
    apps,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=rating_color,
    width=400,
    height=300
)

fig3.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 0.2)',   
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

save_plot_as_html(
    fig3,
    "Rating Graph 3.html",
    "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users."
)


In [61]:
sentiment_green_palette = ['#a7b195', '#8e9761', '#4c6347', '#75805a', '#8b914a']
sentiment_counts=reviews['Sentiment_Score'].value_counts()
fig4 = px.bar(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index,
    color_discrete_sequence=sentiment_green_palette,
    width=400,
    height=300
)

fig4.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 0.2)',  
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

save_plot_as_html(
    fig4,
    "Sentiment Graph 4.html",
    "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments"
)


In [62]:
install_colors = ['#8b914a', '#4a693f', '#75805a', '#8e9761', '#3e5a3a',
                  '#4c6347', '#a7b195', '#888e63', '#898b5e', '#6d7548']
installs_by_category=apps.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values,
    y=installs_by_category.index,
    orientation='h',
    labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=install_colors,
    width=400,
    height=300
)

fig5.update_layout(
    plot_bgcolor='rgba(255, 255, 255, 0.2)',  
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

save_plot_as_html(
    fig5,
    "Installs Graph 5.html",
    "The categories with the most installs are social and communication apps, reflecting their broad appeal and daily usage."
)


In [63]:
updates_per_year = apps['Last Updated'].dt.year.value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#4a693f'],
    width=400,
    height=300
)
fig6.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
save_plot_as_html(fig6, "Updates Graph 6.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

In [64]:
revenue_by_category=apps.groupby('Category')['Revenue'].sum().nlargest(10)
fig7=px.bar(
    x=installs_by_category.index,
    y=installs_by_category.values,
    labels={'x':'Category','y':'Revenue'},
    title='Revenue by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=400,
    height=300
)
fig7.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)

save_plot_as_html(fig7,"Revenue Graph 7.html","Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential")

In [65]:
genre_counts=apps['Genres'].str.split(';',expand=True).stack().value_counts().nlargest(10)
fig8=px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x':'Genre','y':'Count'},
    title='Top Genres',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=400,
    height=300
)
fig8.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)

save_plot_as_html(fig8,"Genre Graph 8.html","Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games")

In [66]:
type_colors = ['#8b914a', '#4c6347']  

fig9 = px.scatter(
    apps,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=type_colors,
    width=400,
    height=300
)

fig9.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size': 16},
    xaxis=dict(title_font={'size': 12}),
    yaxis=dict(title_font={'size': 12}),
    margin=dict(l=10, r=10, t=30, b=10)
)

save_plot_as_html(
    fig9,
    "Update Graph 9.html",
    "The Scatter Plot shows a weak correlation between the last update and ratings, suggesting that more frequent updates don't always result in better ratings."
)


In [67]:
type_colors = ['#8b914a', '#4c6347']
fig10=px.box(
    apps,
    x='Type',
    y='Rating',
    color='Type',
    title='Rating for Paid vs Free Apps',
    color_discrete_sequence=type_colors,
    width=400,
    height=300
)
fig10.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font_color='white',
    title_font={'size':16},
    xaxis=dict(title_font={'size':12}),
    yaxis=dict(title_font={'size':12}),
    margin=dict(l=10,r=10,t=30,b=10)
)

save_plot_as_html(fig10,"Paid Free Graph 10.html","Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for")

In [68]:
plot_containers_split=plot_containers.split('</div>')

In [69]:
if len(plot_containers_split) > 1:
    final_plot=plot_containers_split[-2]+'</div>'
else:
    final_plot=plot_containers

In [70]:
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family:Times new roman;
            background-color:  #ffffff;
            color: #000000;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color:  #ffffff;
            flex-wrap: wrap;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .header h1 {{
            font-size: 24px;
            margin: 0 10px;
            text-align: center;
            color: #000000;   
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            align-items: flex-start;
            padding: 20px;
            gap: 20px;
        }}
         .plot-container {{
            border: 2px solid #555
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container: hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <h1>Google Play Store Reviews Analytics</h1>
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""


In [71]:
final_html=dashboard_html.format(plots=plot_containers,plot_width=plot_width,plot_height=plot_height)

In [72]:
dashboard_path=os.path.join(html_files_path,"web page.html")

In [73]:
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

In [74]:
webbrowser.open('file://'+os.path.realpath(dashboard_path))

True