In [107]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import datetime as dt
from datetime import datetime

In [108]:
import webbrowser
import os
# Folder that receives one standalone HTML file per saved figure.
html_files_path="../Code/HTML_File"
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(html_files_path, exist_ok=True)

In [109]:
# Running HTML string; save_plot_as_html appends one plot container per figure.
plot_containers = str()

In [110]:
# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    """Write a figure to its own HTML file and register it on the dashboard.

    Parameters:
        fig: Plotly figure to export.
        filename: File name (not a path) created inside ``html_files_path``;
            also used as the container's ``id`` attribute.
        insight: Text placed in the container's ``insights`` element. It is
            inserted as-is, so any markup in it is rendered by the browser.

    Side effects:
        Creates/overwrites ``html_files_path/filename`` and appends a
        ``plot-container`` block to the module-level ``plot_containers`` string.
    """
    global plot_containers
    filepath = os.path.join(html_files_path, filename)

    # Render once and reuse the markup for both the standalone file and the
    # dashboard: every render embeds the full plotly.js bundle, so doing it
    # twice per figure is needlessly expensive.
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    with open(filepath, "w", encoding="utf-8") as plot_file:
        plot_file.write(html_content)

    # The dashboard (index.html) is written two directories above
    # html_files_path, so the click-through URL has to be relative to that
    # folder. The previous hard-coded '../Code/HTML_File/...' prefix resolved
    # one level too high from there.
    dashboard_dir = os.path.join(html_files_path, "..", "..")
    plot_url = os.path.relpath(filepath, dashboard_dir).replace(os.sep, "/")

    # Append the plot and its insight to plot_containers (only after the file
    # was written successfully, so the dashboard never links to a missing file)
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{plot_url}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """

In [111]:
# Load the two raw tables (user reviews and app listings) from the Datasets folder
user_reviews = pd.read_csv(filepath_or_buffer='../Datasets/User Reviews.csv')
play_store_data = pd.read_csv(filepath_or_buffer='../Datasets/Play Store Data.csv')

In [112]:
# Handling missing values
# A review without text cannot be scored, so rows lacking Translated_Review are dropped.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
user_reviews = user_reviews.dropna(subset=['Translated_Review'])

# Remove exact duplicate rows; .copy() yields an independent frame so the
# column added below is not written to a slice of the raw table.
user_reviews = user_reviews.drop_duplicates().copy()

# check data types
user_reviews.info()

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# The analyzer loads the 'vader_lexicon' resource and raises LookupError when
# it has not been downloaded yet (e.g. on a fresh environment); fetch it once
# and retry so Restart & Run All works without manual setup.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()
#Polarity Scores in SIA
#Positive, Negative, Neutral and Compound: -1 - Very negative ; +1 - Very positive
# Only the compound score is kept; str() guards against non-string review values.
user_reviews['Sentiment_Score']=user_reviews['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

def categorize_sentiment(score):
    """Translate a compound sentiment score into a label.

    Scores strictly above 0.05 are 'Positive', scores strictly below -0.05 are
    'Negative', and everything else (including the two boundary values) is
    'Neutral'.
    """
    threshold = 0.05
    if score > threshold:
        return 'Positive'
    return 'Negative' if score < -threshold else 'Neutral'

# Label every review from its compound score (this replaces the 'Sentiment'
# column that came with the CSV), then preview the first rows
sentiment_scores = user_reviews['Sentiment_Score']
user_reviews['Sentiment'] = sentiment_scores.map(categorize_sentiment)
user_reviews.head()


<class 'pandas.core.frame.DataFrame'>
Index: 29692 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     29692 non-null  object 
 1   Translated_Review       29692 non-null  object 
 2   Sentiment               29692 non-null  object 
 3   Sentiment_Polarity      29692 non-null  float64
 4   Sentiment_Subjectivity  29692 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.4+ MB


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


In [113]:
## Handle missing values
# Apps without a rating are dropped.
play_store_data = play_store_data.dropna(subset=['Rating'])

# Fill the remaining gaps with each column's most frequent value. The result is
# assigned back to the column: calling fillna(..., inplace=True) on
# play_store_data[column] operates on a temporary object and does not update
# the frame under pandas Copy-on-Write.
for column in play_store_data.columns :
    column_mode = play_store_data[column].mode()
    if not column_mode.empty:  # a column that is entirely NaN has no mode
        play_store_data[column] = play_store_data[column].fillna(column_mode[0])

# Remove exact duplicate rows; .copy() gives an independent frame for the
# column assignments that follow.
play_store_data = play_store_data.drop_duplicates().copy()

# Strip thousands separators and the trailing '+'. regex=False treats both as
# literal characters ('+' is a regex metacharacter) regardless of pandas version.
play_store_data['Installs'] = (
    play_store_data['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)

#Convert Size column
def convert_size(size):
    """Convert a size string to a number of megabytes.

    Parameters:
        size: Size text such as '19M' or '512k'.

    Returns:
        float: the value before 'M' unchanged, or the value before 'k' divided
        by 1024. ``np.nan`` is returned for strings containing neither suffix
        and for non-string input (e.g. a missing value), which previously
        raised TypeError on the ``in`` test.
    """
    if not isinstance(size, str):
        return np.nan
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
# Size becomes a float (see convert_size); unparseable values become NaN
play_store_data['Size']=play_store_data['Size'].apply(convert_size)

# Handle non-numeric values in 'Installs' by setting non-numeric entries to NaN, then convert to float
play_store_data['Installs'] = pd.to_numeric(play_store_data['Installs'], errors='coerce')

# Convert 'Price' to a number. A leading currency symbol is stripped first:
# without that, a value such as '$4.99' would be coerced to NaN and then
# silently turned into 0 by fillna, making paid apps look free.
# Entries that are still non-numeric (e.g. "Free") are treated as 0.
# astype(str) keeps this working if the column was already parsed as numeric.
price_text = play_store_data['Price'].astype(str).str.replace('$', '', regex=False)
play_store_data['Price'] = pd.to_numeric(price_text, errors='coerce').fillna(0)

# Convert Last updated Column to a date format
play_store_data['Last Updated']=pd.to_datetime(play_store_data['Last Updated'],errors='coerce')

#Convert Reviews column to numeric
play_store_data['Reviews']=pd.to_numeric(play_store_data['Reviews'],errors='coerce')

# Independent snapshot of the cleaned table; a later cell reads this copy
# because play_store_data itself is modified further down the notebook.
play_store_data1 = play_store_data.copy()

play_store_data.head()


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25.0,50000000.0,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


Task 1 : Visualize the sentiment distribution (positive, neutral, negative) of user reviews using a stacked bar chart, segmented by rating groups (e.g., 1-2 stars, 3-4 stars, 4-5 stars). Include only apps with more than 1,000 reviews and group by the top 5 categories.

In [114]:
# Rating buckets for pd.cut(): labels first, then the interval edges they map to
rating_labels = ['1-2 stars', '3-4 stars', '4-5 stars']
rating_bins = [0, 2, 4, 5]

# Bucket each app's rating; include_lowest=True keeps a rating equal to the
# first edge inside the first bucket
play_store_data['Rating_Group'] = pd.cut(
    play_store_data['Rating'],
    bins=rating_bins,
    labels=rating_labels,
    include_lowest=True,
)

# Show a few ratings next to their assigned group
rating_preview = play_store_data[['Rating', 'Rating_Group']].head()
print(rating_preview)



   Rating Rating_Group
0     4.1    4-5 stars
1     3.9    3-4 stars
2     4.7    4-5 stars
3     4.5    4-5 stars
4     4.3    4-5 stars


In [115]:
# Keep one listing row per app before joining. Only fully identical rows were
# dropped during cleaning, so the same app name can still occur several times
# (presumably repeated listings of one app — TODO confirm against the data);
# each extra row would otherwise multiply every review of that app in the merge.
# The row with the highest review count is the one kept.
apps_unique = (
    play_store_data
    .sort_values('Reviews', ascending=False)
    .drop_duplicates(subset='App')
)

# Merge the two datasets on 'App' column
merged_data = pd.merge(apps_unique, user_reviews, on='App')

# Filter apps with more than 1,000 reviews
filtered_data = merged_data[merged_data['Reviews'] > 1000]

# Select the top 5 categories based on the number of review rows
top_5_categories = filtered_data['Category'].value_counts().nlargest(5).index
filtered_data = filtered_data[filtered_data['Category'].isin(top_5_categories)]

# Group by Rating_Group, Category, and Sentiment, then count the reviews.
# observed=False is stated explicitly because Rating_Group is categorical;
# unobserved combinations appear as zero-count rows.
grouped_data = (
    filtered_data
    .groupby(['Rating_Group', 'Category', 'Sentiment'], observed=False)
    .size()
    .reset_index(name='TotalReviews')
)

# Create a Plotly Express bar chart (stacked bar)
fig1 = px.bar(
    grouped_data,
    x='Rating_Group',
    y='TotalReviews',
    color='Sentiment',
    facet_col='Category',  # Facet by top 5 categories
    title='Sentiment Distribution of User Reviews (Top 5 Categories) by Rating Groups',
    labels={'TotalReviews': 'Total Reviews', 'Rating_Group': 'Rating Group'},
    barmode='stack',  # Stacked bar for each sentiment
    color_discrete_map={'Positive': 'green', 'Neutral': 'yellow', 'Negative': 'red'}  # Custom colors for sentiments
)

# Rotate the tick labels on every facet's x-axis. Setting xaxis_tickangle in
# update_layout only touches the first facet's axis.
fig1.update_xaxes(tickangle=-45)

# Update layout for the figure
fig1.update_layout(
    height=600,  # Height of the figure
    width=1000,  # Width of the figure
)

# Save plot as a html file
# NOTE(review): re-check this insight text against the chart after the
# one-row-per-app change above.
save_plot_as_html(fig1,"Task 1.html","The graph shows that the majority of user reviews across the top 5 app categories, especially in the GAME category, are overwhelmingly positive, with significantly fewer neutral and negative reviews in all rating groups.")


Task 2 : Use a grouped bar chart to compare the average rating and total review count for the top 10 app categories by number of installs. Filter out any categories where the average rating is below 4.0 and the size is below 10M, and the last update should be in January. This graph should work only between 10 AM and 5 PM.

In [116]:
# Create columns for month and year
play_store_data['Last_Updated_Month'] = play_store_data['Last Updated'].dt.month_name()
play_store_data['Last_Updated_Year'] = play_store_data['Last Updated'].dt.year

# Filter the data based on the specified conditions
filtered_data = play_store_data[
    (play_store_data['Rating'] >= 4.0) &  # Rating >= 4.0
    # The task filters on app size (Size holds megabytes after convert_size).
    # The previous condition compared Installs with 10,000,000 instead.
    (play_store_data['Size'] >= 10) &  # Size >= 10M
    (play_store_data['Last_Updated_Month'] == 'January')  # Last update in January
]

# Group the data by 'Category' and calculate the mean rating and total reviews
category_grouped = filtered_data.groupby('Category').agg(
    Average_Rating=('Rating', 'mean'),
    Total_Reviews=('Reviews', 'sum'),
    Total_Installs=('Installs', 'sum')
).reset_index()

# Sort by installs and pick the top 10 categories
top_10_categories = category_grouped.sort_values(by='Total_Installs', ascending=False).head(10)

# Ensure the graph only displays between 10 AM to 5 PM (local time, both ends inclusive)
current_time = dt.datetime.now().time()
if dt.time(10, 0) <= current_time <= dt.time(17, 0):
    # Create the figure
    fig2 = go.Figure()

    # Bar for Total Reviews
    fig2.add_trace(
        go.Bar(x=top_10_categories['Category'], y=top_10_categories['Total_Reviews'], 
               name="Total Reviews", yaxis='y', marker_color='orange')
    )

    # Line for Average Rating (scaled differently)
    fig2.add_trace(
        go.Scatter(x=top_10_categories['Category'], y=top_10_categories['Average_Rating'], 
                   name="Average Rating", yaxis='y2', marker=dict(color='blue'), mode='lines+markers')
    )

    # Update layout for dual y-axes
    fig2.update_layout(
        title="Average Rating and Total Review Count for Top 10 App Categories by Installs",
        xaxis=dict(title="Category"),
        yaxis=dict(title="Total Reviews", side='left'),
        yaxis2=dict(title="Average Rating", overlaying='y', side='right', range=[0, 5]),  # Scaling for rating (0-5)
        legend=dict(x=1.05, y=1, xanchor='left')
    )

    # Save plot as a html file. This lives inside the time window on purpose:
    # outside it fig2 is never created, so saving there either raises NameError
    # on a fresh kernel or silently re-saves a stale figure from an earlier run.
    # NOTE(review): re-check the insight text after the Size filter correction.
    save_plot_as_html(fig2,"Task 2.html","The Family and Game categories have the highest total reviews, while all top 10 categories maintain an average rating above 4.0.")
else:
    print("The graph can only be displayed between 10 AM and 5 PM.")

The graph can only be displayed between 10 AM and 5 PM.


Task 3 : Generate a heatmap to show the correlation matrix between installs, ratings, and review counts. Filter the data to include only apps that have been updated within the last year, have at least 100,000 installs, and have more than 1k reviews; genre names should not start with the characters A, F, E, G, I, or K. This graph should work only between 3 PM and 6 PM.

In [117]:
# Step 1: Filter apps updated within the year 2018
# NOTE(review): the task says "within the last year"; the window is fixed to
# calendar year 2018 here — confirm that this is the intended reference period.
start_date = datetime(2018, 1, 1)
end_date = datetime(2018, 12, 31)
# 'Last Updated', 'Installs' and 'Reviews' were already converted to
# datetime/numeric in the cleaning cell, so no re-conversion is needed.
# .copy() detaches the subset from play_store_data.
updated_in_window = play_store_data['Last Updated'].between(start_date, end_date)
filtered_data = play_store_data[updated_in_window].copy()

# Step 2: Filter apps with at least 100,000 installs and more than 1,000 reviews
# (strictly greater than 1,000, as the task states)
filtered_data = filtered_data[(filtered_data['Installs'] >= 100000) & (filtered_data['Reviews'] > 1000)]


# Step 3: Filter out genres starting with specific characters
# na=False keeps the mask boolean even if a genre value is missing
filtered_data = filtered_data[~filtered_data['Genres'].str.startswith(('A', 'F', 'E', 'G', 'I', 'K'), na=False)]

# Check if it's between 3 PM and 6 PM. The upper bound is exclusive: with
# "<= 18" the chart stayed available until 18:59.
current_hour = datetime.now().hour
if 15 <= current_hour < 18:
    if not filtered_data.empty:
        # Select relevant columns for the correlation matrix
        corr_data = filtered_data[['Installs', 'Rating', 'Reviews']].corr()

        # Generate the heatmap with Plotly
        fig3 = px.imshow(
            corr_data,
            text_auto=".2f",
            color_continuous_scale="RdBu",
            title="Correlation Matrix between Installs, Ratings, and Review Counts"
        )
        fig3.update_layout(width=600, height=500, title_x=0.5)

        # Save plot as a html file. Only done where fig3 exists: saving after
        # the else-branches raised NameError (or re-saved a stale figure).
        save_plot_as_html(fig3,"Task 3.html","The correlation between Installs and Reviews is good when correlation between Installs and Rating, Rating and Reviews is not so good.")
    else:
        print("No data available after applying the filters.")
else:
    print("This graph can only be displayed between 3 PM and 6 PM.")

Task 4 : Plot a time series line chart to show the trend of total installs over time, segmented by app category. Highlight periods of significant growth by shading the areas under the curve where the increase in installs exceeds 20% month-over-month. The content rating should be Teen, the app name should start with the letter ‘E’, and installs should be more than 10k. This chart should work only between 4 PM and 8 PM.

In [118]:
# Function to check if the current time is between 4 PM and 8 PM
def is_valid_time():
    """Return True from 16:00 up to, but not including, 20:00 local time."""
    current_hour = datetime.now().hour
    # Exclusive upper bound: "<= 20" kept the chart available until 20:59.
    return 16 <= current_hour < 20

# Filter data based on the conditions. The subset gets its own name so that
# play_store_data is not overwritten: re-running earlier cells afterwards would
# otherwise operate on this filtered table.
teen_e_apps = play_store_data[
    (play_store_data['Content Rating'] == 'Teen') &
    (play_store_data['Installs'] > 10000) &
    (play_store_data['App'].str.startswith('E', na=False))
].copy()

# Reduce each update date to the first day of its month. A real timestamp keeps
# the x-axis chronological; string labels such as '2018-1' are placed in order
# of appearance and mix up months across categories.
teen_e_apps['Month_Start'] = teen_e_apps['Last Updated'].dt.to_period('M').dt.to_timestamp()

# Group by category and month, sum the installs, and sort chronologically
# within each category for the growth calculation
category_trends = (
    teen_e_apps
    .groupby(['Category', 'Month_Start'])['Installs']
    .sum()
    .reset_index()
    .sort_values(['Category', 'Month_Start'])
)

# Percentage change relative to the previous row of the same category.
# NOTE(review): months without any update have no row, so "previous" is the
# previous month that has data, not necessarily the preceding calendar month.
category_trends['MoM_Change'] = category_trends.groupby('Category')['Installs'].pct_change()

if is_valid_time():
    fig4 = go.Figure()

    # Plotting the line for each category and shading the regions where MoM_Change exceeds 20%
    for category, category_data in category_trends.groupby('Category', sort=False):
        # Plot the time series line for the category
        fig4.add_trace(go.Scatter(
            x=category_data['Month_Start'],
            y=category_data['Installs'],
            mode='lines',
            name=category,
            line=dict(width=2)
        ))

        # Shade the area under the curve for each step whose growth exceeds 20%,
        # from the previous data point to the point where the jump is observed.
        # Filling only through the growth points themselves would join
        # non-adjacent months and draw nothing for an isolated one.
        month_starts = category_data['Month_Start'].tolist()
        installs = category_data['Installs'].tolist()
        growth = category_data['MoM_Change'].tolist()
        legend_shown = False
        for position in range(1, len(month_starts)):
            # The first row of a category has NaN growth and is skipped by range(1, ...)
            if growth[position] > 0.2:
                fig4.add_trace(go.Scatter(
                    x=[month_starts[position - 1], month_starts[position]],
                    y=[installs[position - 1], installs[position]],
                    mode='none',
                    fill='tozeroy',
                    name=f'{category} >20% Growth',
                    legendgroup=f'{category} >20% Growth',  # one legend entry toggles all segments
                    fillcolor='rgba(0,100,80,0.2)',  # Semi-transparent shading
                    showlegend=not legend_shown,  # list each category's shading once
                    hoverinfo='skip'  # keep the unified hover limited to the lines
                ))
                legend_shown = True

    # Customize the layout
    fig4.update_layout(
        title="Trend of Total Installs Over Time (Apps with 'Teen' Rating, Starting with 'E')",
        xaxis_title="Date",
        yaxis_title="Total Installs",
        legend_title="Category",
        hovermode="x unified",
        height=600
    )

    # Save plot as a html file. Kept inside the time window because fig4 only
    # exists here; saving unconditionally raised NameError on a fresh kernel.
    save_plot_as_html(fig4,"Task 4.html","The graph shows a steady rise in total installs across app categories over time, with notable growth exceeding 20% month-over-month in select categories like Family and Game.")
else:
    print("This chart can only be displayed between 4 PM and 8 PM.")

Task 5 : Create a violin plot to visualize the distribution of ratings for each app category, but only include categories with more than 50 apps; the app name should contain the letter “C”; exclude apps with fewer than 10 reviews; and the rating should be less than 4.0. This graph should not work between 6 PM and 11 PM.

In [119]:
# Check if current time is between 6 PM and 11 PM
def is_valid_time():
    """Return False from 18:00 to 23:00 local time (both inclusive), True otherwise.

    Note: this replaces the same-named helper defined in the Task 4 cell, which
    checks a different time window.
    """
    current_time = datetime.now().time()
    return not (dt.time(18, 0) <= current_time <= dt.time(23, 0))


if is_valid_time():
    #Filter apps containing 'C' (case-insensitive, so 'c' matches as well);
    #na=False keeps the mask boolean if an app name is missing
    filtered_data = play_store_data1[play_store_data1['App'].str.contains('C', case=False, na=False)]

    #Check category counts and filter categories with more than 50 apps
    #(counted among the apps that passed the name filter above)
    category_counts = filtered_data['Category'].value_counts()
    valid_categories = category_counts[category_counts > 50].index
    filtered_data = filtered_data[filtered_data['Category'].isin(valid_categories)]
    
    #Exclude apps with fewer than 10 reviews
    filtered_data = filtered_data[filtered_data['Reviews'] >= 10]

    #Filter apps with ratings less than 4.0
    filtered_data = filtered_data[filtered_data['Rating'] < 4.0]

    # Check if any data is available after filtering
    if not filtered_data.empty:
        # Create a violin plot to visualize the distribution of ratings
        fig5 = px.violin(filtered_data, y='Rating', x='Category', box=True, points='all')

        # Customize layout
        fig5.update_layout(
            title="Distribution of Ratings for Each App Category (Apps Containing 'C', Rating < 4.0)",
            yaxis_title="Rating",
            xaxis_title="Category",
            showlegend=False
        )

        # Save plot as a html file. Only done where fig5 exists: saving after
        # the else-branches raised NameError (or re-saved a stale figure) when
        # the chart was blocked or the filters left no rows.
        save_plot_as_html(fig5,"Task 5.html","The graph shows that apps with names containing 'C' and ratings below 4.0 are distributed fairly evenly across most categories, with noticeable clustering around ratings between 3 and 4.")
    else:
        print("No data available after applying the filters.")
else:
    print("This graph cannot be displayed between 6 PM and 11 PM.")

In [120]:
# Break the accumulated dashboard markup into pieces at every closing div tag
closing_div = '</div>'
plot_containers_split = plot_containers.split(closing_div)

In [121]:
# Take the second-to-last piece of the split markup and re-attach its closing
# tag; when the markup contained no '</div>' at all, use it unchanged.
final_plot = (
    plot_containers_split[-2] + '</div>'
    if len(plot_containers_split) > 1
    else plot_containers
)

In [122]:
# Page template for the dashboard, filled in with str.format():
#   {plots}                      - markup of all plot containers
#   {plot_width} / {plot_height} - pixel size of each container
# Literal CSS/JS braces are doubled so that format() leaves them alone.
# Markup fixes compared with the earlier version: the viewport meta tag is
# well-formed, 'justify-content' uses a hyphen, the 'border' declaration is
# terminated (it used to swallow the 'margin' rule after it), and the hover
# selector has no space after the colon, so insights actually appear on hover.
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width,initial-scale=1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
        </style>
        <script>
            function openPlot(filename) {{
                window.open(filename, '_blank');
                }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [123]:
# Dashboard sizing (pixels) and theme settings
plot_width, plot_height = 1000, 600
plot_bg_color, text_color = 'black', 'white'
title_font = dict(size=16)
axis_font = dict(size=12)

In [124]:
# Fill the page template with the collected plot markup and container size
final_html = dashboard_html.format(
    plot_width=plot_width,
    plot_height=plot_height,
    plots=plot_containers,
)

In [125]:
# index.html is placed two directories above the folder holding the plot files
index_relative_to_plots = "../../index.html"
dashboard_path = os.path.join(html_files_path, index_relative_to_plots)

In [126]:
# Write the assembled dashboard page to disk as UTF-8 text
with open(dashboard_path, mode="w", encoding="utf-8") as dashboard_file:
    dashboard_file.write(final_html)

In [127]:
# Open the generated dashboard in the default browser via its resolved file path
dashboard_url = 'file://' + os.path.realpath(dashboard_path)
webbrowser.open(dashboard_url)

True