In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk 
import webbrowser
import os

In [2]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\arock\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
apps_df = pd.read_csv("Play Store Data.csv")
reviews_df = pd.read_csv("User Reviews.csv")

In [4]:
print(apps_df.isnull().sum())

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64


In [5]:
# data cleaning
apps_df = apps_df.dropna(subset=['Rating'])
for col in apps_df.columns:
    apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df=apps_df[apps_df['Rating']<=5]
reviews_df.dropna(subset=['Translated_Review'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)


In [6]:
print(apps_df.isnull().sum())

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64


In [7]:
reviews_df.isnull().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [8]:
apps_df['Installs'] = apps_df['Installs'].str.replace(',','').str.replace('+','').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$','').astype(float)

In [9]:
# Merge the data set
merge_df = pd.merge(apps_df,reviews_df,on='App',how='inner')

In [10]:
# data Transformation
def coverted_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps_df['Size'] = apps_df['Size'].apply(coverted_size)

In [11]:
apps_df['Reviews'] = apps_df['Reviews'].astype(int)

In [12]:
# Add log_installs and log_reviews columns
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [13]:
# Add Rating Group column
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [14]:
sia = SentimentIntensityAnalyzer()

In [15]:
reviews_df['Sentimate_Score'] = reviews_df['Translated_Review'].apply(lambda x:sia.polarity_scores(str(x))['compound'])

In [16]:
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'],errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [17]:
# Define the path for your HTML files
html_files_path = "./"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 520
plot_height = 520
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

In [18]:
# Create a revenue column for paid apps
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

# Filter for paid apps only
paid_df = apps_df[apps_df['Type'] == 'Paid']


fig1 = px.scatter(
    paid_df,
    x='Installs',
    y='Revenue',
    color='Category',
    title='Revenue vs. Installs for Paid Apps',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    labels={'Installs': 'Number of Installs', 'Revenue': 'Revenue (USD)'},
    width=plot_width,
    trendline='ols',
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10, r=10, t=30, b=10)
)
fig1.update_traces(marker=dict(line=dict(color=text_color, width=1)))
save_plot_as_html(fig1, "Paid_apps.html", "The scatter plot shows that paid apps, especially in Games and Productivity, have a positive correlation between revenue and installs. This indicates users are willing to pay for high-quality apps that offer utility or entertainment.")


In [19]:
from plotly.subplots import make_subplots,go

In [20]:
# Calculate average installs and revenue for free vs. paid apps within the top 3 categories
top_categories = apps_df['Category'].value_counts().nlargest(3).index
filtered_df = apps_df[
    (apps_df['Installs'] > 10000) & 
    (apps_df['Revenue'] > 10000) &
    (apps_df['Android Ver'] > '4.0') & 
    (apps_df['Size'] > 15) &  # Size in bytes
    (apps_df['Content Rating'] == 'Everyone') &
    (apps_df['App'].str.len() <= 30) & 
    (apps_df['Category'].isin(top_categories))
]

# Group by category and type (Free or Paid) to get average installs and revenue
grouped_df = filtered_df.groupby(['Category', 'Type']).agg(
    avg_installs=('Installs', 'mean'),
    avg_revenue=('Revenue', 'mean')
).reset_index()

# Create a dual-axis chart with Plotly
fig2 = make_subplots(specs=[[{"secondary_y": True}]])  # Create dual axis

# Add bar chart for average installs
fig2.add_trace(
    go.Bar(x=grouped_df['Category'], y=grouped_df['avg_installs'], name="Average Installs", marker_color='blue'),
    secondary_y=False,
)

# Add line chart for average revenue
fig2.add_trace(
    go.Scatter(x=grouped_df['Category'], y=grouped_df['avg_revenue'], name="Average Revenue", mode='lines+markers', line=dict(color='green')),
    secondary_y=True
)

# Update layout for better appearance
fig2.update_layout(
    title_text="Average Installs & Revenue for Free vs.Paid Apps Top 3 Categories",
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10),
    width=512,
    height=512 
)

# Add axis titles
fig2.update_xaxes(title_text="App Category")
fig2.update_yaxes(title_text="Average Installs", secondary_y=False)
fig2.update_yaxes(title_text="Average Revenue (USD)", secondary_y=True)

# Save the plot
save_plot_as_html(fig2, "average_installs_revenue.html", "The dual-axis chart shows that, after filtering for installs, revenue, and ratings, paid apps generally generate higher revenue, while free apps attract more installs. This suggests users prefer accessible free apps.")


In [21]:
apps_df['Last Updated'] > '2018-01-01'

0         True
1         True
2         True
3         True
4         True
         ...  
10834    False
10836    False
10837     True
10839    False
10840     True
Name: Last Updated, Length: 8892, dtype: bool

In [22]:
# Filter the data for apps updated in the last year and meeting criteria
import datetime
current_hour = datetime.datetime.now().hour
if 15 <= current_hour < 18:
    filtered_df = apps_df[
        (apps_df['Last Updated']>'2018-01-01') & 
        (apps_df['Installs'] >= 100000) & 
        (apps_df['Reviews'] >= 1000) & 
        (~apps_df['Genres'].str.startswith(('A', 'F', 'E', 'G', 'I', 'K')))
    ]
    
    # Create a correlation matrix for Installs, Ratings, and Reviews
    correlation_matrix = filtered_df[['Installs', 'Rating', 'Reviews']].corr()
    
    # Create a heatmap using Plotly
    fig3 = px.imshow(
        correlation_matrix,
        text_auto=True,
        color_continuous_scale='Viridis',
        labels={'color': 'Correlation'},
        title='Correlation Matrix: Installs, Ratings, and Reviews',
        width=plot_width,
        height=plot_height
        
        
    )
    
    # Update layout for better appearance
    fig3.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font_color=text_color,
        title_font=title_font,
        width=plot_width,
        height=plot_height,
        margin=dict(l=10, r=10, t=30, b=10)
    )
    
    # Save the plot
    save_plot_as_html(fig3, "correlation_matrix.html", "The heatmap shows the correlation between installs, ratings, and review counts for apps updated in the last year. Filtering for apps with over 100,000 installs and 1,000 reviews, while excluding certain genres, indicates that higher ratings correlate with more installs and reviews")
else:
    print("This plot can only be generated between 3 PM and 6 PM.")

This plot can only be generated between 3 PM and 6 PM.


In [23]:
# Split plot_containers to handle the last plot properly
plot_containers_split = plot_containers.split('</div>')
if len(plot_containers_split) > 1:
    final_plot = plot_containers_split[-2] + '</div>'
else:
    final_plot = plot_containers  # Use plot_containers as default if splitting isn't sufficient

In [24]:
# HTML template for the dashboard
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
    body {{
        font-family: Arial, sans-serif;
        background-color: #333;
        color: #fff;
        margin: 0;
        padding: 0;
    }}
    .header {{
        display: flex;
        align-items: center;
        justify-content: center;
        padding: 20px;
        background-color: #444;
    }}
    .header img {{
        margin: 0 10px;
        height: 50px;
    }}
    .container {{
        display: flex;
        flex-wrap: wrap;
        justify-content: center;
        padding: 20px;
    }}
    .plot-container {{
        border: 2px solid #555;
        margin: 10px;
        padding: 10px;
        width: 400px; 
        height: 300px; 
        overflow: hidden;
        position: relative;
        cursor: pointer;
    }}
    .insights {{
        display: none;
        position: absolute;
        right: 10px;
        top: 10px;
        background-color: rgba(0, 0, 0, 0.7);
        padding: 5px;
        border-radius: 5px;
        color: #fff;
    }}
    .plot-container:hover .insights {{
        display: block;
    }}
    h2 {{
        text-align: center;
        margin: 20px 0;
        font-size: 24px;
        color: #fff;
    }}
    /* Media query for tablets and smaller screens */
    @media (max-width: 768px) {{
        .plot-container {{
            width: 80vw; 
            height: 60vh; 
        }}
        .header {{
            flex-direction: column; 
            text-align: center; 
        }}
        .header img {{
            height: 40px; 
        }}
        h2 {{
            font-size: 20px; 
        }}
    }}
    /* Media query for very small screens like mobile phones */
    @media (max-width: 480px) {{
        .plot-container {{
            width: 95vw; 
            height: auto; 
        }}
        .header img {{
            height: 30px; 
        }}
        h2 {{
            font-size: 18px; 
        }}
    }}
</style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""


In [25]:
# Use these containers to fill in your dashboard HTML
final_html = dashboard_html.format(plots=plot_containers, plot_width=plot_width, plot_height=plot_height)

# Save the final dashboard to an HTML file
dashboard_path = os.path.join(html_files_path, "index.html")
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Automatically open the generated HTML file in a web browser
webbrowser.open('file://' + os.path.realpath(dashboard_path))

True