# Google Play Store Analytics 📊
## Training Project and Internship Tasks
**Name: Bhargavi K**
**Email: bargaveek@gmail.com**
**Phn.No: 6380969910**
**Project Title: Google Play Store Analytics**
**Duration: [11.06.2025] – [11.07.2025]**
## 📌Introduction
This project involves analyzing app performance, reviews, and trends on the Google Play Store using Python
and creating an interactive dashboard using Plotly and HTML.
## 🔍Objective
- A training project focused on installing libraries, loading datasets, data cleaning, data transformation, sentiment analysis, advanced visualizations, and dashboard creation.
- An internship task focused on generating a word cloud, creating a dual-axis chart, and plotting a time series graph to discover trends.
## 🛠️Tools & Technologies
Python, Pandas & NumPy, Plotly (for interactive charts), WordCloud, NLTK (for sentiment scores)
## 💡Conclusion
This project helped improve my skills in data cleaning, analysis, sentiment scoring, and interactive dashboard development using real-world datasets.

In [3]:
# TRAINING PROJECT : 10 different plots
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

# Fetch the VADER lexicon used by SentimentIntensityAnalyzer further down.
nltk.download('vader_lexicon')

# Step 1: Load datasets
apps_df = pd.read_csv(r"C:\Users\kaila\Downloads\Play Store Data.csv")
reviews_df = pd.read_csv(r"C:\Users\kaila\Downloads\User Reviews.csv")

# Step 2: Data cleaning
# Rows without a rating are dropped; every other gap is filled with the
# most frequent value of its column.
apps_df = apps_df.dropna(subset=['Rating'])
for column in apps_df.columns:
    mode_val = apps_df[column].mode()[0]
    apps_df[column] = apps_df[column].fillna(mode_val)
apps_df.drop_duplicates(inplace=True)
# Keep only ratings on the 0-5 scale. .copy() gives the filtered frame its
# own data, so the column assignments in Step 3 are not writes onto a slice
# of the previous frame.
apps_df = apps_df[apps_df['Rating'] <= 5].copy()
reviews_df.dropna(subset=['Translated_Review'], inplace=True)

# Merge datasets on 'App'; the inner join drops apps without a match.
# NOTE(review): merged_df is built before Step 3, so it carries the raw
# (unconverted) columns, and it is not referenced again in the code shown.
merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

# Step 3: Data transformation
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
# Strip the '+' and ',' decorations, e.g. "1,000+" -> 1000.
apps_df['Installs'] = apps_df['Installs'].str.replace(r'[+,]', '', regex=True).astype(int)
# regex=False: '$' is a regex anchor, so request a literal replacement
# explicitly instead of relying on the pandas-version-dependent default.
apps_df['Price'] = apps_df['Price'].str.replace('$', '', regex=False).replace('Free', '0').astype(float)

def convert_size(size):
    """Convert a size label to megabytes.

    A label containing 'M' is read as megabytes; a label containing 'k' is
    read as kilobytes and divided by 1024. Missing values and labels with
    neither marker give NaN.
    """
    if pd.isnull(size):
        return np.nan
    # (marker, divisor that brings the number to megabytes); 'M' is tried first.
    for marker, per_mb in (('M', 1), ('k', 1024)):
        if marker in size:
            return float(size.replace(marker, '')) / per_mb
    return np.nan
# Replace the textual size labels with megabyte values.
apps_df['Size'] = apps_df['Size'].apply(convert_size)

# Add Log_Installs and Log_Reviews: log(1 + x) of the corresponding column.
for source_column in ('Installs', 'Reviews'):
    apps_df[f'Log_{source_column}'] = np.log1p(apps_df[source_column])

# Add Rating Group column
def rating_group(rating):
    """Return the rating tier label for a numeric rating.

    >= 4 -> 'Top rated', >= 3 -> 'Above average', >= 2 -> 'Average',
    anything else -> 'Below average'.
    """
    # Lower bounds from highest to lowest; the first one reached wins.
    tiers = ((4, 'Top rated'), (3, 'Above average'), (2, 'Average'))
    for floor, label in tiers:
        if rating >= floor:
            return label
    return 'Below average'
apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

# Revenue column: listed price multiplied by the install count.
apps_df['Revenue'] = apps_df['Price'].mul(apps_df['Installs'])

# Sentiment analysis: store VADER's compound score for every review text.
sia = SentimentIntensityAnalyzer()


def _compound_score(review):
    """Return the 'compound' polarity score of one review (coerced to str)."""
    return sia.polarity_scores(str(review))['compound']


reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(_compound_score)

# Parse 'Last Updated' (unparseable values become NaT) and derive 'Year'.
last_updated = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Last Updated'] = last_updated
apps_df['Year'] = last_updated.dt.year

# Output directory for the HTML files, and the accumulator that collects
# the dashboard cards appended by save_plot_as_html.
html_files_path = "./"
os.makedirs(html_files_path, exist_ok=True)
plot_containers = ""

# Save plot function
def save_plot_as_html(fig, filename, insight):
    """Add *fig* to the dashboard markup and write it to its own HTML file.

    Appends a ``plot-container`` card (the rendered figure plus the
    *insight* text) to the module-level ``plot_containers`` string, then
    writes the figure to ``html_files_path/filename``. The card's ``id`` and
    its ``openPlot`` click target are both *filename*.
    """
    global plot_containers
    target_path = os.path.join(html_files_path, filename)
    # Same rendering options for the embedded card and the standalone file.
    render_options = {'full_html': False, 'include_plotlyjs': 'inline'}
    fragment = pio.to_html(fig, **render_options)
    card = f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{fragment}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    plot_containers = plot_containers + card
    fig.write_html(target_path, **render_options)

# Define plots: shared size and colour settings for all ten figures.
plot_width = 400
plot_height = 300
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}


def _apply_dashboard_theme(fig, *, axis_titles=False, outline=False):
    """Apply the dashboard's dark layout to *fig* and return it.

    Every figure gets the same background, font colour, title font and
    margins. ``axis_titles=True`` additionally sets the axis title font on
    both axes; ``outline=True`` draws a 1px border in the text colour around
    each trace marker.
    """
    layout = dict(
        plot_bgcolor=plot_bg_color, paper_bgcolor=plot_bg_color,
        font_color=text_color, title_font=title_font,
        margin=dict(l=10, r=10, t=30, b=10),
    )
    if axis_titles:
        layout['xaxis'] = dict(title_font=axis_font)
        layout['yaxis'] = dict(title_font=axis_font)
    fig.update_layout(**layout)
    if outline:
        fig.update_traces(marker=dict(line=dict(color=text_color, width=1)))
    return fig


# fig1: ten most frequent categories
category_counts = apps_df['Category'].value_counts().nlargest(10)
fig1 = px.bar(
    x=category_counts.index, y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig1, axis_titles=True, outline=True)
save_plot_as_html(fig1, "category_analysis.html", "The top categories on the Play Store are dominated by tools, entertainment, and productivity apps.This suggests users are looking for apps that either provide utility or offer leisure activities.")

# fig2: share of each app type
type_counts = apps_df['Type'].value_counts()
fig2 = px.pie(
    values=type_counts.values, names=type_counts.index,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=plot_width, height=plot_height
)
fig2.update_traces(textposition='inside', textinfo='percent+label')
_apply_dashboard_theme(fig2)
save_plot_as_html(fig2, "type_analysis.html", "Most apps on the Play Store are free, indicating a strategy to attract users first and monetize through ads or in-app purchases.")

# fig3: rating histogram
fig3 = px.histogram(
    apps_df, x='Rating', nbins=20, title='Rating Distribution',
    color_discrete_sequence=['#636EFA'], width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig3, axis_titles=True)
save_plot_as_html(fig3, "rating_distribution.html", "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

# fig4: review counts per sentiment score, rounded to one decimal
sentiment_counts = reviews_df['Sentiment_Score'].round(1).value_counts().sort_index()
fig4 = px.bar(
    x=sentiment_counts.index, y=sentiment_counts.values,
    labels={'x': 'Sentiment Score', 'y': 'Count'},
    title='Sentiment Distribution',
    color=sentiment_counts.index, color_discrete_sequence=px.colors.sequential.RdPu,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig4, axis_titles=True, outline=True)
save_plot_as_html(fig4, "sentiment_distribution.html", "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")

# fig5: ten categories with the highest total installs (horizontal bars)
installs_by_category = apps_df.groupby('Category')['Installs'].sum().nlargest(10)
fig5 = px.bar(
    x=installs_by_category.values, y=installs_by_category.index,
    orientation='h', labels={'x': 'Installs', 'y': 'Category'},
    title='Installs by Category',
    color=installs_by_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig5, outline=True)
save_plot_as_html(fig5, "installs_by_category.html", "The categories with the most installs are social and communication apps, which reflects their broad appeal and daily usage.")

# fig6: number of apps per 'Last Updated' year
updates_per_year = apps_df['Year'].value_counts().sort_index()
fig6 = px.line(
    x=updates_per_year.index, y=updates_per_year.values,
    labels={'x': 'Year', 'y': 'Number of Updates'},
    title='Number of Updates Over the Years',
    color_discrete_sequence=['#AB63FA'],
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig6)
save_plot_as_html(fig6, "updates_per_year.html", "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.")

# fig7: ten categories with the highest total revenue
revenue_by_category = apps_df.groupby('Category')['Revenue'].sum().nlargest(10)
fig7 = px.bar(
    x=revenue_by_category.index, y=revenue_by_category.values,
    labels={'x': 'Category', 'y': 'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig7, outline=True)
save_plot_as_html(fig7, "revenue_by_category.html", "Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

# fig8: ten most frequent genres; 'Genres' values are split on ';' so each
# part is counted separately
genre_counts = apps_df['Genres'].str.split(';', expand=True).stack().value_counts().nlargest(10)
fig8 = px.bar(
    x=genre_counts.index, y=genre_counts.values,
    labels={'x': 'Genre', 'y': 'Count'},
    title='Top Genres',
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig8, outline=True)
save_plot_as_html(fig8, "genres_counts.html", "Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play games.")

# fig9: rating against last-update date, coloured by app type
fig9 = px.scatter(
    apps_df, x='Last Updated', y='Rating', color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig9)
save_plot_as_html(fig9, "update_on_rating.html", "The scatter plot shows a weak correlation between the last update date and ratings, suggesting that more frequent updates don't always result in better ratings.")

# fig10: rating distribution per app type
fig10 = px.box(
    apps_df, x='Type', y='Rating', color='Type',
    title='Ratings for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    width=plot_width, height=plot_height
)
_apply_dashboard_theme(fig10)
save_plot_as_html(fig10, "ratings_paid_free.html", "Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

# Split the accumulated markup on '</div>' and rebuild the second-to-last
# piece with its closing tag; if there is nothing to split on, fall back to
# the whole string.
# NOTE(review): final_plot is not read anywhere in the code shown.
plot_containers_split = plot_containers.split('</div>')
final_plot = (
    plot_containers_split[-2] + '</div>'
    if len(plot_containers_split) > 1
    else plot_containers
)
    
# Build HTML dashboard.
# This is a plain string used as a str.format() template: {plots} is the only
# placeholder (filled in right after this literal), and every literal CSS/JS
# brace is doubled ({{ }}) so that format() emits a single brace.
# The openPlot() function defined in the <script> element is the click
# handler referenced by the cards that save_plot_as_html generates.
dashboard_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: 400px;
            height: 300px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
    <script>
        function openPlot(filename) {{
            window.open(filename, '_blank');
        }}
    </script>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plots}
    </div>
</body>
</html>
"""
# Save and open the dashboard
from pathlib import Path  # local import: only needed for the file URI below

# Fill the {plots} placeholder of the template with the accumulated cards.
final_html = dashboard_html.format(plots=plot_containers)
dashboard_path = os.path.join(html_files_path, "dashboard.html")

with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(final_html)

# Open the generated file in the default browser. Path.as_uri() builds a
# well-formed file URI (forward slashes, percent-escaped characters), which
# plain 'file://' + path concatenation does not guarantee, e.g. for Windows
# paths or paths containing spaces.
webbrowser.open(Path(dashboard_path).resolve().as_uri())

# INTERNSHIP TASK 1: Generate a word cloud of the most frequent keywords found in 5-star reviews, excluding common stopwords and app names.
# Additionally, filter the reviews to include only those from apps in the "Health & Fitness" category.

# import required libraries
import plotly.express as px
import os
import plotly.io as pio
import webbrowser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import re
import base64
from io import BytesIO

# Load CSV files and trim stray whitespace from the column names.
apps_df = pd.read_csv(r"C:\Users\kaila\Downloads\Play Store Data.csv")
reviews_df = pd.read_csv(r"C:\Users\kaila\Downloads\User Reviews.csv")
for frame in (apps_df, reviews_df):
    frame.columns = frame.columns.str.strip()

# Directory that receives the per-plot HTML files; created when missing.
html_files_path = "./"
os.makedirs(html_files_path, exist_ok=True)

# Start this task with an empty set of dashboard cards.
plot_containers = ""

# Save Plotly figures to HTML
def save_plot_as_html(fig, filename, insight):
    """Add *fig* to the dashboard markup and write it to its own HTML file.

    Appends a ``plot-container`` card (the rendered figure plus the
    *insight* text) to the module-level ``plot_containers`` string, then
    writes the figure to ``html_files_path/filename``. The card's ``id`` and
    its ``openPlot`` click target are both *filename*.
    """
    global plot_containers
    target_path = os.path.join(html_files_path, filename)
    # Same rendering options for the embedded card and the standalone file.
    render_options = {'full_html': False, 'include_plotlyjs': 'inline'}
    fragment = pio.to_html(fig, **render_options)
    card = f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{fragment}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    plot_containers = plot_containers + card
    fig.write_html(target_path, **render_options)

# Generate Word Cloud for Health & Fitness
health_apps = apps_df[apps_df['Category'] == 'HEALTH_AND_FITNESS']
# dropna(): a missing app name has no words to contribute as stopwords.
health_app_names = health_apps['App'].dropna().unique()
health_reviews = reviews_df[reviews_df['App'].isin(health_app_names)]
# NOTE(review): the task text asks for 5-star reviews; this code selects
# reviews whose 'Sentiment' is 'Positive' instead - confirm that this is the
# intended stand-in.
health_reviews_positive = health_reviews[
    (health_reviews['Translated_Review'].notna()) &
    (health_reviews['Sentiment'] == 'Positive')
]

if not health_reviews_positive.empty:
    all_reviews_text = ' '.join(health_reviews_positive['Translated_Review'].astype(str))

    # Exclude common stopwords plus every word that occurs in an app name.
    # Stopwords are compared against individual words of the text, so each
    # app name is split into its words; a name collapsed into one long token
    # (e.g. "homeworkoutnoequipment") would never match anything.
    stopwords = set(STOPWORDS)
    for app in health_app_names:
        stopwords.update(re.findall(r'\w+', str(app).lower()))

    wordcloud = WordCloud(width=800, height=400, background_color='black', stopwords=stopwords).generate(all_reviews_text)

    # Render the word cloud to an in-memory PNG and base64-encode it so it
    # can be embedded in the page as a data URI.
    buffer = BytesIO()
    wordcloud.to_image().save(buffer, format="PNG")
    encoded_wordcloud = base64.b64encode(buffer.getvalue()).decode('utf-8')

    # Add the image card to plot_containers.
    plot_containers += f"""
    <div class="plot-container" id="wordcloud">
        <img src="data:image/png;base64,{encoded_wordcloud}" width="100%" height="100%"/>
        <div class="insights">Most frequent words used in 5-star reviews for Health & Fitness apps.</div>
    </div>
    """

# Build HTML dashboard.
# This is an f-string: {plot_containers} is interpolated immediately, so the
# cards must already be complete at this point. Literal CSS braces are
# doubled ({{ }}) so that they come out as single braces.
dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: 95%;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plot_containers}
    </div>
</body>
</html>
"""

# Save and open the dashboard
from pathlib import Path  # local import: only needed for the file URI below

output_folder = "html_output"
os.makedirs(output_folder, exist_ok=True)
dashboard_path = os.path.join(output_folder, "dashboard.html")

# Write the finished page to disk.
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Open the generated file in the default browser. Path.as_uri() builds a
# well-formed file URI (forward slashes, percent-escaped characters), which
# plain 'file://' + path concatenation does not guarantee, e.g. for Windows
# paths or paths containing spaces.
webbrowser.open(Path(dashboard_path).resolve().as_uri())

# INTERNSHIP TASK 2: Create a dual-axis chart comparing the average installs and revenue for free vs. paid apps within the top 3 app categories.
# Apply filters to exclude apps with fewer than 10,000 installs or revenue below $10,000. In addition, the Android version must be greater than 4.0, the size must be more than 15M, the content rating must be "Everyone", and the app name must not exceed 30 characters (including spaces and special characters).
# This graph should be shown only between 1 PM and 2 PM IST; outside that window it must not appear in the dashboard.

# import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import os
import webbrowser
import datetime
import pytz

# Load data
apps_df = pd.read_csv(r"C:\Users\kaila\Downloads\Play Store Data.csv")

# Data cleaning: discard rows missing any column that the filters below use.
required_columns = ['Installs', 'Size', 'Price', 'Android Ver', 'Content Rating', 'App']
apps_df = apps_df.dropna(subset=required_columns)

# Clean Installs column: remove the literal '+' and ',' characters, then
# convert to integers.
installs_text = apps_df['Installs'].str.replace("+", "", regex=False)
installs_text = installs_text.str.replace(",", "", regex=False)
apps_df['Installs'] = installs_text.astype(int)

# Clean Size column (convert M and K to numbers)
def size_to_mb(size):
    """Convert a size label to megabytes, ignoring letter case.

    The value is converted to an upper-cased string first. A label
    containing 'M' is read as megabytes; one containing 'K' is read as
    kilobytes and divided by 1024. Anything else gives NaN.
    """
    label = str(size).upper()
    # (unit marker, divisor that brings the number to megabytes)
    for unit, divisor in (("M", 1), ("K", 1024)):
        if unit in label:
            return float(label.replace(unit, "")) / divisor
    return np.nan

apps_df['Size_MB'] = apps_df['Size'].apply(size_to_mb)

# Clean Price column: strip the literal '$' and convert to float.
apps_df['Price'] = apps_df['Price'].str.replace("$", "", regex=False).astype(float)

# Revenue column: price * installs for paid apps, 0 for everything else.
is_paid = apps_df['Type'] == 'Paid'
apps_df['Revenue'] = np.where(is_paid, apps_df['Price'] * apps_df['Installs'], 0)

# Clean Android version column: keep the leading "major[.minor]" number as a
# float; values without a number become NaN.
apps_df['Android Ver'] = apps_df['Android Ver'].str.extract(r'(\d+\.?\d*)').astype(float)

# Apply filters: one named mask per requirement, combined with AND.
enough_installs = apps_df['Installs'] >= 10000
enough_revenue = apps_df['Revenue'] >= 10000
recent_android = apps_df['Android Ver'] > 4.0
large_enough = apps_df['Size_MB'] > 15
rated_everyone = apps_df['Content Rating'] == 'Everyone'
short_name = apps_df['App'].str.len() <= 30
filtered_df = apps_df[
    enough_installs & enough_revenue & recent_android
    & large_enough & rated_everyone & short_name
]

# Top 3 categories by total installs among the filtered apps.
installs_per_category = filtered_df.groupby('Category')['Installs'].sum()
ranked_categories = installs_per_category.sort_values(ascending=False)
top_categories = ranked_categories.head(3).index.tolist()

# Keep only rows from those categories.
filtered_df = filtered_df[filtered_df['Category'].isin(top_categories)]

# Average installs and revenue per (Category, Type) pair.
agg_df = filtered_df.groupby(['Category', 'Type'])[['Installs', 'Revenue']].mean().reset_index()

# Current hour in Indian Standard Time, used for the display window check.
ist = pytz.timezone('Asia/Kolkata')
current_time = datetime.datetime.now(ist)
current_hour = current_time.hour

# Start with no dashboard cards.
plot_containers = ""

# The chart is built only while the IST hour is 13 (1 PM up to 2 PM);
# at any other time a notice is shown in its place.
if current_hour != 13:
    plot_containers += """
    <div class="plot-container" id="no_chart">
        <h2 style="color:red; text-align:center; padding:50px;">The chart is only available between 1 PM to 2 PM IST.</h2>
    </div>
    """
else:
    fig = go.Figure()

    for category in top_categories:
        cat_data = agg_df[agg_df['Category'] == category]
        # Both traces of a category share the same "<Type> (<Category>)" labels.
        x_labels = cat_data['Type'] + ' (' + category + ')'

        # Bars: average installs.
        installs_bar = go.Bar(
            x=x_labels,
            y=cat_data['Installs'],
            name=f'{category} - Installs',
            yaxis='y1'
        )
        # Line: average revenue, drawn against the second y-axis.
        revenue_line = go.Scatter(
            x=x_labels,
            y=cat_data['Revenue'],
            name=f'{category} - Revenue',
            yaxis='y2',
            mode='lines+markers'
        )
        fig.add_trace(installs_bar)
        fig.add_trace(revenue_line)

    layout_options = dict(
        title='Average Installs vs Revenue (Free vs Paid) for Top 3 Categories',
        xaxis=dict(title='Type (Free/Paid)'),
        yaxis=dict(title='Average Installs', side='left'),
        yaxis2=dict(title='Average Revenue ($)', overlaying='y', side='right'),
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        template='plotly_dark'
    )
    fig.update_layout(**layout_options)

    # Convert the Plotly figure to an HTML fragment and wrap it in a card.
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    plot_containers += f"""
    <div class="plot-container" id="dual_axis_chart">
        <div class="plot">{html_content}</div>
        <div class="insights">Dual Axis Chart: Installs vs Revenue for Top 3 Categories.</div>
    </div>
    """
# Build HTML dashboard.
# This is an f-string: {plot_containers} is interpolated immediately, so it
# must already hold either the chart card or the "not available" notice.
# Literal CSS braces are doubled ({{ }}) so that they come out as single
# braces.
dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: 95%;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plot_containers}
    </div>
</body>
</html>
"""

# Save and open the dashboard
from pathlib import Path  # local import: only needed for the file URI below

output_folder = "html_output"
os.makedirs(output_folder, exist_ok=True)
dashboard_path = os.path.join(output_folder, "dashboard.html")

# Write the finished page to disk.
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Open the generated file in the default browser. Path.as_uri() builds a
# well-formed file URI (forward slashes, percent-escaped characters), which
# plain 'file://' + path concatenation does not guarantee, e.g. for Windows
# paths or paths containing spaces.
webbrowser.open(Path(dashboard_path).resolve().as_uri())

# INTERNSHIP TASK 3: Plot a time series line chart showing the trend of total installs over time, segmented by app category.
# Highlight periods of significant growth by shading the area under the curve where installs increase by more than 20% month-over-month.
# The app name must not start with "x", "y" or "z", and the app category must start with the letter "E", "C" or "B".
# When shown on the graph, the Beauty category is translated into Hindi, the Business category into Tamil and the Dating category into German.
# Reviews must be more than 500 and the app name must not contain the letter "S". This graph should be shown only between
# 6 PM and 9 PM IST; outside that window it must not appear in the dashboard.

#import required libraries
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
import datetime
import pytz
import os
import webbrowser

# Load data
apps_df = pd.read_csv(r"C:\Users\kaila\Downloads\Play Store Data.csv")
reviews_df = pd.read_csv(r"C:\Users\kaila\Downloads\User Reviews.csv")

# Rows without an app name or category cannot be evaluated by the string
# filters below (they would put NaN into the boolean masks that are negated
# with ~), so they are removed first. Exact duplicate rows are dropped as in
# the training-project cleaning step; otherwise their installs would be
# summed more than once.
apps_df = apps_df.dropna(subset=['App', 'Category']).drop_duplicates()

# Filter apps: the name must not start with x/y/z and must not contain the
# letter "s" (both case-insensitive); the category must start with E, C or B.
app_name = apps_df['App'].str.lower()
name_ok = ~app_name.str.startswith(('x', 'y', 'z')) & ~app_name.str.contains('s', regex=False)
category_ok = apps_df['Category'].str.upper().str.startswith(('E', 'C', 'B'))
# .copy(): the filtered frame gets its own data, so the column assignments
# below are not writes onto a slice of the original frame.
apps_df = apps_df[name_ok & category_ok].copy()

# Keep apps with more than 500 reviews; non-numeric counts become NaN and
# therefore fail the comparison.
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
apps_df = apps_df[apps_df['Reviews'] > 500].copy()

# Translate categories for display on the graph
translations = {
    'BEAUTY': 'सौंदर्य',        # Hindi
    'BUSINESS': 'வணிகம்',      # Tamil
    'DATING': 'Partnersuche'    # German
}
apps_df['Category'] = apps_df['Category'].replace(translations)

# Clean columns: strip '+' and ',' from Installs; unparseable dates become NaT.
apps_df['Installs'] = apps_df['Installs'].str.replace(r'[+,]', '', regex=True).astype(float)
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')

# Merge review counts. The inner join also limits the data to apps that
# appear in the reviews file.
# NOTE(review): Review_Count itself is not read again in the code shown.
review_counts = reviews_df['App'].value_counts().reset_index()
review_counts.columns = ['App', 'Review_Count']
apps_df = pd.merge(apps_df, review_counts, on='App', how='inner')

# Group by month (first day of the 'Last Updated' month) and category.
apps_df['Month'] = apps_df['Last Updated'].dt.to_period('M').dt.to_timestamp()
grouped = apps_df.groupby(['Month', 'Category'])['Installs'].sum().reset_index()
# Growth relative to the previous row of the same category; rows are in
# ascending Month order because groupby sorts its keys.
# NOTE(review): months without data have no row, so the comparison is against
# the previous month that has data, not necessarily the previous calendar month.
grouped['MoM_Growth'] = grouped.groupby('Category')['Installs'].pct_change()
grouped['High_Growth'] = grouped['MoM_Growth'] > 0.2

# Create Plotly chart: two traces per category.
fig = go.Figure()
for category in grouped['Category'].unique():
    df_cat = grouped[grouped['Category'] == category]

    # Visible trend line of total installs.
    trend_line = go.Scatter(
        x=df_cat['Month'], y=df_cat['Installs'],
        mode='lines+markers', name=category
    )
    # Shading trace: installs are kept only for months flagged High_Growth
    # (NaN elsewhere) and filled down to zero with a zero-width line.
    growth_only = df_cat['Installs'].where(df_cat['High_Growth'])
    growth_shade = go.Scatter(
        x=df_cat['Month'],
        y=growth_only,
        mode='lines', fill='tozeroy',
        name=f'{category} >20% MoM Growth',
        line=dict(width=0),
        fillcolor='rgba(255,165,0,0.2)',
        showlegend=False
    )
    fig.add_trace(trend_line)
    fig.add_trace(growth_shade)

chart_layout = dict(
    title='📈 Total Installs Over Time by Category',
    xaxis_title='Month',
    yaxis_title='Total Installs',
    template='plotly_dark',
    height=600
)
fig.update_layout(**chart_layout)

# Time restriction: the chart is shown only from 6 PM up to 9 PM IST.
ist = pytz.timezone("Asia/Kolkata")
now_ist = datetime.datetime.now(ist)
hour_ist = now_ist.hour

# Prepare the container: a notice outside the window, the chart inside it.
plot_containers = ""
if hour_ist not in range(18, 21):
    plot_containers += """
    <div class="plot-container" id="no_chart">
        <h2 style="color:red; text-align:center; padding:50px;">
            The chart is only available between 6 PM to 9 PM IST.
        </h2>
    </div>
    """
else:
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    plot_containers += f"""
    <div class="plot-container" id="time_series_chart">
        <div class="plot">{html_content}</div>
        <div class="insights">Time Series Chart: Installs trend by category with &gt;20% MoM growth shaded.</div>
    </div>
    """

# Dashboard HTML layout.
# This is an f-string: {plot_containers} is interpolated immediately, so it
# must already hold either the chart card or the "not available" notice.
# Literal CSS braces are doubled ({{ }}) so that they come out as single
# braces.
dashboard_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Reviews Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: 95%;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0, 0, 0, 0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
    </style>
</head>
<body>
    <div class="header">
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
        <h1>Google Play Store Reviews Analytics</h1>
        <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
    </div>
    <div class="container">
        {plot_containers}
    </div>
</body>
</html>
"""

# Save and open the dashboard
from pathlib import Path  # local import: only needed for the file URI below

output_folder = "html_output"
os.makedirs(output_folder, exist_ok=True)
dashboard_path = os.path.join(output_folder, "dashboard.html")

# Write the finished page to disk.
with open(dashboard_path, "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Open the generated file in the default browser. Path.as_uri() builds a
# well-formed file URI (forward slashes, percent-escaped characters), which
# plain 'file://' + path concatenation does not guarantee, e.g. for Windows
# paths or paths containing spaces.
webbrowser.open(Path(dashboard_path).resolve().as_uri())


    


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kaila\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True