In [8]:
import pandas as pd
import numpy as np
import os
import plotly.io as pio
from datetime import datetime
import pytz
import plotly.graph_objects as go
import plotly.express as px

In [9]:
def clean_and_prepare_data(apps_df, reviews_df):
    """Clean the Play Store app table and join it with the user reviews.

    Steps applied to ``apps_df``:
      * drop rows without a ``Rating``, fill remaining gaps with each column's
        mode, drop exact duplicates and rows whose ``Rating`` exceeds 5;
      * convert ``Reviews``, ``Installs`` and ``Price`` to numeric types;
      * convert ``Size`` to megabytes (rows with an unparsable size are dropped);
      * derive ``Revenue`` (``Price * Installs``), ``Month``, ``Year`` and
        ``Android Ver_Major``.

    ``reviews_df`` loses rows lacking ``Translated_Review`` or
    ``Sentiment_Subjectivity`` and is inner-joined to the cleaned apps on
    ``App``.

    Args:
        apps_df: DataFrame with at least the columns ``App``, ``Rating``,
            ``Reviews``, ``Size``, ``Installs``, ``Price``, ``Last Updated``
            and ``Android Ver``.
        reviews_df: DataFrame with at least ``App``, ``Translated_Review`` and
            ``Sentiment_Subjectivity``.

    Returns:
        tuple: ``(apps_df, merged_df)`` - the cleaned app table and the
        app/review join. Neither input frame is modified.
    """
    # Work on an explicit copy so the column assignments below never target a
    # view of the caller's frame (avoids SettingWithCopyWarning).
    apps_df = apps_df.dropna(subset=['Rating']).copy()

    for column in apps_df.columns:
        modes = apps_df[column].mode()
        # mode() is empty for an all-NaN column; leave such a column untouched
        # instead of failing on modes[0].
        if not modes.empty:
            apps_df[column] = apps_df[column].fillna(modes.iloc[0])

    apps_df = apps_df.drop_duplicates()
    apps_df = apps_df[apps_df['Rating'] <= 5].copy()

    # errors='coerce' turns malformed counts into NaN; those rows have to go
    # before the integer cast, which cannot represent NaN.
    apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
    apps_df = apps_df.dropna(subset=['Reviews']).copy()
    apps_df['Reviews'] = apps_df['Reviews'].astype(int)

    # regex=False: ',', '+' and '$' are literal characters here; being explicit
    # keeps the result independent of the pandas-version default.
    installs_text = apps_df['Installs'].astype(str)
    installs_text = installs_text.str.replace(',', '', regex=False).str.replace('+', '', regex=False)
    apps_df['Installs'] = installs_text.astype(int)

    price_text = apps_df['Price'].astype(str).str.replace('$', '', regex=False)
    apps_df['Price'] = price_text.astype(float)

    def convert_size(size):
        """Return the size in megabytes, or NaN when no M/k/G unit is present."""
        text = str(size)
        if 'M' in text:
            return float(text.replace('M', ''))
        elif 'k' in text:
            return float(text.replace('k', '')) / 1024
        elif 'G' in text:
            return float(text.replace('G', '')) * 1024
        else:
            return np.nan

    apps_df['Size'] = apps_df['Size'].apply(convert_size)

    apps_df = apps_df.dropna(subset=['Size']).copy()

    apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

    apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'])
    apps_df['Month'] = apps_df['Last Updated'].dt.month
    apps_df['Year'] = apps_df['Last Updated'].dt.year

    def android_major(version):
        """Return the leading major number of an '... and up' string, else NaN."""
        text = str(version)
        if 'and up' not in text:
            return np.nan
        # Take the first whitespace-separated token before splitting on '.',
        # so a value without a dot ("8 and up") parses as well.
        try:
            return float(text.split()[0].split('.')[0])
        except (ValueError, IndexError):
            return np.nan

    apps_df['Android Ver_Major'] = apps_df['Android Ver'].apply(android_major)

    reviews_df = reviews_df.dropna(subset=['Translated_Review', 'Sentiment_Subjectivity'])

    merged_df = pd.merge(apps_df, reviews_df, on='App', how='inner')

    print("Data cleaning and preparation complete.")
    return apps_df, merged_df

if __name__ == '__main__':
    # Smoke test for clean_and_prepare_data: load both CSVs from the working
    # directory, run the cleaning step and show the resulting schemas.
    try:
        apps_df_raw = pd.read_csv('Play Store Data.csv')
        reviews_df_raw = pd.read_csv('User Reviews.csv')
    except FileNotFoundError:
        print("Please ensure 'Play Store Data.csv' and 'User Reviews.csv' are in the same directory.")
    else:
        apps_df, merged_df = clean_and_prepare_data(apps_df_raw.copy(), reviews_df_raw.copy())
        print("Data processing test successful!")
        # DataFrame.info() writes its report itself and returns None, so it is
        # called directly rather than wrapped in print() (which would emit an
        # extra "None" line).
        apps_df.info()
        merged_df.info()

Data cleaning and preparation complete.
Data processing test successful!
<class 'pandas.core.frame.DataFrame'>
Index: 7424 entries, 0 to 10840
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   App                7424 non-null   object        
 1   Category           7424 non-null   object        
 2   Rating             7424 non-null   float64       
 3   Reviews            7424 non-null   int64         
 4   Size               7424 non-null   float64       
 5   Installs           7424 non-null   int64         
 6   Type               7424 non-null   object        
 7   Price              7424 non-null   float64       
 8   Content Rating     7424 non-null   object        
 9   Genres             7424 non-null   object        
 10  Last Updated       7424 non-null   datetime64[ns]
 11  Current Ver        7424 non-null   object        
 12  Android Ver        7424 non-null   object        

In [10]:
def create_grouped_bar_chart(df):
    """Plot average rating and total reviews for the ten most-reviewed categories.

    The frame is aggregated per ``Category`` (mean ``Rating``, summed
    ``Reviews``), ranked by total reviews and truncated to the first ten rows.
    Average rating is drawn against the left y-axis and total reviews against
    a second y-axis on the right.

    Args:
        df: DataFrame containing ``Category``, ``Rating`` and ``Reviews``.

    Returns:
        plotly.graph_objects.Figure: the grouped bar chart.
    """
    rating_color = 'rgb(102, 194, 165)'
    reviews_color = 'rgb(252, 141, 98)'

    top_categories = df.groupby('Category').agg(
        Average_Rating=('Rating', 'mean'),
        Total_Reviews=('Reviews', 'sum')
    ).sort_values(by='Total_Reviews', ascending=False).head(10).reset_index()

    fig = go.Figure()
    # The two traces live on different y-axes. Without distinct offsetgroup
    # values plotly draws them on top of each other even with barmode='group';
    # separate offset groups place them side by side.
    fig.add_trace(go.Bar(
        x=top_categories['Category'],
        y=top_categories['Average_Rating'],
        name='Average Rating',
        marker_color=rating_color,
        offsetgroup=1
    ))
    fig.add_trace(go.Bar(
        x=top_categories['Category'],
        y=top_categories['Total_Reviews'],
        name='Total Reviews',
        marker_color=reviews_color,
        yaxis='y2',
        offsetgroup=2
    ))

    fig.update_layout(
        title='Average Rating and Total Reviews for Top 10 App Categories',
        xaxis_title='Category',
        yaxis=dict(
            title=dict(
                text='Average Rating',
                font=dict(color=rating_color)
            ),
            tickfont=dict(color=rating_color)
        ),
        yaxis2=dict(
            title=dict(
                text='Total Reviews',
                font=dict(color=reviews_color)
            ),
            overlaying='y',
            side='right',
            tickfont=dict(color=reviews_color)
        ),
        barmode='group',
        plot_bgcolor='white',
        paper_bgcolor='white',
        title_font_size=16,
        width=1000,
        height=500
    )
    return fig

def create_choropleth_map(df):
    """Build an animated world map of summed installs per country.

    Installs are totalled for every (``Country``, ``Category``) pair; each
    category becomes one animation frame of the choropleth.

    Args:
        df: DataFrame containing ``Country``, ``Category`` and ``Installs``.

    Returns:
        The plotly express choropleth figure.
    """
    installs_per_region = (
        df.groupby(['Country', 'Category'])['Installs']
        .sum()
        .reset_index()
    )

    map_options = dict(
        locations='Country',
        locationmode='country names',
        color='Installs',
        hover_name='Country',
        animation_frame='Category',
        color_continuous_scale=px.colors.sequential.Teal,
        title='Global Installs by App Category',
    )
    fig = px.choropleth(installs_per_region, **map_options)

    fig.update_layout(title_font_size=16, width=1000, height=500)
    return fig

def create_dual_axis_chart(df):
    """Chart average installs (bars) and average revenue (line) per category.

    The frame is averaged per (``Category``, ``Type``) pair. Free and Paid
    rows become two bar traces on the left axis; the average revenue of every
    aggregated row is drawn as a line on a second, right-hand axis. The
    function does not restrict the categories itself - the title assumes the
    caller passes in only the categories it wants shown.

    Args:
        df: DataFrame containing ``Category``, ``Type``, ``Installs`` and
            ``Revenue``.

    Returns:
        plotly.graph_objects.Figure: the combined bar/line chart.
    """
    per_category_type = df.groupby(['Category', 'Type']).agg(
        Average_Installs=('Installs', 'mean'),
        Average_Revenue=('Revenue', 'mean')
    ).reset_index()

    fig = go.Figure()

    # One bar trace per app type, Free first and Paid second.
    bar_styles = (('Free', 'mediumseagreen'), ('Paid', 'cadetblue'))
    for app_type, bar_color in bar_styles:
        rows_of_type = per_category_type[per_category_type['Type'] == app_type]
        fig.add_trace(go.Bar(
            x=rows_of_type['Category'],
            y=rows_of_type['Average_Installs'],
            name=f'Average Installs ({app_type})',
            marker_color=bar_color
        ))

    revenue_trace = go.Scatter(
        x=per_category_type['Category'],
        y=per_category_type['Average_Revenue'],
        mode='lines+markers',
        name='Average Revenue',
        line=dict(color='firebrick', width=4),
        yaxis='y2'
    )
    fig.add_trace(revenue_trace)

    secondary_axis = dict(title='Average Revenue', overlaying='y', side='right')
    fig.update_layout(
        title='Average Installs & Revenue for Top 3 Categories',
        xaxis_title='Category',
        yaxis=dict(title='Average Installs'),
        yaxis2=secondary_axis,
        barmode='group',
        plot_bgcolor='white',
        paper_bgcolor='white',
        title_font_size=16,
        width=1000,
        height=500
    )
    return fig

def create_time_series_chart(df):
    """Plot monthly total installs per category and shade growth spikes.

    Installs are summed per (update month, ``Category_Translated``) pair and
    drawn as one line per category. Within each category, every month whose
    total exceeds the previous recorded month's total by more than 20% gets a
    translucent green band one month wide.

    Args:
        df: DataFrame containing a datetime ``Last Updated`` column plus
            ``Category_Translated`` and ``Installs``.

    Returns:
        The plotly express line figure with the growth bands added.
    """
    update_month = df['Last Updated'].dt.to_period('M')
    monthly_installs = (
        df.groupby([update_month, 'Category_Translated'])['Installs']
        .sum()
        .reset_index()
    )
    monthly_installs['Last Updated Month'] = monthly_installs['Last Updated'].dt.to_timestamp()

    axis_labels = {
        'Last Updated Month': 'Date',
        'Installs': 'Total Installs',
        'Category_Translated': 'Category',
    }
    fig = px.line(
        monthly_installs,
        x='Last Updated Month',
        y='Installs',
        color='Category_Translated',
        title='Trend of Total Installs Over Time',
        labels=axis_labels
    )

    for category in monthly_installs['Category_Translated'].unique():
        is_category = monthly_installs['Category_Translated'] == category
        timeline = monthly_installs[is_category].sort_values('Last Updated Month')

        # Percentage change relative to the previous row of this category.
        previous_installs = timeline['Installs'].shift(1)
        growth_pct = (timeline['Installs'] - previous_installs) / previous_installs * 100

        spike_months = timeline.loc[growth_pct > 20, 'Last Updated Month']
        for band_start in spike_months:
            fig.add_vrect(
                x0=band_start,
                x1=band_start + pd.DateOffset(months=1),
                fillcolor='green', opacity=0.1, layer='below', line_width=0,
                annotation_text=">20% Growth", annotation_position="top left",
                annotation_font_size=10, annotation_font_color='green'
            )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        title_font_size=16,
        width=1000,
        height=500
    )
    return fig

def create_bubble_chart(df):
    """Scatter app size against average rating, sizing bubbles by installs.

    Rows are collapsed to one per ``App`` (mean ``Size``, ``Rating`` and
    ``Installs``; first ``Category``). BEAUTY, BUSINESS and DATING receive
    translated display labels; all other categories keep their original name.

    Args:
        df: DataFrame containing ``App``, ``Size``, ``Rating``, ``Installs``
            and ``Category``.

    Returns:
        The plotly express scatter figure.
    """
    per_app = df.groupby('App').agg(
        Size=('Size', 'mean'),
        Rating=('Rating', 'mean'),
        Installs=('Installs', 'mean'),
        Category=('Category', 'first')
    ).reset_index()

    display_names = {'BEAUTY': 'सुंदरता', 'BUSINESS': 'வணிகம்', 'DATING': 'Dating'}
    translated = per_app['Category'].map(display_names)
    # Categories without a translation fall back to their original label.
    per_app['Category_Translated'] = translated.fillna(per_app['Category'])

    # Colours are keyed by the translated label shown in the legend.
    bubble_colors = {
        'GAME': 'rgb(255, 105, 180)',
        'सुंदरता': 'rgb(63, 191, 191)',
        'வணிகம்': 'rgb(191, 63, 191)',
        'Dating': 'rgb(63, 63, 191)',
        'COMICS': 'rgb(191, 191, 63)',
        'COMMUNICATION': 'rgb(191, 63, 63)',
        'ENTERTAINMENT': 'rgb(63, 191, 63)',
        'SOCIAL': 'rgb(191, 63, 127)',
        'EVENTS': 'rgb(127, 63, 191)'
    }

    axis_labels = {
        'Size': 'Size (MB)',
        'Rating': 'Average Rating',
        'Installs': 'Number of Installs',
    }
    fig = px.scatter(
        per_app,
        x='Size',
        y='Rating',
        size='Installs',
        color='Category_Translated',
        hover_name='App',
        title='App Size vs. Average Rating (Installs as Bubble Size)',
        labels=axis_labels,
        color_discrete_map=bubble_colors
    )

    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        title_font_size=16,
        width=1000,
        height=500,
        xaxis=dict(showgrid=True, gridcolor='lightgray'),
        yaxis=dict(showgrid=True, gridcolor='lightgray')
    )
    return fig

In [11]:
def get_current_ist_hour():
    """Return the current hour of day (0-23) in the Asia/Kolkata time zone."""
    return datetime.now(pytz.timezone('Asia/Kolkata')).hour

def is_time_in_range(start_hour, end_hour, current_hour=None):
    """Tell whether an hour falls inside the half-open window [start, end).

    A window whose start is greater than its end wraps past midnight, e.g.
    ``(22, 2)`` covers 22, 23, 0 and 1.

    Args:
        start_hour: First hour included in the window.
        end_hour: First hour after the window (exclusive).
        current_hour: Hour to test. When omitted, the current hour from
            ``get_current_ist_hour()`` is used; passing it explicitly makes
            the check reproducible regardless of the wall clock.

    Returns:
        bool: True when the hour lies inside the window.
    """
    if current_hour is None:
        current_hour = get_current_ist_hour()
    if start_hour <= end_hour:
        return start_hour <= current_hour < end_hour
    # Wrapping window: the hour qualifies on either side of midnight.
    return current_hour >= start_hour or current_hour < end_hour

def generate_plot_html_snippet(plot_title, plot_content, html_file_path):
    """Write a figure to disk as an HTML fragment and return that fragment.

    Args:
        plot_title: Accepted for the caller's convenience; not used here.
        plot_content: Figure handed to ``pio.write_html``.
        html_file_path: Destination path of the fragment file.

    Returns:
        str: The text of the file that was just written.
    """
    # full_html=False emits a fragment rather than a complete page; the
    # plotly.js bundle is referenced from the CDN instead of being inlined.
    pio.write_html(
        plot_content,
        file=html_file_path,
        auto_open=False,
        full_html=False,
        include_plotlyjs='cdn',
    )
    with open(html_file_path, 'r', encoding='utf-8') as snippet_file:
        snippet = snippet_file.read()
    return snippet

# Page shell for the generated dashboard. The text is passed through
# str.format(), so `{plots}` is the single substitution slot and every literal
# CSS brace is doubled (`{{` / `}}`) so it survives formatting.
dashboard_html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Google Play Store Analytics Dashboard</title>
    <script src="https://cdn.tailwindcss.com"></script>
    <style>
        body {{
            font-family: 'Inter', sans-serif;
            background-color: #f3f4f6;
            color: #1f2937;
            padding: 2rem;
        }}
        .dashboard-container {{
            display: flex;
            flex-direction: column;
            gap: 2rem;
        }}
        .plot-card {{
            background-color: white;
            border-radius: 1rem;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            padding: 1.5rem;
            transition: transform 0.2s ease-in-out;
        }}
        .plot-card:hover {{
            transform: translateY(-5px);
        }}
        .plot-title {{
            font-size: 1.25rem;
            font-weight: bold;
            text-align: center;
            margin-bottom: 1rem;
        }}
    </style>
</head>
<body>
    <div class="flex flex-col items-center">
        <h1 class="text-3xl font-bold mb-8 text-gray-800">Play Store Analytics Dashboard</h1>
        <div class="dashboard-container">
            {plots}
        </div>
    </div>
</body>
</html>
"""

def _render_plot_card(card_title, fig, html_path):
    """Write the figure's HTML fragment to html_path and wrap it in plot-card markup."""
    snippet = generate_plot_html_snippet(card_title, fig, html_path)
    return f"""
                <div class="plot-card">
                    <div class="plot-title">{card_title}</div>
                    {snippet}
                </div>
            """


def main():
    """Build the analytics dashboard and write it to ``analyticsdashboard.html``.

    Both CSV files are read from the working directory and cleaned with
    ``clean_and_prepare_data``. Each of the five charts is rendered only while
    the current IST hour lies inside that chart's window (see the
    ``is_time_in_range`` calls below). When no window is active the page
    carries a placeholder message instead.

    Any exception is reported on stdout rather than propagated.
    """
    try:
        apps_df_raw = pd.read_csv('Play Store Data.csv')
        reviews_df_raw = pd.read_csv('User Reviews.csv')

        # Only the cleaned app table is used here; the bubble chart builds its
        # own join against the raw reviews further down.
        apps_df, _ = clean_and_prepare_data(apps_df_raw.copy(), reviews_df_raw.copy())

        plots_dir = './'
        os.makedirs(plots_dir, exist_ok=True)

        plot_htmls = []

        print("Working on Grouped Bar chart")
        if is_time_in_range(15, 17):
            bar_df = apps_df[
                (apps_df['Rating'] >= 4.0) &
                (apps_df['Size'] < 10) &
                (apps_df['Last Updated'].dt.month == 1)
            ]
            fig = create_grouped_bar_chart(bar_df)
            plot_htmls.append(
                _render_plot_card('Grouped Bar Chart', fig, os.path.join(plots_dir, 'plot1.html'))
            )

        print("Working on Choropleth Map")
        if is_time_in_range(18, 20):
            map_df = apps_df[
                (apps_df['Installs'] > 1000000) &
                (~apps_df['Category'].str.startswith(('A', 'C', 'G', 'S')))
            ]
            top_5_categories = map_df.groupby('Category')['Installs'].sum().nlargest(5).index
            # .copy() so the Country column is added to an independent frame
            # and not to a slice of apps_df.
            map_df = map_df[map_df['Category'].isin(top_5_categories)].copy()
            # The data has no country column: countries are assigned at random
            # with the weights below, so the map differs between runs.
            countries = ['United States', 'India', 'Brazil', 'Germany', 'Japan', 'South Korea', 'United Kingdom', 'France']
            map_df['Country'] = np.random.choice(countries, size=len(map_df), p=[0.25, 0.25, 0.1, 0.1, 0.1, 0.1, 0.05, 0.05])
            fig = create_choropleth_map(map_df)
            plot_htmls.append(
                _render_plot_card('Choropleth Map', fig, os.path.join(plots_dir, 'plot2.html'))
            )

        print("Working on Dual Axis-chart")
        if is_time_in_range(13, 14):
            dual_df = apps_df[
                (apps_df['Installs'] >= 10000) &
                (apps_df['Revenue'] >= 10000) &
                (apps_df['Android Ver_Major'] > 4.0) &
                (apps_df['Size'] > 15) &
                (apps_df['Content Rating'] == 'Everyone') &
                (apps_df['App'].str.len() <= 30)
            ]
            top_3_categories = dual_df.groupby('Category')['Installs'].sum().nlargest(3).index
            dual_df = dual_df[dual_df['Category'].isin(top_3_categories)]
            fig = create_dual_axis_chart(dual_df)
            plot_htmls.append(
                _render_plot_card('Dual-Axis Chart', fig, os.path.join(plots_dir, 'plot3.html'))
            )

        print("Working on Time Series Line Chart")
        if is_time_in_range(18, 21):
            # Every condition is evaluated on apps_df. The previous version
            # built the last mask from a frame left over from another chart
            # (or not defined at all), which breaks boolean indexing.
            trend_df = apps_df[
                (~apps_df['App'].str.lower().str.startswith(('x', 'y', 'z'))) &
                (apps_df['Category'].str.startswith(('E', 'C', 'B'))) &
                (apps_df['Reviews'] > 500) &
                (~apps_df['App'].str.lower().str.contains('s'))
            ].copy()
            category_translation = {
                'BEAUTY': 'सुंदरता',
                'BUSINESS': 'வணிகம்',
                'DATING': 'Dating'
            }
            trend_df['Category_Translated'] = trend_df['Category'].map(category_translation).fillna(trend_df['Category'])
            fig = create_time_series_chart(trend_df)
            plot_htmls.append(
                _render_plot_card('Time Series Line Chart', fig, os.path.join(plots_dir, 'plot4.html'))
            )

        print("Working on Bubble Chart")
        if is_time_in_range(17, 19):
            df_with_reviews = pd.merge(apps_df, reviews_df_raw, on='App', how='inner')
            df_with_reviews = df_with_reviews.dropna(subset=['Sentiment_Subjectivity'])
            bubble_df = df_with_reviews[
                (df_with_reviews['Rating'] > 3.5) &
                (df_with_reviews['Category'].isin(['GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS'])) &
                (df_with_reviews['Reviews'] > 500) &
                (~df_with_reviews['App'].str.lower().str.contains('s')) &
                (df_with_reviews['Sentiment_Subjectivity'] > 0.5) &
                (df_with_reviews['Installs'] > 50000)
            ]
            fig = create_bubble_chart(bubble_df)
            plot_htmls.append(
                _render_plot_card('Bubble Chart', fig, os.path.join(plots_dir, 'plot5.html'))
            )

        final_dashboard_html = dashboard_html_template.format(plots='\n'.join(plot_htmls) or "<p class='text-center text-gray-500'>No graphs to display at this time. Please check back during the specified time windows.</p>")

        dashboard_path = os.path.join(plots_dir, 'analyticsdashboard.html')
        with open(dashboard_path, 'w', encoding='utf-8') as f:
            f.write(final_dashboard_html)

        print(f"Dashboard saved to {dashboard_path}")

    except Exception as e:
        # Best-effort entry point: report the failure instead of raising.
        print(f"An error occurred: {e}")

if __name__ == '__main__':
    main()

Data cleaning and preparation complete.
Processing Task 1...
Processing Task 2...
Processing Task 3...
Processing Task 4...
Processing Task 5...
Dashboard saved to ./analyticsdashboard.html
