In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk 
import webbrowser
import os
from plotly.subplots import make_subplots,go
import datetime

In [2]:
apps_df = pd.read_csv("Play Store Data.csv")
reviews_df = pd.read_csv("User Reviews.csv")

In [3]:
apps_df = apps_df.dropna(subset=['Rating'])
for col in apps_df.columns:
    apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df=apps_df[apps_df['Rating']<=5]
reviews_df.dropna(subset=['Translated_Review'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)


In [4]:
apps_df['Installs'] = apps_df['Installs'].str.replace(',','').str.replace('+','').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$','').astype(float)

In [5]:
def coverted_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps_df['Size'] = apps_df['Size'].apply(coverted_size)

In [6]:
apps_df['Reviews'] = apps_df['Reviews'].astype(int)

In [7]:
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [8]:
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [9]:
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'],errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [10]:
html_files_path = "./"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 520
plot_height = 520
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

# task 3
 Generate a heatmap to show the correlation matrix between installs, ratings, and review counts. Filter the data to include only apps that have been updated within the last year and have at least 100,000 installs and reviews count should be more than 1k and genres name should not be Starting with characters A , F , E , G , I , K . <b>This Graph should work only between 3 pm to 6 pm</b>


In [12]:
current_hour = datetime.datetime.now().hour
if 15 <= current_hour < 18:
    filtered_df = apps_df[
        (apps_df['Last Updated']>'2018-01-01') & 
        (apps_df['Installs'] >= 100000) & 
        (apps_df['Reviews'] >= 1000) & 
        (~apps_df['Genres'].str.startswith(('A', 'F', 'E', 'G', 'I', 'K')))
    ]
    
    # Create a correlation matrix for Installs, Ratings, and Reviews
    correlation_matrix = filtered_df[['Installs', 'Rating', 'Reviews']].corr()
    
    # Create a heatmap using Plotly
    fig3 = px.imshow(
        correlation_matrix,
        text_auto=True,
        color_continuous_scale='Viridis',
        labels={'color': 'Correlation'},
        title='Correlation Matrix: Installs, Ratings, and Reviews',
        width=plot_width,
        height=plot_height
        
        
    )
    
    # Update layout for better appearance
    fig3.update_layout(
        plot_bgcolor=plot_bg_color,
        paper_bgcolor=plot_bg_color,
        font_color=text_color,
        title_font=title_font,
        width=plot_width,
        height=plot_height,
        margin=dict(l=10, r=10, t=30, b=10)
    )
    
    # Save the plot
    save_plot_as_html(fig3, "correlation_matrix.html", "The heatmap shows the correlation between installs, ratings, and review counts for apps updated in the last year. Filtering for apps with over 100,000 installs and 1,000 reviews, while excluding certain genres, indicates that higher ratings correlate with more installs and reviews")
else:
    print("This plot can only be generated between 3 PM and 6 PM.")

This plot can only be generated between 3 PM and 6 PM.
