In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk 
import webbrowser
import os
from plotly.subplots import make_subplots,go

# Load DataSet

In [2]:
apps_df = pd.read_csv("Play Store Data.csv")
reviews_df = pd.read_csv("User Reviews.csv")

In [3]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
print(apps_df.isnull().sum())

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64


# data cleaning

In [5]:
apps_df = apps_df.dropna(subset=['Rating'])
for col in apps_df.columns:
    apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)
apps_df.drop_duplicates(inplace=True)
apps_df=apps_df[apps_df['Rating']<=5]
reviews_df.dropna(subset=['Translated_Review'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  apps_df[col].fillna(apps_df[col].mode()[0],inplace=True)


In [6]:
apps_df['Installs'] = apps_df['Installs'].str.replace(',','').str.replace('+','').astype(int)
apps_df['Price'] = apps_df['Price'].str.replace('$','').astype(float)

In [7]:
def coverted_size(size):
    if 'M' in size:
        return float(size.replace('M',''))
    elif 'k' in size:
        return float(size.replace('k',''))/1024
    else:
        return np.nan
apps_df['Size'] = apps_df['Size'].apply(coverted_size)

In [8]:
apps_df['Reviews'] = apps_df['Reviews'].astype(int)

In [9]:
apps_df['Log_Installs'] = np.log1p(apps_df['Installs'])
apps_df['Log_Reviews'] = np.log1p(apps_df['Reviews'])

In [10]:
def rating_group(rating):
    if rating >= 4:
        return 'Top rated'
    elif rating >= 3:
        return 'Above average'
    elif rating >= 2:
        return 'Average'
    else:
        return 'Below average'

apps_df['Rating_Group'] = apps_df['Rating'].apply(rating_group)

In [11]:
apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'],errors='coerce')
apps_df['Year'] = apps_df['Last Updated'].dt.year

In [12]:
html_files_path = "./"

# Make sure the directory exists
if not os.path.exists(html_files_path):
    os.makedirs(html_files_path)

# Initialize plot_containers
plot_containers = ""

# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

# Define your plots
plot_width = 520
plot_height = 520
plot_bg_color = 'black'
text_color = 'white'
title_font = {'size': 16}
axis_font = {'size': 12}

# Task2
. Create a dual-axis chart comparing the average installs and revenue for free vs. paid apps within the top 3 app categories. Apply filters to exclude apps with fewer than 10,000 installs and revenue below $10,000 and android version should be more than 4.0 as well as size should be more than 15M and content rating should be Everyone and app name should not have more than 30 characters including space and special character

In [15]:
# Create a revenue column for paid apps
apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

# Filter for paid apps only
paid_df = apps_df[apps_df['Type'] == 'Paid']

In [16]:
# Calculate average installs and revenue for free vs. paid apps within the top 3 categories
top_categories = apps_df['Category'].value_counts().nlargest(3).index
filtered_df = apps_df[
    (apps_df['Installs'] > 10000) & 
    (apps_df['Revenue'] > 10000) &
    (apps_df['Android Ver'] > '4.0') & 
    (apps_df['Size'] > 15) &  # Size in bytes
    (apps_df['Content Rating'] == 'Everyone') &
    (apps_df['App'].str.len() <= 30) & 
    (apps_df['Category'].isin(top_categories))
]

# Group by category and type (Free or Paid) to get average installs and revenue
grouped_df = filtered_df.groupby(['Category', 'Type']).agg(
    avg_installs=('Installs', 'mean'),
    avg_revenue=('Revenue', 'mean')
).reset_index()

# Create a dual-axis chart with Plotly
fig2 = make_subplots(specs=[[{"secondary_y": True}]])  # Create dual axis

# Add bar chart for average installs
fig2.add_trace(
    go.Bar(x=grouped_df['Category'], y=grouped_df['avg_installs'], name="Average Installs", marker_color='blue'),
    secondary_y=False,
)

# Add line chart for average revenue
fig2.add_trace(
    go.Scatter(x=grouped_df['Category'], y=grouped_df['avg_revenue'], name="Average Revenue", mode='lines+markers', line=dict(color='green')),
    secondary_y=True
)

# Update layout for better appearance
fig2.update_layout(
    title_text="Average Installs & Revenue for Free vs.Paid Apps Top 3 Categories",
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10, r=10, t=30, b=10),
    width=512,
    height=512 
)

# Add axis titles
fig2.update_xaxes(title_text="App Category")
fig2.update_yaxes(title_text="Average Installs", secondary_y=False)
fig2.update_yaxes(title_text="Average Revenue (USD)", secondary_y=True)

# Save the plot
save_plot_as_html(fig2, "average_installs_revenue.html", "The dual-axis chart shows that, after filtering for installs, revenue, and ratings, paid apps generally generate higher revenue, while free apps attract more installs. This suggests users prefer accessible free apps.")
