# Task 2: Use a grouped bar chart to compare the average rating and total review count for the top 10 app categories by number of installs. Filter out any categories where the average rating is below 4.0 or the size is below 10M, and include only apps whose last update was in January. The graph should be displayed only between 10 AM and 5 PM.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import datetime as dt
from datetime import datetime

In [2]:
# Load the two raw datasets from the working directory:
# the Play Store app listing and the per-app user reviews.
play_store_data = pd.read_csv('Play Store Data.csv')
user_reviews = pd.read_csv('User Reviews.csv')

In [3]:
# Handling missing values: a review without text cannot be scored, so drop it.
user_reviews.isnull().sum()
user_reviews.dropna(subset=['Translated_Review'],inplace=True)

# Check duplicates and remove exact duplicate rows
user_reviews.duplicated().sum()
user_reviews.drop_duplicates(inplace=True)

# check data types
user_reviews.info()

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# The analyzer loads the 'vader_lexicon' resource on construction and raises
# LookupError when it is not installed; fetch it once and retry in that case.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

#Polarity Scores in SIA
#Positive, Negative, Neutral and Compound: -1 - Very negative ; +1 - Very positive
# Keep only the 'compound' score of each review as Sentiment_Score.
user_reviews['Sentiment_Score']=user_reviews['Translated_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])

def categorize_sentiment(score):
    """Map a compound sentiment score to a label.

    Returns 'Negative' for scores strictly below -0.05, 'Positive' for
    scores strictly above 0.05, and 'Neutral' for everything else
    (including the two boundary values themselves).
    """
    if score < -0.05:
        return 'Negative'
    if score > 0.05:
        return 'Positive'
    return 'Neutral'

# Label every review from its compound score. This replaces the values in the
# existing 'Sentiment' column with the labels derived from Sentiment_Score.
user_reviews['Sentiment'] = user_reviews['Sentiment_Score'].map(categorize_sentiment)
user_reviews.head()

<class 'pandas.core.frame.DataFrame'>
Index: 29692 entries, 0 to 64230
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     29692 non-null  object 
 1   Translated_Review       29692 non-null  object 
 2   Sentiment               29692 non-null  object 
 3   Sentiment_Polarity      29692 non-null  float64
 4   Sentiment_Subjectivity  29692 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.4+ MB


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


In [4]:
## check missing values
play_store_data.isnull().sum()

# Apps without a rating cannot contribute to an average rating, so drop them.
play_store_data = play_store_data.dropna(subset=['Rating'])

# Fill the remaining gaps with each column's most frequent value.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# play_store_data[column] operates on a temporary object and is not
# guaranteed to update the frame itself.
for column in play_store_data.columns:
    column_modes = play_store_data[column].mode()
    if not column_modes.empty:  # an all-NaN column has no mode to fill with
        play_store_data[column] = play_store_data[column].fillna(column_modes[0])

play_store_data.duplicated().sum()
play_store_data.drop_duplicates(inplace=True)

# Strip the thousands separators and the trailing '+' from Installs.
# regex=False makes '+' a literal character regardless of the pandas version.
play_store_data['Installs'] = (
    play_store_data['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
)

#Convert Size column
def convert_size(size):
    """Convert an app size value to a float number of megabytes.

    Parameters
    ----------
    size : str or number
        Text such as '19M' (megabytes) or '512k' (kilobytes, divided by
        1024). A plain int/float is treated as an already-converted value
        and returned as a float, so applying the function twice is harmless.

    Returns
    -------
    float
        The size in megabytes, or NaN when the value carries no usable size
        (missing value, text without an 'M'/'k' unit, or text whose numeric
        part cannot be parsed).
    """
    # Already numeric (this also covers a float NaN): pass through unchanged.
    if isinstance(size, (int, float)) and not isinstance(size, bool):
        return float(size)
    # None or any other non-text value has no size information.
    if not isinstance(size, str):
        return np.nan

    text = size.strip()
    try:
        if 'M' in text:
            return float(text.replace('M', ''))
        if 'k' in text:
            return float(text.replace('k', '')) / 1024
    except ValueError:
        # Contains a unit letter but the rest is not a number.
        return np.nan
    return np.nan
# Convert every Size entry to a float number of megabytes (NaN when unknown)
play_store_data['Size']=play_store_data['Size'].apply(convert_size)

# Handle non-numeric values in 'Installs' by setting non-numeric entries to NaN, then convert to float
play_store_data['Installs'] = pd.to_numeric(play_store_data['Installs'], errors='coerce')

# Convert 'Price' to a number. A leading currency sign (e.g. "$4.99") is
# stripped first; otherwise such prices would be coerced to NaN and then
# replaced by 0. Anything still non-numeric (e.g. "Free") is treated as 0.
play_store_data['Price'] = pd.to_numeric(
    play_store_data['Price'].astype(str).str.replace('$', '', regex=False).str.strip(),
    errors='coerce'
).fillna(0)

# Convert Last updated Column to a date format (unparseable dates become NaT)
play_store_data['Last Updated']=pd.to_datetime(play_store_data['Last Updated'],errors='coerce')

#Convert Reviews column to numeric (unparseable counts become NaN)
play_store_data['Reviews']=pd.to_numeric(play_store_data['Reviews'],errors='coerce')

# Keep an independent copy of the cleaned frame
play_store_data1 = play_store_data.copy()

play_store_data.head()


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25.0,50000000.0,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up


In [5]:
# Create columns for month and year
play_store_data['Last_Updated_Month'] = play_store_data['Last Updated'].dt.month_name()
play_store_data['Last_Updated_Year'] = play_store_data['Last Updated'].dt.year

# Filter the data based on the specified conditions.
# The size condition uses the 'Size' column (converted to megabytes earlier),
# not 'Installs': installs are only used below to rank the categories.
# Rows whose size is unknown (NaN) fail the comparison and are excluded.
filtered_data = play_store_data[
    (play_store_data['Rating'] >= 4.0) &  # Rating >= 4.0 per app, so every category average is >= 4.0 too
    (play_store_data['Size'] >= 10) &  # App size >= 10M
    (play_store_data['Last_Updated_Month'] == 'January')  # Last update in January
]

# Group the data by 'Category' and calculate the mean rating, total reviews
# and total installs of the apps that passed the filter
category_grouped = filtered_data.groupby('Category').agg(
    Average_Rating=('Rating', 'mean'),
    Total_Reviews=('Reviews', 'sum'),
    Total_Installs=('Installs', 'sum')
).reset_index()

# Sort by installs and pick the top 10 categories
top_10_categories = category_grouped.sort_values(by='Total_Installs', ascending=False).head(10)

# Ensure the graph only displays between 10 AM to 5 PM.
# datetime.now() is the local clock of the machine running the notebook.
current_time = dt.datetime.now().time()
if dt.time(10, 0) <= current_time <= dt.time(17, 0):
    # Create the figure
    fig2 = go.Figure()

    # Bar for Total Reviews (left axis)
    fig2.add_trace(
        go.Bar(x=top_10_categories['Category'], y=top_10_categories['Total_Reviews'],
               name="Total Reviews", yaxis='y', offsetgroup=1, marker_color='orange')
    )

    # Bar for Average Rating (right axis). Ratings (0-5) and review totals
    # differ by orders of magnitude, so the rating keeps its own axis; the
    # distinct offsetgroup places the two bars side by side per category
    # instead of drawing them on top of each other.
    fig2.add_trace(
        go.Bar(x=top_10_categories['Category'], y=top_10_categories['Average_Rating'],
               name="Average Rating", yaxis='y2', offsetgroup=2, marker_color='blue')
    )

    # Update layout for grouped bars on dual y-axes
    fig2.update_layout(
        title="Average Rating and Total Review Count for Top 10 App Categories by Installs",
        barmode='group',
        xaxis=dict(title="Category"),
        yaxis=dict(title="Total Reviews", side='left'),
        yaxis2=dict(title="Average Rating", overlaying='y', side='right', range=[0, 5]),  # Scaling for rating (0-5)
        legend=dict(x=1.05, y=1, xanchor='left')
    )

    fig2.show()
else:
    print("The graph can only be displayed between 10 AM and 5 PM.")
    