In [None]:
%pip install -q google_play_scraper 
%pip install -q transformers 
%pip install -q plotly-express 
%pip install pyyaml

In [2]:
import pandas as pd
import numpy as np
from google_play_scraper import app, Sort, reviews_all
import plotly.express as px

In [3]:
skill_cat_app = reviews_all("com.skillcat", sleep_milliseconds=0, lang='en', country='us', sort=Sort.NEWEST)  


In [None]:
skill_cat_app

In [5]:
dataframe = pd.json_normalize(skill_cat_app)

In [6]:
dataframe.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,0f63e0c3-9565-4c4a-aded-49b511397c03,lewis de ocampo,https://play-lh.googleusercontent.com/a-/ALV-U...,I really like it. But why is it always crashin...,3,0,6.0.1,2024-11-10 22:35:58,,NaT,6.0.1
1,d0379e6a-cb80-4e70-91ba-fe25952f82fb,Andrew Coffey,https://play-lh.googleusercontent.com/a-/ALV-U...,not free,1,0,,2024-11-09 04:07:53,,NaT,
2,116d668e-b0c9-4c01-8196-8eaabe29b906,Salami Saheed,https://play-lh.googleusercontent.com/a/ACg8oc...,I can't imagine how much I have learned about ...,5,0,5.3.4,2024-11-07 19:06:00,,NaT,5.3.4
3,8a2028f3-916c-47bf-a568-267210cae2aa,Bill Fitzgerald,https://play-lh.googleusercontent.com/a/ACg8oc...,awesome app! got 608 cert and other training o...,5,0,6.0.1,2024-11-07 08:59:23,,NaT,6.0.1
4,7f0a3992-49cd-4a40-8345-4259d9ecaf29,dj Ovamarz,https://play-lh.googleusercontent.com/a-/ALV-U...,uhhh,5,0,6.0.1,2024-11-07 00:48:55,,NaT,6.0.1


In [7]:
dataframe['score'].mean()

4.700098328416913

In [8]:
dataframe['reviewCreatedVersion'].value_counts()

reviewCreatedVersion
5.0.3    71
2.2.5    69
2.2.3    60
5.3.4    50
2.3      48
2.1.3    48
1.8      41
1.5.1    39
1.5      37
2.2.2    37
1.7      36
1.9.1    28
2.0.1    28
5.2.1    24
4.8      22
1.6.2    21
6.0.1    19
5.1.1    18
5.3.2    18
2.4      14
1.6      14
4.8.1    13
5.1      13
5.0.2    13
1.9.3    13
1.4      12
1.3.3    11
1.2.2    10
1.3.2    10
1.3.0    10
1.2.3     9
2.1.2     9
2.1.1     9
5.3.3     8
1.9.2     8
2.0       8
5.3.1     8
1.6.1     7
1.4.2     7
4.8.2     6
6.0       4
1.2.0     4
2.1       3
1.2.1     3
5.3       2
1.4.5     1
5.0.1     1
1.0       1
1.0.5     1
0.1.8     1
Name: count, dtype: int64

### We want to run a deep learning pre trained model on top these comments to get a sentiment analysis so that we can find if they are positive or negative

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

sentiment_analysis = pipeline("text-classification", model="siebert/sentiment-roberta-large-english")

In [10]:
print(sentiment_analysis("I love SkillCat!"))

[{'label': 'POSITIVE', 'score': 0.9988139867782593}]


In [39]:
dataframe.dtypes

reviewId                        object
userName                        object
userImage                       object
content                         object
score                          float64
thumbsUpCount                    int64
reviewCreatedVersion            object
at                      datetime64[ns]
replyContent                    object
repliedAt               datetime64[ns]
appVersion                      object
sentiment                       object
result                          object
dtype: object

In [12]:
# Convert the 'content' column to a string
dataframe['content'] = dataframe['content'].astype(str)

In [21]:
# Apply the sentiment analysis pipeline to the 'content' column
dataframe['result'] = dataframe['content'].apply(
    lambda x: sentiment_analysis(x)
)

In [22]:
# Extract the sentiment label from the dictionary
dataframe.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sentiment,result
0,0f63e0c3-9565-4c4a-aded-49b511397c03,lewis de ocampo,https://play-lh.googleusercontent.com/a-/ALV-U...,I really like it. But why is it always crashin...,3,0,6.0.1,2024-11-10 22:35:58,,NaT,6.0.1,"[{'label': 'NEGATIVE', 'score': 0.999343454837...","[{'label': 'NEGATIVE', 'score': 0.999343454837..."
1,d0379e6a-cb80-4e70-91ba-fe25952f82fb,Andrew Coffey,https://play-lh.googleusercontent.com/a-/ALV-U...,not free,1,0,,2024-11-09 04:07:53,,NaT,,"[{'label': 'NEGATIVE', 'score': 0.997162103652...","[{'label': 'NEGATIVE', 'score': 0.997162103652..."
2,116d668e-b0c9-4c01-8196-8eaabe29b906,Salami Saheed,https://play-lh.googleusercontent.com/a/ACg8oc...,I can't imagine how much I have learned about ...,5,0,5.3.4,2024-11-07 19:06:00,,NaT,5.3.4,"[{'label': 'POSITIVE', 'score': 0.998816728591...","[{'label': 'POSITIVE', 'score': 0.998816728591..."
3,8a2028f3-916c-47bf-a568-267210cae2aa,Bill Fitzgerald,https://play-lh.googleusercontent.com/a/ACg8oc...,awesome app! got 608 cert and other training o...,5,0,6.0.1,2024-11-07 08:59:23,,NaT,6.0.1,"[{'label': 'POSITIVE', 'score': 0.998874843120...","[{'label': 'POSITIVE', 'score': 0.998874843120..."
4,7f0a3992-49cd-4a40-8345-4259d9ecaf29,dj Ovamarz,https://play-lh.googleusercontent.com/a-/ALV-U...,uhhh,5,0,6.0.1,2024-11-07 00:48:55,,NaT,6.0.1,"[{'label': 'NEGATIVE', 'score': 0.997138500213...","[{'label': 'NEGATIVE', 'score': 0.997138500213..."


In [23]:
dataframe.shape

(1017, 13)

In [24]:
y = sentiment_analysis("I love SkillCat!")

In [25]:
y[0]['label']

'POSITIVE'

In [26]:
y[0]['score']

0.9988139867782593

In [28]:
dataframe['sentiment'] = dataframe['result'].apply(
    lambda x: x[0]['label']
)
dataframe['score'] = dataframe['result'].apply(
    lambda x: x[0]['score']
)

In [29]:
dataframe.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sentiment,result
0,0f63e0c3-9565-4c4a-aded-49b511397c03,lewis de ocampo,https://play-lh.googleusercontent.com/a-/ALV-U...,I really like it. But why is it always crashin...,0.999343,0,6.0.1,2024-11-10 22:35:58,,NaT,6.0.1,NEGATIVE,"[{'label': 'NEGATIVE', 'score': 0.999343454837..."
1,d0379e6a-cb80-4e70-91ba-fe25952f82fb,Andrew Coffey,https://play-lh.googleusercontent.com/a-/ALV-U...,not free,0.997162,0,,2024-11-09 04:07:53,,NaT,,NEGATIVE,"[{'label': 'NEGATIVE', 'score': 0.997162103652..."
2,116d668e-b0c9-4c01-8196-8eaabe29b906,Salami Saheed,https://play-lh.googleusercontent.com/a/ACg8oc...,I can't imagine how much I have learned about ...,0.998817,0,5.3.4,2024-11-07 19:06:00,,NaT,5.3.4,POSITIVE,"[{'label': 'POSITIVE', 'score': 0.998816728591..."
3,8a2028f3-916c-47bf-a568-267210cae2aa,Bill Fitzgerald,https://play-lh.googleusercontent.com/a/ACg8oc...,awesome app! got 608 cert and other training o...,0.998875,0,6.0.1,2024-11-07 08:59:23,,NaT,6.0.1,POSITIVE,"[{'label': 'POSITIVE', 'score': 0.998874843120..."
4,7f0a3992-49cd-4a40-8345-4259d9ecaf29,dj Ovamarz,https://play-lh.googleusercontent.com/a-/ALV-U...,uhhh,0.997139,0,6.0.1,2024-11-07 00:48:55,,NaT,6.0.1,NEGATIVE,"[{'label': 'NEGATIVE', 'score': 0.997138500213..."


In [31]:
dataframe['score'].mean()

0.998514002051677

In [33]:
dataframe['sentiment'].value_counts(normalize=True)

sentiment
POSITIVE    0.913471
NEGATIVE    0.086529
Name: proportion, dtype: float64

In [None]:
# Draw histogram of the sentiment scores
figure = px.histogram(
    dataframe,
    x='sentiment',
    color='sentiment',
    text_auto=True,
)

figure.show()

In [38]:
# Save the dataframe to a CSV file
dataframe.to_csv('skill_cat_reviews.csv')