In [25]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
reviews = pd.read_csv('../data/BaSalam.reviews.csv', low_memory=False)

In [8]:
def outliers_number(data: pd.Series, iqr=True,  modified=False, threshold=3, threshold_modified=4.5):
    if iqr:
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        IQR = q3-q1

        return data[(data > q3+IQR*1.5) | (data < q1-IQR*1.5)].index
    else:
        if modified:
            median = data.median()
            mad = np.median(np.abs(data - median))
            
            modified_z_scores = 0.6745 * (data - median) / mad

            return data[np.abs(modified_z_scores) > threshold_modified].index
        else:
            z_score = (data - data.mean()) / data.std()
            return data[np.abs(data) > threshold].index

In [9]:
cols = reviews.columns.tolist()

In [10]:
miss_values = pd.Series([int(reviews[col].isna().sum()) for col in cols])

In [None]:
reviews.iloc[0]

In [11]:
def outlier_len(iqr, modified):
    outlier_iqr = []
    for col in cols:
        if (type(reviews[col][0]) in [np.int64, np.float64]):
            outlier_iqr.append(len(outliers_number(reviews[col], iqr, modified)))
        else:
            outlier_iqr.append(np.nan)

    return pd.Series(outlier_iqr)

In [None]:
pd.DataFrame(index=reviews.columns.tolist(),
             data={
            'miss value': miss_values.values,
            "type": reviews.dtypes.values,
            'outlier_iqr': outlier_len(True, False).values,
            'outlier_Z-score': outlier_len(False, False).values,
            'outlier_Z-score modified': outlier_len(False, True).values,
            'Most repeated': [reviews[col].value_counts().index[0] for col in cols]
            })

In [3]:
drop_cols = [
    'user_id_of_user', 'hashId', 'hash_id_of_user', 'reason_ids[0]',
    'reason_ids[1]', 'reason_ids[2]', 'reason_ids[3]', 'reason_ids[4]',
    'reason_ids[5]', 'reason_ids[6]', 'reason_ids[7]', 'variation_metadata'
]

In [4]:
data = reviews.drop(columns=drop_cols)

In [5]:
data.columns

Index(['_id', 'productId', 'star', 'user_id', 'isPost', 'isPublic', 'id',
       'createdAt', 'updatedAt', 'isPosted', 'isLikedByCurrentUser',
       'isDislikedByCurrentUser', 'likeCount', 'dislikeCount', 'attachments',
       'history_count', 'name_of_user', 'photo_of_user', 'description'],
      dtype='object')

isPosted
False    3101259
True       46574
Name: count, dtype: int64

In [100]:
star_by_time = data[['star', 'createdAt']].copy()
star_by_time.loc[:, 'createdAt'] = pd.to_datetime(star_by_time['createdAt'])
star_by_time.loc[:, 'star'] = star_by_time['star'].map({
    1:'1',
    2:'1',
    3:'1',
    4:'2',
    5:'3'
})


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '['3' '3' '3' ... '3' '3' '3']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



In [101]:
star_by_time = star_by_time.sort_values(by='createdAt', ascending=False)
star_by_time['season'] = pd.PeriodIndex(star_by_time['createdAt'],freq='Q')

In [103]:
star_by_season = star_by_time.groupby('season')['star'].value_counts(normalize=True).mul(100).reset_index(name='percentage')
star_by_season = star_by_season.sort_values(by=['season', 'star'], ascending=[True, True])

In [104]:
star_by_season['season'] = star_by_season['season'].map(lambda x: str(f"{x.year}-season{x.quarter}"))

In [106]:
fig = px.bar(star_by_season, 
             x='season', 
             y='percentage', 
             color='star', 
             text='percentage')

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')

fig.show()