In [67]:
import pandas as pd
import numpy as np
import plotly.express as px
import os

In [68]:
# The dataset directory is supplied through the DATA_PATH environment variable.
data_path = os.getenv('DATA_PATH')
if data_path is None:
    # Without this guard the f-string below would silently build the path
    # 'None/BaSalam.reviews.csv' and fail with a misleading "file not found".
    raise EnvironmentError(
        "The DATA_PATH environment variable is not set; "
        "point it at the directory that contains BaSalam.reviews.csv"
    )

# low_memory=False makes pandas read the file in one pass so that each column
# gets a single inferred dtype instead of chunk-dependent mixed types.
reviews = pd.read_csv(f'{data_path}/BaSalam.reviews.csv', low_memory=False)

In [None]:
# Show the first row of the loaded frame as a Series (one entry per column).
reviews.iloc[0]

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory use.
reviews.info()

In [71]:
# Column labels, in file order; the helpers below iterate over this list.
cols = reviews.columns.to_list()

# Number of missing values per column, stored positionally (default integer
# index) so that entry i belongs to cols[i].
miss_values = pd.Series(reviews.isna().sum().tolist())

def outliers_number(data: pd.Series, iqr=True, modified=False, threshold=3, threshold_modified=4.5):
    """Return the index labels of the outliers found in ``data``.

    Exactly one detection rule is applied, selected by the flags:

    * ``iqr=True`` (default): Tukey fences, i.e. values below
      ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    * ``iqr=False, modified=True``: modified z-score
      ``0.6745 * (x - median) / MAD``; a value is an outlier when the absolute
      score exceeds ``threshold_modified``.
    * ``iqr=False, modified=False``: classic z-score ``(x - mean) / std``;
      a value is an outlier when the absolute score exceeds ``threshold``.

    Missing values never compare as outliers and are ignored when the
    statistics (quantiles, median, MAD, mean, std) are computed.

    Args:
        data: Numeric series to inspect.
        iqr: Use the IQR rule; when True the other flags are ignored.
        modified: With ``iqr=False``, choose the modified z-score over the
            classic z-score.
        threshold: Cut-off for the classic z-score.
        threshold_modified: Cut-off for the modified z-score.

    Returns:
        pandas.Index with the labels of the rows flagged as outliers.
    """
    if iqr:
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        spread = q3 - q1
        lower_fence = q1 - spread * 1.5
        upper_fence = q3 + spread * 1.5
        return data[(data > upper_fence) | (data < lower_fence)].index

    if modified:
        median = data.median()
        # Use the pandas median (which skips NaN) for the MAD as well;
        # np.median would return NaN as soon as one value is missing and
        # no outlier would ever be reported for such a column.
        mad = (data - median).abs().median()

        # When MAD is 0 the score is +/-inf for every value that differs from
        # the median, so all of those values are flagged.
        modified_z_scores = 0.6745 * (data - median) / mad
        return data[np.abs(modified_z_scores) > threshold_modified].index

    # Classic z-score: the threshold applies to the standardised score,
    # not to the raw values.
    z_score = (data - data.mean()) / data.std()
    return data[np.abs(z_score) > threshold].index
        
def outlier_len(iqr, modified):
    """Count the outliers in every column of ``reviews``.

    Each numeric column is passed to ``outliers_number`` with the given
    ``iqr`` / ``modified`` flags; non-numeric columns get ``NaN``.

    Args:
        iqr: Forwarded to ``outliers_number`` (use the IQR rule).
        modified: Forwarded to ``outliers_number`` (modified z-score when
            ``iqr`` is False).

    Returns:
        pandas.Series with one entry per element of ``cols``, in the same
        order and with a default integer index.
    """
    outlier_counts = []
    for col in cols:
        column = reviews[col]
        # Decide from the column dtype instead of inspecting the first value:
        # ``reviews[col][0]`` needs index label 0 to exist and only recognises
        # exactly int64 / float64. Booleans are excluded because quantiles are
        # not meaningful for them.
        is_numeric = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if is_numeric:
            outlier_counts.append(len(outliers_number(column, iqr, modified)))
        else:
            outlier_counts.append(np.nan)

    return pd.Series(outlier_counts)

# Per-column overview: missing values, dtype, outlier counts under the three
# detection rules, and the most frequent value.
pd.DataFrame(
    index=reviews.columns.tolist(),
    data={
        'miss value': miss_values.values,
        "type": reviews.dtypes.values,
        'outlier_iqr': outlier_len(True, False).values,
        'outlier_Z-score': outlier_len(False, False).values,
        'outlier_Z-score modified': outlier_len(False, True).values,
        # value_counts() is empty for a column with no non-null values, so
        # fall back to NaN instead of raising IndexError on ``index[0]``.
        'Most repeated': [
            next(iter(reviews[col].value_counts().index), np.nan)
            for col in cols
        ],
    },
)

In [None]:
# Written reviews (non-null description) of product 1715746, newest update first.
(
    reviews
    .loc[
        reviews['productId'].eq(1715746) & reviews['description'].notna(),
        ['description', 'star', 'updatedAt'],
    ]
    .sort_values(by='updatedAt', ascending=False)
)

In [None]:
from sentiment_analysis import comment_classification

# Spot-check the classifier on a single review: print the predicted label,
# the review text and the star rating the user gave.
id = 4940
review_row = reviews.iloc[id]
print(
    comment_classification(review_row['description']),
    review_row['description'],
    review_row['star'],
)

In [62]:
# Number of reviews (non-null `_id` values) per product, most-reviewed first.
# This cell only computes the table; the CSV export of this frame lives in the
# following cell, so the file is written once per run.
reviews_count_per_product = (
    reviews
    .groupby('productId')['_id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [63]:
# Write the per-product review counts to 'reviews_count_per_product.csv'
# (relative path, row index omitted).
reviews_count_per_product.to_csv('reviews_count_per_product.csv', index=False)

In [None]:
# Bar chart of the review count per product; every bar is labelled with its count.
fig = px.bar(
    data_frame=reviews_count_per_product,
    x='productId',
    y='count',
    text='count',
)

fig.show()

In [35]:
# Random subset of at most 100000 reviews.
# - random_state pins the draw so a fresh "Restart & Run All" sees the same rows.
# - min(...) avoids the ValueError that sample() raises when more rows are
#   requested than the frame contains.
# sample() already returns a new DataFrame, so no extra .copy() is needed.
sample = reviews.sample(n=min(100000, len(reviews)), random_state=42)

In [None]:
# Display the 'history_count' column of the sampled rows.
sample['history_count']

In [79]:
# Columns removed before the analysis below: identifier/hash columns, the
# eight expanded reason_ids[0..7] columns and the variation metadata.
drop_cols = [
    'id',
    'user_id_of_user',
    'hashId',
    'hash_id_of_user',
    *[f'reason_ids[{position}]' for position in range(8)],
    'variation_metadata',
]

In [80]:
# Working subset: at most 100000 randomly chosen reviews without the columns
# listed in drop_cols.
# - random_state makes the subset reproducible across kernel restarts.
# - min(...) keeps sample() from raising when the frame has fewer rows.
# - Sampling first means only the sampled rows are copied when the columns are
#   dropped, instead of the whole frame.
data = (
    reviews
    .sample(n=min(100000, len(reviews)), random_state=42)
    .drop(columns=drop_cols)
)

In [None]:
# Star rating and creation time of every sampled review.
# The columns are replaced with assign() rather than ``.loc[:, col] = ...``:
# since pandas 2.0 the .loc form writes into the existing column in place, so
# 'createdAt' would keep its original dtype and the int -> str change of
# 'star' would trigger an incompatible-dtype warning.
star_by_time = data[['star', 'createdAt']].assign(
    createdAt=lambda frame: pd.to_datetime(frame['createdAt']),
    # Collapse the five ratings into three string buckets: 1-3 -> '1',
    # 4 -> '2', 5 -> '3' (presumably negative / neutral / positive - confirm).
    # Ratings outside 1-5 and missing ratings become NaN.
    star=lambda frame: frame['star'].map({
        1: '1',
        2: '1',
        3: '1',
        4: '2',
        5: '3',
    }),
)

In [82]:
# Newest reviews first, plus a 'season' column holding the calendar quarter
# (pandas Period with freq='Q') of each creation timestamp.
star_by_time = (
    star_by_time
    .sort_values(by='createdAt', ascending=False)
    .assign(season=lambda frame: pd.PeriodIndex(frame['createdAt'], freq='Q'))
)

In [83]:
# Share (in percent) of each star bucket within every quarter, as a long table
# with columns season / star / percentage, ordered by quarter and then bucket.
star_by_season = (
    star_by_time
    .groupby('season')['star']
    .value_counts(normalize=True)
    .mul(100)
    .reset_index(name='percentage')
    .sort_values(by=['season', 'star'])
)

In [84]:
# Turn each quarterly Period into a plot label such as '2023-season1'.
# Values that are no longer Periods are left untouched, so running this cell a
# second time is a no-op instead of failing with AttributeError on the strings
# produced by the first run.
star_by_season['season'] = star_by_season['season'].map(
    lambda x: f"{x.year}-season{x.quarter}" if isinstance(x, pd.Period) else x
)

In [None]:
# Bar chart of the star-bucket percentages per quarter, coloured by bucket.
# Each bar carries its percentage, printed with two decimals above the bar.
fig = px.bar(
    star_by_season, x='season', y='percentage', color='star', text='percentage',
).update_traces(texttemplate='%{text:.2f}%', textposition='outside')

fig.show()