In [1]:
import pandas as pd
import numpy as np

In [2]:
reviews = pd.read_csv('data/BaSalam.reviews.csv', low_memory=False)

In [8]:
def outliers_number(data: pd.Series, iqr=True,  modified=False, threshold=3, threshold_modified=4.5):
    if iqr:
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        IQR = q3-q1

        return data[(data > q3+IQR*1.5) | (data < q1-IQR*1.5)].index
    else:
        if modified:
            median = data.median()
            mad = np.median(np.abs(data - median))
            
            modified_z_scores = 0.6745 * (data - median) / mad

            return data[np.abs(modified_z_scores) > threshold_modified].index
        else:
            z_score = (data - data.mean()) / data.std()
            return data[np.abs(data) > threshold].index

In [9]:
cols = reviews.columns.tolist()

In [10]:
miss_values = pd.Series([int(reviews[col].isna().sum()) for col in cols])

In [11]:
def outlier_len(iqr, modified):
    outlier_iqr = []
    for col in cols:
        if (type(reviews[col][0]) in [np.int64, np.float64]):
            outlier_iqr.append(len(outliers_number(reviews[col], iqr, modified)))
        else:
            outlier_iqr.append(np.nan)

    return pd.Series(outlier_iqr)

In [12]:
pd.DataFrame(index=reviews.columns.tolist(),
             data={
            'miss value': miss_values.values,
            "type": reviews.dtypes.values,
            'outlier_iqr': outlier_len(True, False).values,
            'outlier_Z-score': outlier_len(False, False).values,
            'outlier_Z-score modified': outlier_len(False, True).values,
            'Most repeated': [reviews[col].value_counts().index[0] for col in cols]
            })

Unnamed: 0,miss value,type,outlier_iqr,outlier_Z-score,outlier_Z-score modified,Most repeated
_id,0,object,,,,661bad526a6e1c5d7e98fd66
productId,0,int64,0.0,3393574.0,0.0,2810984
star,0,int64,735113.0,3046546.0,735113.0,5
user_id,0,int64,0.0,3393574.0,0.0,980257
isPost,0,bool,,,,False
isPublic,0,bool,,,,True
id,0,int64,0.0,3393574.0,0.0,8294857
createdAt,0,object,,,,2021-01-27T11:04:34
updatedAt,0,object,,,,2021-01-27T11:04:34
hashId,0,object,,,,Infinity


In [80]:
drop_cols = [
    'user_id_of_user', 'hashId', 'hash_id_of_user', 'reason_ids[0]',
    'reason_ids[1]', 'reason_ids[2]', 'reason_ids[3]', 'reason_ids[4]',
    'reason_ids[5]', 'reason_ids[6]', 'reason_ids[7]', 'variation_metadata'
]

In [88]:
data = reviews.drop(columns=drop_cols)

In [108]:
reviews['createdAt'] = pd.to_datetime(reviews['createdAt'])
sells_by_date = reviews['createdAt'].dt.date.value_counts().sort_index()
sells_by_date = pd.DataFrame({
    'day': pd.to_datetime(sells_by_date.index),
    'amount': sells_by_date.values
})
sells_by_date

Unnamed: 0,day,amount
0,2018-09-22,1
1,2018-09-23,4
2,2018-09-24,5
3,2018-09-25,11
4,2018-09-26,3
...,...,...
1978,2024-04-05,2739
1979,2024-04-06,3266
1980,2024-04-07,1601
1981,2024-04-08,1028
