In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from tqdm import tqdm
from hazm import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import re
import emoji
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

from langdetect import detect, DetectorFactory
import langid


In [None]:
# Load the raw reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/BaSalam.reviews.csv', low_memory=True, encoding='utf-8')

In [None]:
# Summarize the loaded frame (columns, dtypes, non-null counts).
df.info()

In [5]:
# Persian stop words that preprocessing() drops from tokenized comments.
# The list contains a few repeated entries; they are harmless because the
# result is a set.
# NOTE(review): some entries contain a zero-width non-joiner (U+200C).
# preprocessing() replaces that character with a space, so those entries
# presumably never match a produced token — confirm before relying on them.
STOPWORDS = set([
    "از", "به", "در", "با", "که", "را", "تا", "و", "یا", "اما", "اگر", "برای", "بر",
    "این", "آن", "یک", "هر", "هم", "همه", "چند", "چنین", "دیگر", "چون", "مثل",
    "مانند", "چرا", "زیرا", "ولی", "آیا", "اگرچه", "لذا", "نیز", "باید", "می",
    "باشد", "است", "بود", "هست", "شد", "شو", "باش", "کرد", "کن", "کند", "کرده",
    "شده", "می‌شود", "خواهد", "خواهند", "خواهی", "خواهیم", "توان", "تواند",
    "توانند", "توانست", "توانسته", "بوده", "بودند", "باشند", "هستند", "دارم", "داری", "دارد", "دارند", "داریم", "داشت",
    "داشتند", "داشته", "داشتم", "ای", "ایم", "اید", "اند", "ام", "ت", "ها", "های", "هایی",
    "شان", "ش", "مان", "تان", "اینها", "آنها", "چیز", "چیزی", "چرا", "چه", "که",
    "کدام", "چگونه", "چقدر", "چراکه", "آنان", "او", "آن", "ایشان", "ما", "شما",
    "آنچه", "آنجا", "اینجا", "اینجاست", "آنجاست", "همان", "خود", "همه‌اش",
    "هیچ", "هیچ‌کدام", "هرگز", "هیچگاه", "حالا", "اکنون", "دیروز", "امروز",
    "فردا", "شب", "روز", "بعد", "قبل", "ساعت", "وقت", "زمان", "چندین", "بار",
    "کم", "بیشتر", "کمتر", "حتی", "فقط", "تنها", "بالا", "پایین", "روی", "زیر",
    "جلو", "پشت", "نزدیک", "دور", "وسط", "بیرون", "درون", "داخل", "کنار",
    "اینجا", "آنجا", "هیچ‌جا", "هرجا", "هرکجا", "جا", "مکان", "محل", "چپ", "راست",
    "بعدا", "سپس", "آنگاه", "دیگر", "چیزهای", "یعنی", "خب", "آره", "نه", "باشه",
    "آها", "بله", "نمیدانم", "کسی", "دیگری", "هیچ‌کسی", "چیزها"
])

In [6]:
def is_sticker(token):
    """Return True when ``token`` is an image file name, an emoji, or a link.

    The three checks run in this order and stop at the first hit:
      1. the token ends in ``.webp``, ``.png``, ``.gif`` or ``.jpg``;
      2. ``emoji.is_emoji`` recognizes the token;
      3. the token starts with ``http://`` or ``https://``.
    """
    return bool(
        re.match(r'.*\.(webp|png|gif|jpg)$', token)   # image/sticker file name
        or emoji.is_emoji(token)                      # single emoji
        or re.match(r'https?://[^\s]+', token)        # link
    )

In [7]:
def preprocessing(comment):
    """Clean a raw comment and return it as a space-joined string of tokens.

    Steps, in order:
      1. replace emojis with a space;
      2. replace ``http(s)://`` and ``www.`` links with a space;
      3. replace every character that is neither a word character nor
         whitespace with a space;
      4. replace runs of digits with a space;
      5. tokenize with ``word_tokenize``, lower-case each token, and drop the
         tokens that are in ``STOPWORDS``, are numeric, or are flagged by
         ``is_sticker``.

    Args:
        comment: the raw comment text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces (possibly ``''``).
    """
    # The previous version built an unused Normalizer() and Stemmer() on
    # every call; they never touched the text, so they are gone.

    # Remove emojis
    comment = emoji.replace_emoji(comment, replace=" ")
    # Remove links
    comment = re.sub(r'https?://\S+|www\.\S+', ' ', comment)
    # Remove punctuation
    comment = re.sub(r'[^\w\s]', ' ', comment)
    # Remove numbers
    comment = re.sub(r'\d+', ' ', comment)

    filtered = []
    for token in word_tokenize(comment):
        token = str(token).lower()
        # Zero-width (non-)joiners / spaces inside a token become plain spaces.
        token = re.sub(r'[\u200c\u200b\u200d]', ' ', token)
        # NOTE(review): dots, emojis and links were already stripped above, so
        # is_sticker() presumably rarely fires here — confirm whether the
        # check should run before the punctuation step instead.
        if token in STOPWORDS or token.isdigit() or is_sticker(token):
            continue
        filtered.append(token)
    return ' '.join(filtered)

In [8]:
def tokenize(text):
    """Split ``text`` into tokens by delegating to ``word_tokenize``.

    Exists as a plain module-level function so it can be passed as the
    ``tokenizer`` callable of a vectorizer.
    """
    tokens = word_tokenize(text)
    return tokens

In [9]:
def detect_language(text):
    """Return the language code that ``langid.classify`` assigns to ``text``.

    ``langid.classify`` yields a ``(language, score)`` pair; only the
    language part is returned and the score is discarded.
    """
    language_code, _score = langid.classify(text)
    return language_code

In [81]:
class Clustering:
    '''
    Cluster free-text comments and visualize the result in 3D.

    The comments are cleaned, turned into TF-IDF vectors and clustered with
    KMeans and/or DBSCAN. The star column is not a clustering feature: it is
    used to balance the sample and as an optional color / hover field in plots.

    Typical call order: ``preprocess()`` -> ``vectorize()`` ->
    ``kmeans_clustering()`` / ``dbscan_clustering()`` -> ``plot_3d()``.
    '''
    def __init__(self, df: pd.DataFrame, comments_col: str, star_col: str):
        '''
        df: source frame; it is stored as-is and never modified.
        comments_col: name of the column holding the comment text.
        star_col: name of the column holding the star rating.
        '''
        self.df = df
        # Work on an independent copy so that the columns added later never
        # write into (or warn about writing into) the caller's frame.
        self.data = self.df[[comments_col, star_col]].copy()
        self.comments_col = comments_col
        self.star_col = star_col
        # Placeholder; no method of this class currently fills it.
        self.english_comments = None


    def preprocess(self, ignore_eng: bool=True, sample:bool=False,
                    balance_stars:bool=False, sample_size:int=10000, random_state:int=42, n_samples_per_star:int=1000):
        
        '''
        Filter, optionally sample, and clean the comments in ``self.data``.

        ignore_eng: keep only the comments that ``detect_language`` labels 'fa'.
        sample: when True, work on a random sample instead of all rows.
        balance_stars: with ``sample``, draw up to ``n_samples_per_star`` rows
            for every star value instead of one global sample.
        sample_size: size of the global sample (capped at the available rows).
        random_state: seed passed to every ``sample`` call.
        n_samples_per_star: per-star cap used when ``balance_stars`` is True.

        Side effects: replaces ``self.data`` with the filtered frame, adds the
        columns 'len_comment_before_preprocessing', 'language' (only when
        ``ignore_eng``), 'cleaned_comment' and 'len_comment_after_preprocessing',
        and shows a bar chart of the star distribution.
        '''
        has_comment_and_star = self.data[self.comments_col].notna() & self.data[self.star_col].notna()
        self.data = self.data[has_comment_and_star]

        if sample:
            if balance_stars:
                # Sample each star group on its own and concatenate. This keeps
                # the star column without relying on groupby.apply handing the
                # grouping column to the applied function.
                per_star = [
                    group.sample(n=min(n_samples_per_star, len(group)), random_state=random_state)
                    for _, group in self.data.groupby(self.star_col)
                ]
                if per_star:
                    self.data = pd.concat(per_star)
            else:
                # Cap the request so a small frame does not make sample() raise.
                self.data = self.data.sample(n=min(sample_size, len(self.data)), random_state=random_state)

        # Explicit copies: every frame below gets new columns assigned.
        self.data = self.data.copy()
        self.data['len_comment_before_preprocessing'] = self.data[self.comments_col].apply(word_tokenize).map(len)
        # Keep only comments with more than 5 tokens.
        self.data = self.data[self.data['len_comment_before_preprocessing'] > 5].copy()

        if ignore_eng:
            # Drop every comment that is not detected as Persian.
            self.data['language'] = self.data[self.comments_col].apply(detect_language)
            self.data = self.data[self.data['language'] == 'fa'].copy()
        # When ignore_eng is False, non-Persian comments are kept unchanged.

        # Plot the distribution of the remaining comments over the star values.
        star_counts = self.data.groupby(self.star_col)[self.star_col].count().sort_index()

        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=star_counts.index.tolist(),
            y=star_counts.tolist(),
            text=star_counts.tolist(),
            textposition='auto'
        ))

        fig.update_layout(
            title_text='Distribution of star within comments',
            xaxis_title_text='Rate',
            yaxis_title_text='Frequency',
            bargap=0.2,
            bargroupgap=0.2)

        fig.show()

        self.data['cleaned_comment'] = self.data[self.comments_col].apply(preprocessing)
        self.data['len_comment_after_preprocessing'] = self.data['cleaned_comment'].apply(word_tokenize).map(len)

    def vectorize(self):
        ''' 
        Build the TF-IDF matrix of ``self.data['cleaned_comment']``.

        Stores the fitted pipeline in ``self.vectorizer_pipeline`` and the
        resulting matrix in ``self.vectorized_comments``. Requires
        ``preprocess()`` to have created the 'cleaned_comment' column.
        '''
        self.vectorizer_pipeline = Pipeline([
            # Word 1- to 3-gram counts, tokenized by the module-level tokenize().
            ('vect', CountVectorizer(tokenizer=tokenize, analyzer='word', ngram_range=(1, 3), min_df=1, lowercase=False)),
            ('tfidf', TfidfTransformer(sublinear_tf=True))
            ])
        self.vectorized_comments = self.vectorizer_pipeline.fit_transform(self.data['cleaned_comment'])
    
    def kmeans_clustering(self, elbow:bool=False, n_clusters:int=4):
        '''
        Cluster the vectorized comments with KMeans.

        elbow: when True, first fit KMeans for k = 2..6 and plot the
            within-cluster sum of squares against k (elbow plot).
        n_clusters: number of clusters of the model that is kept.

        In both modes the kept model is stored in ``self.kmeans`` and its
        labels in ``self.data['cluster_kmeans']``.
        '''
        if elbow:
            # Elbow diagnostic. A dedicated loop variable is used so the
            # caller's n_clusters is not overwritten by the last candidate.
            cluster_range = range(2, 7)
            wcss = []

            for k in cluster_range:
                candidate = KMeans(n_clusters=k, random_state=42)
                candidate.fit(self.vectorized_comments)
                wcss.append(candidate.inertia_)

            # Plot the Elbow
            plt.figure(figsize=(8, 5))
            plt.plot(cluster_range, wcss, marker='o', linestyle='-', color='b')
            plt.title('Elbow Method for Optimal Number of Clusters')
            plt.xlabel('Number of Clusters')
            plt.ylabel('Within-Cluster-Sum of Squared Errors (WCSS)')
            plt.xticks(cluster_range)
            plt.grid(True)
            plt.show()

        # The labels always come from the requested number of clusters.
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.kmeans.fit(self.vectorized_comments)
        self.data['cluster_kmeans'] = self.kmeans.predict(self.vectorized_comments)
    
    def dbscan_clustering(self, eps:float=0.5, min_samples:int=50, metric:str='cosine'):
        '''
        Cluster the vectorized comments with DBSCAN.

        eps, min_samples, metric: forwarded unchanged to ``DBSCAN``.

        The labels are stored in ``self.data['cluster_dbscan']``.
        '''
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
        self.data['cluster_dbscan'] = dbscan.fit_predict(self.vectorized_comments)
            
    def plot_3d(self, hover_data:list=None, color:str='cluster_kmeans', pca:bool=False, random_state:int=42):
        '''
        Project the TF-IDF vectors to 3 components and show a 3D scatter plot.

        hover_data: columns of ``self.data`` shown on hover; defaults to the
            cleaned comment and the star column.
        color: column of ``self.data`` used for coloring, e.g. the star column,
            'cluster_kmeans' or 'cluster_dbscan'.
        pca: use PCA when True, TruncatedSVD otherwise.
        random_state: seed of the projection, so repeated calls agree.

        Side effects: overwrites the 'x', 'y' and 'z' columns of ``self.data``.
        '''
        # Reduce TF-IDF vectors to 3 dimensions
        if pca:
            # NOTE(review): PCA may reject sparse input on some scikit-learn
            # versions — confirm for the installed version.
            reducer = PCA(n_components=3, random_state=random_state)
        else:
            reducer = TruncatedSVD(n_components=3, random_state=random_state)
        reduced_data = reducer.fit_transform(self.vectorized_comments)

        self.data['x'] = reduced_data[:, 0]
        self.data['y'] = reduced_data[:, 1]
        self.data['z'] = reduced_data[:, 2]

        # Honor the caller's hover columns; fall back to comment + star.
        hover_columns = hover_data if hover_data is not None else ['cleaned_comment', self.star_col]

        #  3D scatter plot
        fig = px.scatter_3d(self.data, x='x', y='y', z='z', color=color,
                            labels={'x': 'Component 1', 'y': 'Component 2', 'z': 'Component 3'},
                            title=f'3D Visualization on {color}',
                            hover_data=hover_columns)
        fig.show()

In [None]:
# Cluster the 'description' text; 'star' is the rating column.
c = Clustering(df, 'description', 'star')
# Balanced sample: up to n_samples_per_star (default 1000) comments per star value.
c.preprocess(sample=True, balance_stars=True)

In [None]:
# Build the TF-IDF matrix (word 1- to 3-grams) from the cleaned comments.
c.vectorize()

In [None]:
# Plot the elbow curve (WCSS for k = 2..6).
c.kmeans_clustering(elbow=True)

In [88]:
# Fit KMeans with 5 clusters; labels are stored in c.data['cluster_kmeans'].
c.kmeans_clustering(n_clusters=5)

In [82]:
# Run DBSCAN; labels are stored in c.data['cluster_dbscan'].
c.dbscan_clustering()

In [None]:
# 3D scatter with the defaults: TruncatedSVD projection, colored by the KMeans labels.
c.plot_3d()

In [None]:
# PCA projection, colored by the DBSCAN labels.
c.plot_3d(color='cluster_dbscan', pca=True)

In [None]:
# TruncatedSVD projection, colored by the star rating.
c.plot_3d(color='star')

In [None]:
# Pairwise correlation between the projection components and the star rating;
# x/y/z hold the coordinates written by the most recent plot_3d() call.
c.data[['x', 'y', 'z', 'star']].corr()