# Step1 : Download asd set Dataset

### Download Dataset

In [75]:
!pip install datasets




### Import Library

In [76]:
from datasets import load_dataset

### Set Persian Dataset

In [77]:
persian_dataset = load_dataset("SeyedAli/Persian-Text-Sentiment")

In [78]:
persian_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 55852
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 13964
    })
})

In [79]:
min(persian_dataset['train']['label'])

0

In [80]:
max(persian_dataset['train']['label'])

1

### Print Dataset

In [81]:
for doc_index in range(0,10):
  print(f"[{doc_index}]\t label:{persian_dataset['train'][doc_index]['label']} \t text:{persian_dataset['train'][doc_index]['text'][0:100]}")

[0]	 label:1 	 text:این هندس فری تازه به دستم رسیده  و برای کسایی که انتظار شنیدن یک موسیقی با کیفیت دارن حتما پیشنهاد م
[1]	 label:1 	 text:عالی بود عالی، امیدوارم کیفیت همینجور بمونه
[2]	 label:1 	 text:من وقتی این ایرباد به لپ تاپ ایسوس  وصل میکنم و با نرم افزار حرفه ای   آهنگها را پخش میکنم   صدای گر
[3]	 label:1 	 text:با عرض سلام ضمن عرض خسته نباشید چلوکباب کوبیده سفارشی بسیار عالی بود ولی جوجه در چلو جوجه کباب مخصوص
[4]	 label:1 	 text:خوشمزه و داغ مخصوصا با سس چیلی که فرستاده بودن بسیار خوب بود ممنون ازجوجه داغ و اسنپ
[5]	 label:0 	 text:متاسفانه با وجود تاکید زیاد به خشک و برشته نبودن نان‌ها، نان سنگک به قدری خشک و بیسکویتی بود که قسمت
[6]	 label:0 	 text:طعم چیزکیک اوکی بود عالی نبود ولی اوکی بود اما بسته‌بندی و ارسالش بسیار نامناسب بود کیک کاملا از ریخ
[7]	 label:0 	 text:جوجه کباب خیلی پخته و خشک شده بود 
[8]	 label:0 	 text:سالاد فصل بسیار کیفیت بدی داشت وکاهوداخل ساندویچ از کیفیت خوبی برخوردارنبود
[9]	 label:0 	 text:من  تا مایع ظرفشویی سفارش دادم ولی یکی ارسال شد


# Step2 : Set Variables

### set Variables

In [82]:
MinDFList = [2, 3, 10, 50, 100]
ItemCountList = [100, 200, 500, 1000]
MinDF = 10 # for filter term frequency
ItemCount = 200 # number of item used from dataset
MaxIteration = 100 # number of iteration for clustering
NumberOfClusters = 2 # number of clusters
KMeansWay = 'k-means++' # kmeans way how its calculated
clusterName = "cluster" # csv File name

### Set Options for displaying

In [98]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Step3 : Vectorization Doc with TF-IDF

### Import Library

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

### Vectorize

In [84]:
def Vectorize(Min_DF):
    vectorizer = TfidfVectorizer(strip_accents='unicode', min_df=Min_DF)
    X = vectorizer.fit_transform(persian_dataset['train']['text'])

    vocab=vectorizer.get_feature_names_out()

    print(X.shape)

    doc_vetors=X.toarray()

    print([(i,vocab[i],j) for i,j in enumerate(doc_vetors[0]) if j>0])
    return X, vectorizer

X, vectorizer = Vectorize(MinDF)

(55852, 4591)
[(70, 'ادم', 0.18621131199485072), (103, 'از', 0.07056396577345739), (291, 'انتظار', 0.15758240074456317), (337, 'انگار', 0.15329214228657592), (348, 'اهنگ', 0.5095942646638163), (438, 'این', 0.2697752139295794), (475, 'با', 0.16396999851262734), (667, 'برای', 0.11100121633259262), (784, 'بعد', 0.13186176476702166), (845, 'به', 0.13809467213661994), (954, 'تا', 0.1134868216654906), (969, 'تازه', 0.11183745889514918), (1264, 'حالا', 0.17539631713031353), (1274, 'حتما', 0.16466460788199724), (1507, 'دارن', 0.19965633676158456), (1595, 'دستم', 0.13324218159867873), (1797, 'رسیده', 0.20612646352069003), (2553, 'فکر', 0.15832197111586907), (3084, 'موسیقی', 0.25479713233190815), (3266, 'میکنم', 0.14836067780281506), (3269, 'میکنه', 0.17015991334709685), (3679, 'هدست', 0.2735757000931026), (4075, 'پیشنهاد', 0.15626618676707035), (4303, 'کسایی', 0.2377964434781449), (4379, 'که', 0.07239395000134217), (4423, 'کیفیت', 0.07949044900097112), (4574, 'یک', 0.11042471165957501)]


# Step4 : KMeans Clustering

### Import KMeans Library

In [85]:
from sklearn.cluster import KMeans
from time import time

### Create kmeans cluster

In [86]:
def KMeansCreate(Number_Of_Clusters, KMeans_Way, Item_Count, Max_Iteration, x):
    km = KMeans(n_clusters=Number_Of_Clusters, init=KMeans_Way, max_iter=Max_Iteration)
    t0 = time()
    km.fit(x[:Item_Count])
    labels=persian_dataset['train']['label'][:Item_Count]
    print("done in %0.3fs" % (time() - t0))
    return km

KM = KMeansCreate(NumberOfClusters, KMeansWay, ItemCount, MaxIteration, X)

done in 0.138s




### Import calinski_harabasz Library

In [87]:
from sklearn.metrics import  calinski_harabasz_score

### Create calinski_harabasz_score

In [88]:
def CalinskiHarabasz(km, Item_Count):
    ch_index = calinski_harabasz_score(X[:Item_Count].toarray(), km.labels_)
    print(f"Calinski-Harabasz Index: {ch_index:.2f}")
    return ch_index

chIndex = CalinskiHarabasz(KM, ItemCount)

Calinski-Harabasz Index: 2.54


### Import Pandas Library

In [89]:
import pandas as pd

### Create Pandas frame

In [90]:
def CreateFrame(Item_Count, km):
    df = pd.DataFrame({"text":persian_dataset['train']['text'],"label":persian_dataset['train']['label']})

    df_cl=df[:Item_Count]

    df_cl['cluster'] = km.labels_
    return df_cl

df_cl = CreateFrame(ItemCount, KM)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


### Print frame

In [99]:
df_cl.head(100)

Unnamed: 0,text,label,cluster
0,این هندس فری تازه به دستم رسیده و برای کسایی که انتظار شنیدن یک موسیقی با کیفیت دارن حتما پیشنهاد میکنم بعد از شنیدن آهنگ با این هدست آدم به این فکر میکنه انگار تا حالا آهنگ نشنیده,1,0
1,عالی بود عالی، امیدوارم کیفیت همینجور بمونه,1,1
2,من وقتی این ایرباد به لپ تاپ ایسوس وصل میکنم و با نرم افزار حرفه ای آهنگها را پخش میکنم صدای گرم و شفافی بهم میده و نهایت قدرت این ایرباد نشون میده,1,0
3,با عرض سلام ضمن عرض خسته نباشید چلوکباب کوبیده سفارشی بسیار عالی بود ولی جوجه در چلو جوجه کباب مخصوص قدری خشک بود در مجموع از نحوه سرویس دهی و زمان بندی و کیفیت غذا رضایت کامل داشته و تشکر فراوان دارم,1,0
4,خوشمزه و داغ مخصوصا با سس چیلی که فرستاده بودن بسیار خوب بود ممنون ازجوجه داغ و اسنپ,1,1
5,متاسفانه با وجود تاکید زیاد به خشک و برشته نبودن نان‌ها، نان سنگک به قدری خشک و بیسکویتی بود که قسمت عمده آن در بسته بندی کاملا خرد شده بود اطراف نان کاملا سوخته بود واقعا ناراضی بودم و خرید مجدد نخواهم کرد,0,0
6,طعم چیزکیک اوکی بود عالی نبود ولی اوکی بود اما بسته‌بندی و ارسالش بسیار نامناسب بود کیک کاملا از ریخت افتاده و به بدنه‌های جعبه مالیده بود,0,0
7,جوجه کباب خیلی پخته و خشک شده بود,0,0
8,سالاد فصل بسیار کیفیت بدی داشت وکاهوداخل ساندویچ از کیفیت خوبی برخوردارنبود,0,0
9,من تا مایع ظرفشویی سفارش دادم ولی یکی ارسال شد,0,0


# Step5 : write into a CSV file

### write into CSV

In [92]:
def WriteIntoCSV(df_cl, cluster_Name):
    clusters = df_cl.groupby('cluster')
    for cluster in clusters.groups:
        f = open(cluster_Name+str(cluster)+ '.csv', 'w') # create csv file
        data = clusters.get_group(cluster)[['text']] # get title and overview columns
        f.write(data.to_csv(index_label='id')) # set index to id
        f.close()

WriteIntoCSV(df_cl, clusterName)

### Print cluster features

In [93]:
def PrintClusterFeatures(km):
    print("Cluster centroids: \n")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()

    for i in range(2):
        print("Cluster %d:" % i)
        for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
            print (' %s' % terms[j])
        print('------------')

PrintClusterFeatures(KM)

Cluster centroids: 

Cluster 0:
 به
 از
 با
 سفارش
 که
 بود
 من
 در
 رو
 این
------------
Cluster 1:
 بود
 خیلی
 خوب
 هم
 بسیار
 عالی
 کیفیت
 خوشمزه
 زمینی
 سیب
------------


# Step 6 : Loop for set deferent Variables

In [104]:
import os

if not os.path.exists("/ClusterFolder/"):
    os.makedirs("/ClusterFolder/")

dfList = []
idList = []

for MinDFL in MinDFList:
    for ItemCountL in ItemCountList:
        X, vectorizer = Vectorize(MinDFL)
        KM = KMeansCreate(NumberOfClusters, KMeansWay, ItemCountL, MaxIteration, X)
        chIndex = CalinskiHarabasz(KM, ItemCountL)
        df_cl = CreateFrame(ItemCountL, KM)
        id = []
        id.append(MinDFL)
        id.append(ItemCountL)
        idList.append(id)
        dfList.append(df_cl)
        clusterName = "/ClusterFolder/" + "Cluster" + str(MinDFL) + "_" + str(ItemCountL) + "_"
        WriteIntoCSV(df_cl, clusterName)

(55852, 12785)
[(192, 'ادم', 0.15487384198008022), (290, 'از', 0.058688767978759715), (892, 'انتظار', 0.13106288533335736), (996, 'انگار', 0.12749463374134673), (1020, 'اهنگ', 0.42383473256276455), (1248, 'این', 0.22437478907525976), (1320, 'با', 0.13637551535978049), (1803, 'برای', 0.09232084052104222), (2145, 'بعد', 0.10967077080852611), (2307, 'به', 0.11485474325732307), (2627, 'تا', 0.09438814375535234), (2665, 'تازه', 0.09301635196501928), (3503, 'حالا', 0.14587890076129956), (3525, 'حتما', 0.13695322903655566), (4144, 'دارن', 0.16605620581627392), (4386, 'دستم', 0.11081887752643922), (4937, 'رسیده', 0.17143747604387696), (6132, 'شنیدن', 0.48951506645885523), (6988, 'فری', 0.26197937438994034), (7112, 'فکر', 0.13167799353270213), (8613, 'موسیقی', 0.21191736628138227), (9094, 'میکنم', 0.12339308457654906), (9098, 'میکنه', 0.141523730479871), (10210, 'هدست', 0.2275356920689149), (11439, 'پیشنهاد', 0.12996817678220432), (12032, 'کسایی', 0.19777772046242661), (12198, 'که', 0.060210784

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 12785)
[(192, 'ادم', 0.15487384198008022), (290, 'از', 0.058688767978759715), (892, 'انتظار', 0.13106288533335736), (996, 'انگار', 0.12749463374134673), (1020, 'اهنگ', 0.42383473256276455), (1248, 'این', 0.22437478907525976), (1320, 'با', 0.13637551535978049), (1803, 'برای', 0.09232084052104222), (2145, 'بعد', 0.10967077080852611), (2307, 'به', 0.11485474325732307), (2627, 'تا', 0.09438814375535234), (2665, 'تازه', 0.09301635196501928), (3503, 'حالا', 0.14587890076129956), (3525, 'حتما', 0.13695322903655566), (4144, 'دارن', 0.16605620581627392), (4386, 'دستم', 0.11081887752643922), (4937, 'رسیده', 0.17143747604387696), (6132, 'شنیدن', 0.48951506645885523), (6988, 'فری', 0.26197937438994034), (7112, 'فکر', 0.13167799353270213), (8613, 'موسیقی', 0.21191736628138227), (9094, 'میکنم', 0.12339308457654906), (9098, 'میکنه', 0.141523730479871), (10210, 'هدست', 0.2275356920689149), (11439, 'پیشنهاد', 0.12996817678220432), (12032, 'کسایی', 0.19777772046242661), (12198, 'که', 0.060210784

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 12785)
[(192, 'ادم', 0.15487384198008022), (290, 'از', 0.058688767978759715), (892, 'انتظار', 0.13106288533335736), (996, 'انگار', 0.12749463374134673), (1020, 'اهنگ', 0.42383473256276455), (1248, 'این', 0.22437478907525976), (1320, 'با', 0.13637551535978049), (1803, 'برای', 0.09232084052104222), (2145, 'بعد', 0.10967077080852611), (2307, 'به', 0.11485474325732307), (2627, 'تا', 0.09438814375535234), (2665, 'تازه', 0.09301635196501928), (3503, 'حالا', 0.14587890076129956), (3525, 'حتما', 0.13695322903655566), (4144, 'دارن', 0.16605620581627392), (4386, 'دستم', 0.11081887752643922), (4937, 'رسیده', 0.17143747604387696), (6132, 'شنیدن', 0.48951506645885523), (6988, 'فری', 0.26197937438994034), (7112, 'فکر', 0.13167799353270213), (8613, 'موسیقی', 0.21191736628138227), (9094, 'میکنم', 0.12339308457654906), (9098, 'میکنه', 0.141523730479871), (10210, 'هدست', 0.2275356920689149), (11439, 'پیشنهاد', 0.12996817678220432), (12032, 'کسایی', 0.19777772046242661), (12198, 'که', 0.060210784



done in 0.118s
Calinski-Harabasz Index: 5.67


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 12785)
[(192, 'ادم', 0.15487384198008022), (290, 'از', 0.058688767978759715), (892, 'انتظار', 0.13106288533335736), (996, 'انگار', 0.12749463374134673), (1020, 'اهنگ', 0.42383473256276455), (1248, 'این', 0.22437478907525976), (1320, 'با', 0.13637551535978049), (1803, 'برای', 0.09232084052104222), (2145, 'بعد', 0.10967077080852611), (2307, 'به', 0.11485474325732307), (2627, 'تا', 0.09438814375535234), (2665, 'تازه', 0.09301635196501928), (3503, 'حالا', 0.14587890076129956), (3525, 'حتما', 0.13695322903655566), (4144, 'دارن', 0.16605620581627392), (4386, 'دستم', 0.11081887752643922), (4937, 'رسیده', 0.17143747604387696), (6132, 'شنیدن', 0.48951506645885523), (6988, 'فری', 0.26197937438994034), (7112, 'فکر', 0.13167799353270213), (8613, 'موسیقی', 0.21191736628138227), (9094, 'میکنم', 0.12339308457654906), (9098, 'میکنه', 0.141523730479871), (10210, 'هدست', 0.2275356920689149), (11439, 'پیشنهاد', 0.12996817678220432), (12032, 'کسایی', 0.19777772046242661), (12198, 'که', 0.060210784



done in 0.127s
Calinski-Harabasz Index: 9.51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 9540)
[(148, 'ادم', 0.15487384198008022), (223, 'از', 0.058688767978759715), (656, 'انتظار', 0.13106288533335736), (737, 'انگار', 0.12749463374134673), (753, 'اهنگ', 0.42383473256276455), (925, 'این', 0.22437478907525976), (984, 'با', 0.13637551535978049), (1346, 'برای', 0.09232084052104222), (1608, 'بعد', 0.10967077080852611), (1728, 'به', 0.11485474325732307), (1980, 'تا', 0.09438814375535234), (2009, 'تازه', 0.09301635196501928), (2623, 'حالا', 0.14587890076129956), (2641, 'حتما', 0.13695322903655566), (3103, 'دارن', 0.16605620581627392), (3282, 'دستم', 0.11081887752643922), (3698, 'رسیده', 0.17143747604387696), (4588, 'شنیدن', 0.48951506645885523), (5220, 'فری', 0.26197937438994034), (5306, 'فکر', 0.13167799353270213), (6430, 'موسیقی', 0.21191736628138227), (6810, 'میکنم', 0.12339308457654906), (6813, 'میکنه', 0.141523730479871), (7666, 'هدست', 0.2275356920689149), (8518, 'پیشنهاد', 0.12996817678220432), (8965, 'کسایی', 0.19777772046242661), (9096, 'که', 0.06021078447227599



done in 0.078s
Calinski-Harabasz Index: 1.60


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 9540)
[(148, 'ادم', 0.15487384198008022), (223, 'از', 0.058688767978759715), (656, 'انتظار', 0.13106288533335736), (737, 'انگار', 0.12749463374134673), (753, 'اهنگ', 0.42383473256276455), (925, 'این', 0.22437478907525976), (984, 'با', 0.13637551535978049), (1346, 'برای', 0.09232084052104222), (1608, 'بعد', 0.10967077080852611), (1728, 'به', 0.11485474325732307), (1980, 'تا', 0.09438814375535234), (2009, 'تازه', 0.09301635196501928), (2623, 'حالا', 0.14587890076129956), (2641, 'حتما', 0.13695322903655566), (3103, 'دارن', 0.16605620581627392), (3282, 'دستم', 0.11081887752643922), (3698, 'رسیده', 0.17143747604387696), (4588, 'شنیدن', 0.48951506645885523), (5220, 'فری', 0.26197937438994034), (5306, 'فکر', 0.13167799353270213), (6430, 'موسیقی', 0.21191736628138227), (6810, 'میکنم', 0.12339308457654906), (6813, 'میکنه', 0.141523730479871), (7666, 'هدست', 0.2275356920689149), (8518, 'پیشنهاد', 0.12996817678220432), (8965, 'کسایی', 0.19777772046242661), (9096, 'که', 0.06021078447227599

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 9540)
[(148, 'ادم', 0.15487384198008022), (223, 'از', 0.058688767978759715), (656, 'انتظار', 0.13106288533335736), (737, 'انگار', 0.12749463374134673), (753, 'اهنگ', 0.42383473256276455), (925, 'این', 0.22437478907525976), (984, 'با', 0.13637551535978049), (1346, 'برای', 0.09232084052104222), (1608, 'بعد', 0.10967077080852611), (1728, 'به', 0.11485474325732307), (1980, 'تا', 0.09438814375535234), (2009, 'تازه', 0.09301635196501928), (2623, 'حالا', 0.14587890076129956), (2641, 'حتما', 0.13695322903655566), (3103, 'دارن', 0.16605620581627392), (3282, 'دستم', 0.11081887752643922), (3698, 'رسیده', 0.17143747604387696), (4588, 'شنیدن', 0.48951506645885523), (5220, 'فری', 0.26197937438994034), (5306, 'فکر', 0.13167799353270213), (6430, 'موسیقی', 0.21191736628138227), (6810, 'میکنم', 0.12339308457654906), (6813, 'میکنه', 0.141523730479871), (7666, 'هدست', 0.2275356920689149), (8518, 'پیشنهاد', 0.12996817678220432), (8965, 'کسایی', 0.19777772046242661), (9096, 'که', 0.06021078447227599



done in 0.082s
Calinski-Harabasz Index: 5.70


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 9540)
[(148, 'ادم', 0.15487384198008022), (223, 'از', 0.058688767978759715), (656, 'انتظار', 0.13106288533335736), (737, 'انگار', 0.12749463374134673), (753, 'اهنگ', 0.42383473256276455), (925, 'این', 0.22437478907525976), (984, 'با', 0.13637551535978049), (1346, 'برای', 0.09232084052104222), (1608, 'بعد', 0.10967077080852611), (1728, 'به', 0.11485474325732307), (1980, 'تا', 0.09438814375535234), (2009, 'تازه', 0.09301635196501928), (2623, 'حالا', 0.14587890076129956), (2641, 'حتما', 0.13695322903655566), (3103, 'دارن', 0.16605620581627392), (3282, 'دستم', 0.11081887752643922), (3698, 'رسیده', 0.17143747604387696), (4588, 'شنیدن', 0.48951506645885523), (5220, 'فری', 0.26197937438994034), (5306, 'فکر', 0.13167799353270213), (6430, 'موسیقی', 0.21191736628138227), (6810, 'میکنم', 0.12339308457654906), (6813, 'میکنه', 0.141523730479871), (7666, 'هدست', 0.2275356920689149), (8518, 'پیشنهاد', 0.12996817678220432), (8965, 'کسایی', 0.19777772046242661), (9096, 'که', 0.06021078447227599



done in 0.106s
Calinski-Harabasz Index: 9.61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 4591)
[(70, 'ادم', 0.18621131199485072), (103, 'از', 0.07056396577345739), (291, 'انتظار', 0.15758240074456317), (337, 'انگار', 0.15329214228657592), (348, 'اهنگ', 0.5095942646638163), (438, 'این', 0.2697752139295794), (475, 'با', 0.16396999851262734), (667, 'برای', 0.11100121633259262), (784, 'بعد', 0.13186176476702166), (845, 'به', 0.13809467213661994), (954, 'تا', 0.1134868216654906), (969, 'تازه', 0.11183745889514918), (1264, 'حالا', 0.17539631713031353), (1274, 'حتما', 0.16466460788199724), (1507, 'دارن', 0.19965633676158456), (1595, 'دستم', 0.13324218159867873), (1797, 'رسیده', 0.20612646352069003), (2553, 'فکر', 0.15832197111586907), (3084, 'موسیقی', 0.25479713233190815), (3266, 'میکنم', 0.14836067780281506), (3269, 'میکنه', 0.17015991334709685), (3679, 'هدست', 0.2735757000931026), (4075, 'پیشنهاد', 0.15626618676707035), (4303, 'کسایی', 0.2377964434781449), (4379, 'که', 0.07239395000134217), (4423, 'کیفیت', 0.07949044900097112), (4574, 'یک', 0.11042471165957501)]
done in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 4591)
[(70, 'ادم', 0.18621131199485072), (103, 'از', 0.07056396577345739), (291, 'انتظار', 0.15758240074456317), (337, 'انگار', 0.15329214228657592), (348, 'اهنگ', 0.5095942646638163), (438, 'این', 0.2697752139295794), (475, 'با', 0.16396999851262734), (667, 'برای', 0.11100121633259262), (784, 'بعد', 0.13186176476702166), (845, 'به', 0.13809467213661994), (954, 'تا', 0.1134868216654906), (969, 'تازه', 0.11183745889514918), (1264, 'حالا', 0.17539631713031353), (1274, 'حتما', 0.16466460788199724), (1507, 'دارن', 0.19965633676158456), (1595, 'دستم', 0.13324218159867873), (1797, 'رسیده', 0.20612646352069003), (2553, 'فکر', 0.15832197111586907), (3084, 'موسیقی', 0.25479713233190815), (3266, 'میکنم', 0.14836067780281506), (3269, 'میکنه', 0.17015991334709685), (3679, 'هدست', 0.2735757000931026), (4075, 'پیشنهاد', 0.15626618676707035), (4303, 'کسایی', 0.2377964434781449), (4379, 'که', 0.07239395000134217), (4423, 'کیفیت', 0.07949044900097112), (4574, 'یک', 0.11042471165957501)]




done in 0.177s
Calinski-Harabasz Index: 2.40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 4591)
[(70, 'ادم', 0.18621131199485072), (103, 'از', 0.07056396577345739), (291, 'انتظار', 0.15758240074456317), (337, 'انگار', 0.15329214228657592), (348, 'اهنگ', 0.5095942646638163), (438, 'این', 0.2697752139295794), (475, 'با', 0.16396999851262734), (667, 'برای', 0.11100121633259262), (784, 'بعد', 0.13186176476702166), (845, 'به', 0.13809467213661994), (954, 'تا', 0.1134868216654906), (969, 'تازه', 0.11183745889514918), (1264, 'حالا', 0.17539631713031353), (1274, 'حتما', 0.16466460788199724), (1507, 'دارن', 0.19965633676158456), (1595, 'دستم', 0.13324218159867873), (1797, 'رسیده', 0.20612646352069003), (2553, 'فکر', 0.15832197111586907), (3084, 'موسیقی', 0.25479713233190815), (3266, 'میکنم', 0.14836067780281506), (3269, 'میکنه', 0.17015991334709685), (3679, 'هدست', 0.2735757000931026), (4075, 'پیشنهاد', 0.15626618676707035), (4303, 'کسایی', 0.2377964434781449), (4379, 'که', 0.07239395000134217), (4423, 'کیفیت', 0.07949044900097112), (4574, 'یک', 0.11042471165957501)]




done in 0.081s
Calinski-Harabasz Index: 5.79


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 4591)
[(70, 'ادم', 0.18621131199485072), (103, 'از', 0.07056396577345739), (291, 'انتظار', 0.15758240074456317), (337, 'انگار', 0.15329214228657592), (348, 'اهنگ', 0.5095942646638163), (438, 'این', 0.2697752139295794), (475, 'با', 0.16396999851262734), (667, 'برای', 0.11100121633259262), (784, 'بعد', 0.13186176476702166), (845, 'به', 0.13809467213661994), (954, 'تا', 0.1134868216654906), (969, 'تازه', 0.11183745889514918), (1264, 'حالا', 0.17539631713031353), (1274, 'حتما', 0.16466460788199724), (1507, 'دارن', 0.19965633676158456), (1595, 'دستم', 0.13324218159867873), (1797, 'رسیده', 0.20612646352069003), (2553, 'فکر', 0.15832197111586907), (3084, 'موسیقی', 0.25479713233190815), (3266, 'میکنم', 0.14836067780281506), (3269, 'میکنه', 0.17015991334709685), (3679, 'هدست', 0.2735757000931026), (4075, 'پیشنهاد', 0.15626618676707035), (4303, 'کسایی', 0.2377964434781449), (4379, 'که', 0.07239395000134217), (4423, 'کیفیت', 0.07949044900097112), (4574, 'یک', 0.11042471165957501)]
done in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1718)
[(24, 'ادم', 0.24028797492756176), (35, 'از', 0.0910560817005077), (104, 'انتظار', 0.20334508979874255), (118, 'انگار', 0.19780891959650423), (155, 'این', 0.34811923693755487), (172, 'با', 0.21158767676028226), (255, 'برای', 0.14323650481444672), (292, 'بعد', 0.1701550571058665), (313, 'به', 0.17819803083129337), (362, 'تا', 0.14644394192185362), (369, 'تازه', 0.14431559624961424), (480, 'حالا', 0.2263322534033856), (485, 'حتما', 0.21248400403999237), (587, 'دارن', 0.25763749971979744), (622, 'دستم', 0.17193635364194362), (705, 'رسیده', 0.26598658248933976), (995, 'فکر', 0.20429943497215733), (1236, 'میکنم', 0.1914453340466486), (1239, 'میکنه', 0.21957517271106466), (1518, 'پیشنهاد', 0.20164664093527157), (1596, 'کسایی', 0.30685367734222246), (1634, 'که', 0.0934175021725356), (1654, 'کیفیت', 0.10257485870167847), (1712, 'یک', 0.14249258040442211)]
done in 0.051s
Calinski-Harabasz Index: 1.72


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1718)
[(24, 'ادم', 0.24028797492756176), (35, 'از', 0.0910560817005077), (104, 'انتظار', 0.20334508979874255), (118, 'انگار', 0.19780891959650423), (155, 'این', 0.34811923693755487), (172, 'با', 0.21158767676028226), (255, 'برای', 0.14323650481444672), (292, 'بعد', 0.1701550571058665), (313, 'به', 0.17819803083129337), (362, 'تا', 0.14644394192185362), (369, 'تازه', 0.14431559624961424), (480, 'حالا', 0.2263322534033856), (485, 'حتما', 0.21248400403999237), (587, 'دارن', 0.25763749971979744), (622, 'دستم', 0.17193635364194362), (705, 'رسیده', 0.26598658248933976), (995, 'فکر', 0.20429943497215733), (1236, 'میکنم', 0.1914453340466486), (1239, 'میکنه', 0.21957517271106466), (1518, 'پیشنهاد', 0.20164664093527157), (1596, 'کسایی', 0.30685367734222246), (1634, 'که', 0.0934175021725356), (1654, 'کیفیت', 0.10257485870167847), (1712, 'یک', 0.14249258040442211)]
done in 0.071s
Calinski-Harabasz Index: 2.93


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1718)
[(24, 'ادم', 0.24028797492756176), (35, 'از', 0.0910560817005077), (104, 'انتظار', 0.20334508979874255), (118, 'انگار', 0.19780891959650423), (155, 'این', 0.34811923693755487), (172, 'با', 0.21158767676028226), (255, 'برای', 0.14323650481444672), (292, 'بعد', 0.1701550571058665), (313, 'به', 0.17819803083129337), (362, 'تا', 0.14644394192185362), (369, 'تازه', 0.14431559624961424), (480, 'حالا', 0.2263322534033856), (485, 'حتما', 0.21248400403999237), (587, 'دارن', 0.25763749971979744), (622, 'دستم', 0.17193635364194362), (705, 'رسیده', 0.26598658248933976), (995, 'فکر', 0.20429943497215733), (1236, 'میکنم', 0.1914453340466486), (1239, 'میکنه', 0.21957517271106466), (1518, 'پیشنهاد', 0.20164664093527157), (1596, 'کسایی', 0.30685367734222246), (1634, 'که', 0.0934175021725356), (1654, 'کیفیت', 0.10257485870167847), (1712, 'یک', 0.14249258040442211)]
done in 0.073s
Calinski-Harabasz Index: 6.51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1718)
[(24, 'ادم', 0.24028797492756176), (35, 'از', 0.0910560817005077), (104, 'انتظار', 0.20334508979874255), (118, 'انگار', 0.19780891959650423), (155, 'این', 0.34811923693755487), (172, 'با', 0.21158767676028226), (255, 'برای', 0.14323650481444672), (292, 'بعد', 0.1701550571058665), (313, 'به', 0.17819803083129337), (362, 'تا', 0.14644394192185362), (369, 'تازه', 0.14431559624961424), (480, 'حالا', 0.2263322534033856), (485, 'حتما', 0.21248400403999237), (587, 'دارن', 0.25763749971979744), (622, 'دستم', 0.17193635364194362), (705, 'رسیده', 0.26598658248933976), (995, 'فکر', 0.20429943497215733), (1236, 'میکنم', 0.1914453340466486), (1239, 'میکنه', 0.21957517271106466), (1518, 'پیشنهاد', 0.20164664093527157), (1596, 'کسایی', 0.30685367734222246), (1634, 'که', 0.0934175021725356), (1654, 'کیفیت', 0.10257485870167847), (1712, 'یک', 0.14249258040442211)]
done in 0.078s
Calinski-Harabasz Index: 11.29


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1104)
[(13, 'ادم', 0.25246785686226814), (21, 'از', 0.09567159491910927), (68, 'انتظار', 0.21365238539477724), (74, 'انگار', 0.20783559399435358), (94, 'این', 0.3657649439538037), (103, 'با', 0.22231277826632292), (153, 'برای', 0.1504969751642672), (178, 'بعد', 0.17877999352545826), (190, 'به', 0.18723065502805822), (225, 'تا', 0.15386699304706655), (228, 'تازه', 0.15163076432736325), (302, 'حالا', 0.2378047381388383), (305, 'حتما', 0.2232545391989032), (383, 'دارن', 0.2706968062851221), (408, 'دستم', 0.18065158164398623), (465, 'رسیده', 0.27946909309734186), (661, 'فکر', 0.21465510507191263), (811, 'میکنم', 0.20114944664879422), (813, 'میکنه', 0.23070514989872778), (986, 'پیشنهاد', 0.2118678443885952), (1059, 'که', 0.09815271269415937), (1067, 'کیفیت', 0.10777424360153567), (1099, 'یک', 0.14971534220272154)]
done in 0.050s
Calinski-Harabasz Index: 2.04


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1104)
[(13, 'ادم', 0.25246785686226814), (21, 'از', 0.09567159491910927), (68, 'انتظار', 0.21365238539477724), (74, 'انگار', 0.20783559399435358), (94, 'این', 0.3657649439538037), (103, 'با', 0.22231277826632292), (153, 'برای', 0.1504969751642672), (178, 'بعد', 0.17877999352545826), (190, 'به', 0.18723065502805822), (225, 'تا', 0.15386699304706655), (228, 'تازه', 0.15163076432736325), (302, 'حالا', 0.2378047381388383), (305, 'حتما', 0.2232545391989032), (383, 'دارن', 0.2706968062851221), (408, 'دستم', 0.18065158164398623), (465, 'رسیده', 0.27946909309734186), (661, 'فکر', 0.21465510507191263), (811, 'میکنم', 0.20114944664879422), (813, 'میکنه', 0.23070514989872778), (986, 'پیشنهاد', 0.2118678443885952), (1059, 'که', 0.09815271269415937), (1067, 'کیفیت', 0.10777424360153567), (1099, 'یک', 0.14971534220272154)]
done in 0.057s
Calinski-Harabasz Index: 3.21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1104)
[(13, 'ادم', 0.25246785686226814), (21, 'از', 0.09567159491910927), (68, 'انتظار', 0.21365238539477724), (74, 'انگار', 0.20783559399435358), (94, 'این', 0.3657649439538037), (103, 'با', 0.22231277826632292), (153, 'برای', 0.1504969751642672), (178, 'بعد', 0.17877999352545826), (190, 'به', 0.18723065502805822), (225, 'تا', 0.15386699304706655), (228, 'تازه', 0.15163076432736325), (302, 'حالا', 0.2378047381388383), (305, 'حتما', 0.2232545391989032), (383, 'دارن', 0.2706968062851221), (408, 'دستم', 0.18065158164398623), (465, 'رسیده', 0.27946909309734186), (661, 'فکر', 0.21465510507191263), (811, 'میکنم', 0.20114944664879422), (813, 'میکنه', 0.23070514989872778), (986, 'پیشنهاد', 0.2118678443885952), (1059, 'که', 0.09815271269415937), (1067, 'کیفیت', 0.10777424360153567), (1099, 'یک', 0.14971534220272154)]
done in 0.068s
Calinski-Harabasz Index: 7.18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


(55852, 1104)
[(13, 'ادم', 0.25246785686226814), (21, 'از', 0.09567159491910927), (68, 'انتظار', 0.21365238539477724), (74, 'انگار', 0.20783559399435358), (94, 'این', 0.3657649439538037), (103, 'با', 0.22231277826632292), (153, 'برای', 0.1504969751642672), (178, 'بعد', 0.17877999352545826), (190, 'به', 0.18723065502805822), (225, 'تا', 0.15386699304706655), (228, 'تازه', 0.15163076432736325), (302, 'حالا', 0.2378047381388383), (305, 'حتما', 0.2232545391989032), (383, 'دارن', 0.2706968062851221), (408, 'دستم', 0.18065158164398623), (465, 'رسیده', 0.27946909309734186), (661, 'فکر', 0.21465510507191263), (811, 'میکنم', 0.20114944664879422), (813, 'میکنه', 0.23070514989872778), (986, 'پیشنهاد', 0.2118678443885952), (1059, 'که', 0.09815271269415937), (1067, 'کیفیت', 0.10777424360153567), (1099, 'یک', 0.14971534220272154)]
done in 0.070s
Calinski-Harabasz Index: 12.19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cl['cluster'] = km.labels_


# Step7 : calulate accuracy

In [112]:
accuracy_df = pd.DataFrame(columns=['MinDF', 'ItemCount', 'accuracy'])

for df, id_values in zip(dfList, idList):
    # Extract MinDF and ItemCount from id_values
    MinDF, ItemCount = id_values
    # Calculate the accuracy as the percentage of rows where 'label' and 'cluster' are equal
    accuracy = (df['label'] == df['cluster']).mean() * 100
    # Create a DataFrame for the new row and concatenate it with accuracy_df
    new_row_df = pd.DataFrame({'MinDF': [MinDF], 'ItemCount': [ItemCount], 'accuracy': [accuracy]})
    accuracy_df = pd.concat([accuracy_df, new_row_df], ignore_index=True)


Unnamed: 0,MinDF,ItemCount,accuracy
0,2,100,50.0
1,2,200,57.5
2,2,500,70.2
3,2,1000,31.0
4,3,100,43.0
5,3,200,68.0
6,3,500,27.0
7,3,1000,69.9
8,10,100,51.0
9,10,200,42.5


# Step8 : Print Accuracy dataframe

In [113]:
accuracy_df

Unnamed: 0,MinDF,ItemCount,accuracy
0,2,100,50.0
1,2,200,57.5
2,2,500,70.2
3,2,1000,31.0
4,3,100,43.0
5,3,200,68.0
6,3,500,27.0
7,3,1000,69.9
8,10,100,51.0
9,10,200,42.5


# Step9 : Zip Folder

In [97]:
import os
import zipfile

def zip_folder(folder_path, zip_file_path):
    # Create a ZIP file
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through the folder
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                # Create a full file path
                full_path = os.path.join(root, file)
                # Add the file to the ZIP file
                zipf.write(full_path, os.path.relpath(full_path, folder_path))

# Example usage
folder_path = '/ClusterFolder/'
zip_file_path = '/ClustersZipfile.zip'
zip_folder(folder_path, zip_file_path)


In [None]:
dataset = load_dataset("saied/persian_news_dataset", split="train",streaming=True)

In [None]:
dataset

In [None]:
docs=[]
document_count=1000
counter=0
skip_counter=0
for doc in dataset:
    # print(f"{doc['category']}\t\t{doc['title']}\t\t\t\t{doc['text']}")
    if len(doc['title'])==0 or len(doc['category'])==0:
      skip_counter += 1
      if skip_counter%10000 ==0:
        print(f'{skip_counter} skipped')
      continue
    docs.append(doc)
    counter += 1
    if counter == document_count:
      break
print(len(docs))

In [None]:
docs[0]

In [None]:
len(set([doc['category'] for doc in docs]))

In [None]:
!pip install stanza

In [None]:
import stanza
stanza.download('fa')       # This downloads the English models for the neural pipeline

In [None]:
nlp = stanza.Pipeline(lang='fa', processors='tokenize,pos')

In [None]:
doc = nlp("محسن محمدي متولد شهر تهران است. او در ۲۰ سالگی در مشهد به دانشگاه رفت.")

In [None]:
doc

In [None]:
print(*[f'word: {word.text+" "}\tpos: {word.pos}' for sent in doc.sentences for word in sent.words], sep='\n')