In [2]:
!python --version

Python 3.10.14


In [8]:
import polars as pl
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score, \
                         fowlkes_mallows_score, homogeneity_completeness_v_measure, davies_bouldin_score, calinski_harabasz_score, \
                         mean_squared_error, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

import re
from string import punctuation

from openTSNE import TSNE

In [23]:
# Load the corpus CSV into a Polars DataFrame. The preview of this frame shows
# two string columns: `label` and `text`.
df = pl.read_csv("BAAD16_1500w.csv")

In [24]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

label,text
str,str
"""shunil_gongopaddhay""","""থেকে কত দূরে চলে এসেছে ভরত। সে…"
"""humayun_ahmed""","""এতে ভয় কমে যায়। বল একটা গল্প।’…"
"""shomresh""","""হবে। ওই দেখুন ওর এক চোখ কানা। …"
"""humayun_ahmed""","""হত! আবার চাদর মুড়ি দিয়ে নিজেকে…"
"""shordindu""","""হইয়া উঠিল। মনে যথেষ্ট কৌতূহল, …"


In [9]:
# Discrete palette of 16 hex RGB colours, listed in ascending hex order.
# presumably one colour per class label — TODO confirm against the plots.
color_palette = [
  "#0000FF",
  "#00C850",
  "#00FF00",
  "#00FFFF",
  "#4169E1",
  "#87CEFA",
  "#ADFF2F",
  "#B600C6",
  "#C62E2E",
  "#F863FF",
  "#FD0101",
  "#FF007D",
  "#FF4BCD",
  "#FF7F50",
  "#FFA500",
  "#FFFF00",
]
# Wrap the palette in a matplotlib colormap so it can be passed as `cmap=`.
custom_cmap = mcolors.ListedColormap(colors=color_palette)

In [26]:
# Non-whitespace characters that delimit tokens: ASCII punctuation plus the
# right single quotation mark and the danda (Bangla sentence terminator).
_BANGLA_DELIMITERS = '!"#$%&\'()*+,’।-./:;<=>?@[\\]^_`{|}~'
# Compiled once at definition time instead of on every call. The capturing
# group makes re.split keep the delimiter runs in its result.
_BANGLA_SPLIT_RE = re.compile(r'([\s{}]+)'.format(re.escape(_BANGLA_DELIMITERS)))
_ANY_WHITESPACE_RE = re.compile(r'\s+')


def tokenize_bangla(text):
  """Split Bangla text into word tokens and punctuation tokens.

  The text is cut at every run of whitespace and/or delimiter characters.
  Words are returned unchanged. Each delimiter run is returned as a single
  token with all whitespace removed, so consecutive punctuation marks stay
  together (e.g. "।’"). Runs that consist only of whitespace are dropped.

  Args:
    text: The string to tokenize.

  Returns:
    A list of non-empty token strings in their original order. An empty or
    whitespace-only input yields an empty list.
  """
  tokens = []
  for piece in _BANGLA_SPLIT_RE.split(text):
    # Strip every whitespace character the splitter can match (not only
    # space/newline/tab), so "\r" or no-break spaces never leak into a token.
    cleaned = _ANY_WHITESPACE_RE.sub('', piece)
    if cleaned:
      tokens.append(cleaned)
  return tokens

In [31]:
# Tokenize each document and re-join the tokens with single spaces, so every
# punctuation run becomes its own whitespace-delimited token in `final_text`.
# Both steps run inside a single map_elements call: this avoids materialising
# an intermediate list column and making a second pass through Python.
df = df.with_columns(
    pl.col("text")
    .map_elements(lambda text: ' '.join(tokenize_bangla(text)), return_dtype=str)
    .alias("final_text")
)

In [32]:
# Preview again to compare the raw `text` with the tokenized `final_text`.
df.head()

label,text,final_text
str,str,str
"""shunil_gongopaddhay""","""থেকে কত দূরে চলে এসেছে ভরত। সে…","""থেকে কত দূরে চলে এসেছে ভরত । স…"
"""humayun_ahmed""","""এতে ভয় কমে যায়। বল একটা গল্প।’…","""এতে ভয় কমে যায় । বল একটা গল্প …"
"""shomresh""","""হবে। ওই দেখুন ওর এক চোখ কানা। …","""হবে । ওই দেখুন ওর এক চোখ কানা …"
"""humayun_ahmed""","""হত! আবার চাদর মুড়ি দিয়ে নিজেকে…","""হত ! আবার চাদর মুড়ি দিয়ে নিজেক…"
"""shordindu""","""হইয়া উঠিল। মনে যথেষ্ট কৌতূহল, …","""হইয়া উঠিল । মনে যথেষ্ট কৌতূহল …"


In [34]:
# TF-IDF over unigrams and bigrams of the pre-tokenized text.
#
# `final_text` is already whitespace-delimited by tokenize_bangla, so tokens
# are taken as runs of non-whitespace. The default token_pattern
# (r"(?u)\b\w\w+\b") must not be used here: Python's `\w` does not match
# Bengali dependent vowel signs or the virama, so it would cut words apart at
# those marks and also discard the punctuation tokens.
vectorizer = TfidfVectorizer(
    max_features=1100,
    ngram_range=(1, 2),
    max_df=0.85,
    sublinear_tf=True,
    token_pattern=r"(?u)\S+",
)
X = vectorizer.fit_transform(df['final_text'])
# The sparse matrix exposes .shape directly; no need to densify it to print.
print(X.shape)

(8987, 1100)


In [35]:
# Reduce the TF-IDF features to 100 principal components as input for t-SNE.
# random_state is pinned (same seed as the t-SNE step) because
# svd_solver="auto" may select the randomized solver, which is stochastic
# when left unseeded.
pca = PCA(n_components=100, random_state=2024)
# The sparse TF-IDF matrix is densified before being handed to PCA.
X_pca_for_tsne = pca.fit_transform(X.toarray())
print(X_pca_for_tsne.shape)

(8987, 100)


In [None]:
# NOTE: `TSNE` here is openTSNE's class, not scikit-learn's. Both are imported
# under the same name and the openTSNE import comes last, so it wins. The code
# below depends on that: openTSNE's fit() returns the embedding itself, an
# array-like object that has .shape.
tsne = TSNE(
    n_components=3,
    perplexity=20,
    metric="euclidean",
    # Request Barnes-Hut explicitly. The default "auto" switches to the FFT
    # method on larger inputs, and openTSNE implements FFT only for 1-D/2-D
    # embeddings, so a 3-D run would fail once the dataset grows.
    negative_gradient_method="bh",
    n_jobs=-1,
    random_state=2024,
    verbose=True,
)
X_tsne_embedding = tsne.fit(X_pca_for_tsne)
print(X_tsne_embedding.shape)

In [None]:
# The fitted coordinates are already held by X_tsne_embedding (returned by
# fit()). transform() is only for projecting *new* samples and requires them
# as an argument, so it is not called here; show the first rows instead.
np.asarray(X_tsne_embedding)[:5]