In [46]:
import json
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import FastText

In [68]:
# Path of the JSON corpus, relative to the notebook's working directory.
dataset_file_path = 'dataset.json'

# Read the file as UTF-8 text and parse it in one go. The result is kept in
# `data`; the cells below index it as a sequence of records.
with open(dataset_file_path, mode='r', encoding='utf-8') as dataset_file:
    data = json.loads(dataset_file.read())

# Quick sanity check: show the first record.
print(data[0])

{'illness_type': 'داء السكري', 'disease_name': 'التغذية الصحية لمرضى السكري المصابين بأمراض الكلى', 'words': ['الخط', 'غذاء', 'مرضى', 'كلى', 'مكمل', 'عشب', 'امن', 'حال', 'مرضى', 'كلى', 'فيتامين', 'يمك', 'تضر', 'مرضى', 'كلى', 'ايض', 'لذل', 'يجب', 'استشار', 'طبيب', 'تناول', 'نوع', 'تعتمد', 'الخط', 'غذاء', 'مريض', 'كلى', 'حال', 'مريض', 'مرحل', 'متقدم', 'متاخر', 'مرض', 'وعل', 'يوصى', 'الحد', 'تجنب', 'اطعم', 'غذاء', 'يوصى', 'مريض', 'كلى', 'الات', 'الحد', 'صوديوم', 'مض', 'الو', 'يفقد', 'مريض', 'كلى', 'تدريج', 'قدر', 'تحقيق', 'تواز', 'مياه', 'صوديوم', 'جسم', 'لذل', 'يساعد', 'الحد', 'صوديوم', 'خفض', 'ضغط', 'الدم', 'تقليل', 'احتباس', 'سوايل', 'جسم', 'امر', 'شايع', 'مرضى', 'كلى', 'تركيز', 'طعام', 'طازج', 'والمط', 'منزل', 'تناول', 'كم', 'صغير', 'طعام', 'مطاعم', 'اطعم', 'معلب', 'لان', 'غالب', 'تحتو', 'كثير', 'صوديوم', 'اختيار', 'منتج', 'غذاء', 'تحتو', 'نسب', 'صوديوم', '5', '%', 'ملصق', 'غذاء', 'استبدال', 'ملح', 'تعزيز', 'نكه', 'اعشاب', 'توابل', 'خردل', 'والخل', 'وف', 'غضو', 'اسبوع', 'اسبوع', 'سيعت

In [69]:
# Tabular view of the parsed records: one row per record, one column per key.
df = pd.DataFrame(data=data)

# Bare last expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,illness_type,disease_name,words
0,داء السكري,التغذية الصحية لمرضى السكري المصابين بأمراض الكلى,"[الخط, غذاء, مرضى, كلى, مكمل, عشب, امن, حال, م..."
1,داء السكري,الحموضة الكيتونية,"[حموض, كيتون, حال, طب, طارء, يجب, معالج, فور, ..."
2,أمراض القلب والدورة الدموية,انخفاض ضغط الدم الموضعي,"[انخفاض, ضغط, الدم, وضع, حال, ينخفض, ضغط, الدم..."
3,أمراض العظام,هشاشة العظام,"[مقدم, عظام, اعضاء, صلب, تكو, هيكل, عظم, جهاز,..."
4,أمراض الجهاز الهضمي,القرحة الهضمية (قرحة المعدة),"[قرح, هضم, جرح, عميق, طان, قنا, هضم, سبب, تاكل..."
...,...,...,...
417,الأمراض الجلدية,الهربس التناسلي,"[هربس, تناسل, مرض, ينتقل, طريق, اتصال, جنس, يس..."
418,الأمراض الجلدية,الذئبة الحمراء,"[ذيب, حمراء, مرض, مناع, ذات, مزم, غير, معد, يم..."
419,الأمراض الجلدية,السيلان,"[سيل, مرض, ينتقل, طريق, اتصال, جنس, ويم, يسبب,..."
420,الأمراض الجلدية,ليزر إزالة الشعر,"[ازال, شعر, ليزر, اجراء, تجميل, يستخدم, ليزر, ..."


In [80]:
# One-hot (multi-hot) encoding: one binary column per distinct token, one row
# per record.
#
# Build the frame from `data` here instead of reading the shared `df`: other
# cells rebind `df` with `words` joined into a single string, and
# MultiLabelBinarizer iterates whatever it is given, so a string would be
# split into individual characters instead of tokens.
docs_df = pd.DataFrame(data)

mlb = MultiLabelBinarizer()

# Fit the vocabulary and encode every record's token list in one pass.
one_hot_encoded_words = mlb.fit_transform(docs_df['words'])

# Wrap the indicator matrix in a DataFrame labelled by token, sharing the
# record index so the concat below aligns row-for-row.
one_hot_vectors = pd.DataFrame(
    one_hot_encoded_words,
    columns=mlb.classes_,
    index=docs_df.index,
)
assert len(one_hot_vectors) == len(docs_df), "one encoded row expected per record"

# Metadata columns followed by the encoded columns.
# (Previously referenced an undefined name, `words_df`.)
final_df = pd.concat(
    [docs_df[['illness_type', 'disease_name']], one_hot_vectors],
    axis=1,
)

# Persist the encoded dataset.
final_df.to_csv('encoded_dataset.csv', index=False)

one_hot_vectors.head()

Unnamed: 0,Unnamed: 1,%,&,+,0,1,2,3,4,5,...,٨,٩,٪,‌,‏,–,•,⁄,,
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
# Bag-of-words encoding: per-record token counts.
#
# CountVectorizer expects raw text, so each record's token list is joined into
# one space-separated string. The frame is rebuilt from `data` and the joined
# column is produced with `assign`, which keeps this cell idempotent (it can
# be re-run without joining an already-joined column).
# NOTE(review): with its default settings CountVectorizer re-tokenizes the
# joined string and keeps only tokens of two or more word characters, so
# one-character tokens and bare punctuation from `words` are dropped — confirm
# that this is intended.
df = pd.DataFrame(data).assign(
    words=lambda frame: frame['words'].map(' '.join)
)

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Learn the vocabulary and build the (sparse) record-by-term count matrix.
bag_of_words = vectorizer.fit_transform(df['words'])

# Densify into a DataFrame labelled by term.
bow_vectors = pd.DataFrame(
    bag_of_words.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Metadata columns followed by the count columns.
# (Previously referenced an undefined name, `bag_of_words_df`.)
final_df = pd.concat([df[['illness_type', 'disease_name']], bow_vectors], axis=1)

# Save the final DataFrame to a CSV file
final_df.to_csv('bag_of_words_dataset.csv', index=False)

bow_vectors.head()

Unnamed: 0,000,01,02,03,04,05,10,100,100مجم,109,...,٢٥٠,٢٦٠,٣٠,٣٠٠,٣٨,٤٠,٤٥,٥٥,٦٥,٧٠
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# TF-IDF encoding: per-record term weights.
#
# TfidfVectorizer expects raw text, so each record's token list is joined into
# one space-separated string. The frame is rebuilt from `data` and the joined
# column is produced with `assign`, which keeps this cell idempotent (it can
# be re-run without joining an already-joined column).
df = pd.DataFrame(data).assign(
    words=lambda frame: frame['words'].map(' '.join)
)

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and build the (sparse) TF-IDF matrix.
tfidf_representation = vectorizer.fit_transform(df['words'])

# Densify into a DataFrame labelled by term.
tfidf_vectors = pd.DataFrame(
    tfidf_representation.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Metadata columns followed by the TF-IDF columns.
# (Previously referenced an undefined name, `tfidf_df`.)
final_df = pd.concat([df[['illness_type', 'disease_name']], tfidf_vectors], axis=1)

# Save the final DataFrame to a CSV file
final_df.to_csv('tfidf_dataset.csv', index=False)

tfidf_vectors.head()

Unnamed: 0,000,01,02,03,04,05,10,100,100مجم,109,...,٢٥٠,٢٦٠,٣٠,٣٠٠,٣٨,٤٠,٤٥,٥٥,٦٥,٧٠
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
# Word2Vec embeddings trained on this corpus.
# (`Word2Vec` comes from the notebook's import cell.)

# Training corpus: each record's token list is treated as one sentence.
words = [record['words'] for record in data]

# sg=0 selects the CBOW training algorithm; min_count=1 keeps every token,
# including those that occur only once.
model = Word2Vec(sentences=words, sg=0, window=5, min_count=1, workers=4)

# Persist the model; it can be restored later with
# Word2Vec.load("word2vec_cbow.model").
model.save("word2vec_cbow.model")

# Keyed word -> vector lookup table of the trained model.
cbow_vectors = model.wv

# Example lookup for a single token.
print("Vector for 'عمى':", cbow_vectors['عمى'])

Vector for 'عمى': [-0.11351692  0.18056272  0.17576724  0.11055798  0.11931759 -0.41590396
  0.01627954  0.54802704 -0.18968351  0.0064     -0.13791266 -0.18036583
 -0.3690142   0.01791674  0.10580117 -0.11474491 -0.10493687 -0.18629186
 -0.11408094 -0.5806818   0.30420142  0.1691483   0.04888216 -0.32650468
 -0.02279266  0.14107747 -0.11967427 -0.34027302 -0.47381213  0.20472744
  0.27548     0.03454091  0.07755369 -0.2626637  -0.1427137   0.2549083
  0.11126612 -0.27652574 -0.30531463 -0.53070116 -0.01983293 -0.06594672
 -0.1638924  -0.02605522  0.06459172 -0.24889544 -0.19409002 -0.15754725
  0.26227102  0.15841764 -0.03224924 -0.20549597 -0.13694467  0.25477862
 -0.17188376  0.28598526  0.21840097  0.02268785 -0.1455353  -0.017521
  0.01611256 -0.04377453 -0.0301366  -0.07161519 -0.63877136  0.28231338
  0.12611333  0.16345176 -0.48251417  0.2376007  -0.15142553  0.05540434
  0.35310805  0.02163318  0.30481178  0.21624626  0.13699633  0.03049604
 -0.2198984   0.1436126  -0.2519174 

In [76]:
# FastText embeddings trained on this corpus.
# (`FastText` comes from the notebook's import cell.)

# Training corpus: each record's token list is treated as one sentence.
words = [record['words'] for record in data]

# 100-dimensional vectors, context window of 5, and min_count=1 so that every
# token is kept, including those that occur only once.
fasttext_model = FastText(
    sentences=words,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the model; it can be restored later with
# FastText.load("fasttext_model.model").
fasttext_model.save("fasttext_model.model")

# Keyed word -> vector lookup table of the trained model.
fastText_vectors = fasttext_model.wv

# Example lookup for a single token.
print("Vector for 'عمى':", fastText_vectors['عمى'])

Vector for 'عمى': [-0.1270061   0.3958135   0.45689824  0.2114548   0.26942214 -0.30927464
  0.22709677  0.43576163 -0.412777    0.1154881  -0.02792269 -0.11379868
 -0.09885193  0.10772035  0.41232064  0.40348315  0.10645175 -0.06317022
 -0.11608602 -0.7766631   0.06737906 -0.22182044  0.28072268  0.10441529
 -0.22174014 -0.02767122  0.16616747 -0.8121708   0.18250537  0.11248346
 -0.14371483 -0.07707077  0.19830368 -0.02758579 -0.06088215 -0.07991599
  0.03540096 -0.37606028  0.01773177  0.08784646 -0.32745615 -0.0112821
 -0.14539614  0.41034022  0.25806397  0.04747258  0.26555657  0.20422737
  0.08199985 -0.06348313 -0.22344534 -0.0341691   0.00723101 -0.36018533
  0.5142819  -0.15977032  0.15727365  0.02976179  0.41264495 -0.05047553
 -0.02826021  0.355759    0.11376052 -0.3051381   0.14660124  0.03575468
  0.06149881  0.36761886  0.6092188  -0.01455515 -0.19639634 -0.40057582
  0.26515287  0.02641103  0.13888328  0.23677993  0.3749513   0.04455417
 -0.29433706  0.2491109  -0.020760

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# t-SNE comparison of the five representations built above.
#
# The representations cannot be stacked into one matrix: each has its own
# number of columns, and the gensim objects are KeyedVectors rather than
# arrays. Each one is therefore projected to 2-D on its own and drawn in its
# own panel. t-SNE coordinates from separate runs are not comparable with each
# other, so the panels deliberately do not share axes.
vector_sets = {
    'One-Hot Encoding': one_hot_vectors.to_numpy(dtype=float),
    'Bag of Words': bow_vectors.to_numpy(dtype=float),
    'TF-IDF': tfidf_vectors.to_numpy(dtype=float),
    # `.vectors` is the underlying 2-D array of a gensim KeyedVectors object.
    'Word2Vec': np.asarray(cbow_vectors.vectors, dtype=float),
    'FastText': np.asarray(fastText_vectors.vectors, dtype=float),
}

TSNE_SEED = 42
TSNE_MAX_POINTS = 2000  # per-technique row cap so the cell stays quick to run


def _tsne_2d(matrix, max_points=TSNE_MAX_POINTS, seed=TSNE_SEED):
    """Project the rows of `matrix` to 2-D with t-SNE.

    Parameters
    ----------
    matrix : 2-D numpy array, one sample per row.
    max_points : if `matrix` has more rows than this, a seeded random subset
        of `max_points` rows is projected instead of the full matrix.
    seed : seed for both the row subsampling and t-SNE itself.

    Returns
    -------
    numpy array of shape (n_projected_rows, 2).
    """
    n_rows = matrix.shape[0]
    if n_rows > max_points:
        keep = np.random.default_rng(seed).choice(n_rows, size=max_points, replace=False)
        matrix = matrix[np.sort(keep)]
        n_rows = max_points
    # t-SNE requires perplexity < n_samples; 30 is scikit-learn's default.
    perplexity = min(30, max(1, n_rows - 1))
    reducer = TSNE(n_components=2, random_state=seed, perplexity=perplexity)
    return reducer.fit_transform(matrix)


# One 2-D embedding per technique.
tsne_by_label = {name: _tsne_2d(matrix) for name, matrix in vector_sets.items()}

# Flat per-point views: `labels[i]` names the technique of `tsne_vectors[i]`.
labels = [name for name, points in tsne_by_label.items() for _ in range(len(points))]
tsne_vectors = np.concatenate(list(tsne_by_label.values()))

# One panel per technique; a single scatter call per panel (instead of one
# call per point) keeps plotting fast and avoids a legend entry per point.
n_panels = len(tsne_by_label)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 5), squeeze=False)
for ax, (name, points) in zip(axes[0], tsne_by_label.items()):
    ax.scatter(points[:, 0], points[:, 1], s=8, alpha=0.6)
    ax.set(title=name, xlabel='t-SNE Component 1', ylabel='t-SNE Component 2')
    ax.grid(True)

fig.suptitle('t-SNE Visualization of Encoded/Vectorized Vectors')
fig.tight_layout()
plt.show()