In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# Install gensim if not already installed
!pip install gensim
from gensim.models import Word2Vec


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [9]:
# Load the raw CSV into a DataFrame. The file is read from a hard-coded
# absolute path and decoded as latin-1.
data_path = '/content/alldata_1_for_kaggle.csv'
df = pd.read_csv(data_path, encoding='latin-1')

# Quick sanity check of what was loaded: dimensions first, then column names.
print(f'Shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')

Shape: (987, 3)
Columns: ['Unnamed: 0', '0', 'a']


In [4]:
# Give the raw CSV headers descriptive names. An explicit old -> new mapping is
# used instead of positional assignment so that a change in column order cannot
# silently attach a name to the wrong column; labels that are already renamed
# are simply left untouched when the cell is run again.
df = df.rename(columns={
    'Unnamed: 0': 'Serial_Number',
    '0': 'Target_Labels',
    'a': 'Research_Text',
})

In [5]:
# Summarize the frame: the dtype of each column, then the number of missing
# values per column.
column_dtypes = df.dtypes
missing_counts = df.isna().sum()

print(column_dtypes)
print('\n Missing Values')
print(missing_counts)

Serial_Number     int64
Target_Labels    object
Research_Text    object
dtype: object

 Missing Values
Serial_Number    0
Target_Labels    0
Research_Text    0
dtype: int64


In [6]:
# Check for duplicate records. Serial_Number is left out of the comparison:
# it looks like a per-row identifier (TODO confirm), and including it would make
# every row distinct, so repeated label/text pairs would never be reported.
content_columns = ['Target_Labels', 'Research_Text']
duplicate_count = df.duplicated(subset=content_columns).sum()
print(f'Number of Duplicates: {duplicate_count}')

Number of Duplicates: 0


In [7]:
# Drop the serial-number column; the text-processing steps below do not use it.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=['Serial_Number'], errors='ignore')

In [11]:
# Select the article text for CBOW preprocessing. The column is referenced by
# its renamed label: after the rename step earlier in the notebook the raw
# header "a" no longer exists, so df["a"] raises KeyError on a top-to-bottom run.
cbow_texts = df["Research_Text"].astype(str)

# Fetch the NLTK stopword corpus and keep the English list as a set so the
# membership test in the tokenizer is a constant-time lookup.
nltk.download('stopwords') # Download stopwords
stop_words = set(stopwords.words("english"))

def cbow_clean_and_tokenize(text):
    """Normalize a raw document and split it into stopword-free tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Delete anything that starts with ``http`` or ``www`` up to the next
         whitespace (URLs).
      3. Replace every character that is not ``a``-``z`` or whitespace with a
         space. This removes punctuation, and also digits and any non-ASCII
         characters.
      4. Collapse whitespace runs to single spaces and trim the ends.
      5. Split on whitespace and discard tokens found in the module-level
         ``stop_words`` collection.

    Args:
        text: The document to process, as a ``str``.

    Returns:
        A list of lowercase alphabetic tokens in their original order; empty
        if nothing survives the cleaning.
    """
    cleaned = re.sub(r"http\S+|www\S+", "", text.lower())
    cleaned = re.sub(r"[^a-z\s]", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    kept = []
    for word in cleaned.split():
        if word not in stop_words:
            kept.append(word)
    return kept

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Clean and tokenize every document, producing one token list per document.
cbow_sentences = [cbow_clean_and_tokenize(document) for document in cbow_texts]

# Preview the first two tokenized documents.
cbow_sentences[:2]


[['thyroid',
  'surgery',
  'children',
  'single',
  'institution',
  'osama',
  'ibrahim',
  'almosallama',
  'ali',
  'aseerib',
  'ahmed',
  'alhumaida',
  'ali',
  'alzahranic',
  'saif',
  'alsobhib',
  'saud',
  'alshanafeybfrom',
  'adepartment',
  'surgery',
  'college',
  'medicine',
  'qassim',
  'university',
  'buraidah',
  'al',
  'qassim',
  'saudi',
  'arabia',
  'bdepartment',
  'surgery',
  'king',
  'faisal',
  'specialist',
  'hospital',
  'research',
  'center',
  'riyadh',
  'saudi',
  'arabia',
  'cdepartment',
  'medicine',
  'king',
  'faisal',
  'specialist',
  'hospital',
  'research',
  'center',
  'riyadh',
  'saudi',
  'arabia',
  'correspondence',
  'dr',
  'osama',
  'ibrahim',
  'almosallam',
  'department',
  'surgery',
  'college',
  'medicine',
  'qassim',
  'university',
  'po',
  'box',
  'buraidah',
  'al',
  'qassim',
  'saudi',
  'arabia',
  'osama',
  'iaahotmailcom',
  'orcid',
  'orcid',
  'citation',
  'almosallam',
  'oi',
  'aseeri',
  'al

In [13]:
# Write the corpus to disk: one document per line, tokens separated by spaces.
with open("cbow_preprocessed.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(" ".join(tokens) + "\n" for tokens in cbow_sentences)

print("CBOW corpus saved as cbow_preprocessed.txt")


CBOW corpus saved as cbow_preprocessed.txt
