In [3]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Embedding, GlobalAveragePooling1D
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
import nltk
import joblib

In [4]:
# Download the NLTK resources this notebook requests. Each package is fetched
# exactly once (the tagger was previously requested twice, which only produced
# an extra "already up-to-date" round trip).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Load the raw dataset; later cells read its 'title', 'text' and 'label' columns.
# NOTE(review): hard-coded absolute path — adjust when running outside this environment.
full_df = pd.read_csv('/content/data.csv')

In [7]:
# English stopword lookup, stored as a set for fast membership checks, and the
# WordNet lemmatizer instance reused by the preprocessing function.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: lowercase, strip punctuation, tokenize, drop English stopwords,
    lemmatize, then re-join the surviving tokens with single spaces.

    Args:
        text: Raw document text. Non-string input (for example the float NaN
            pandas uses for missing cells, or None) is treated as an empty
            document instead of raising on ``.lower()``.

    Returns:
        str: The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing values carry no text; return an empty document.
        return ''
    text = re.sub(r'[^\w\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = word_tokenize(text)  # Tokenization
    filtered_tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]  # Lemmatization
    return ' '.join(lemmatized_tokens)  # One joined string, not a token list

# Apply preprocessing on title and text.
# Missing titles/bodies are replaced with '' before concatenation: with a NaN on
# either side, `title + " " + text` evaluates to NaN, which would discard the
# field that is present and hand a float to preprocess_text.
full_df['clean_text'] = (
    full_df['title'].fillna('').astype(str) + " " + full_df['text'].fillna('').astype(str)
)
full_df['clean_text'] = full_df['clean_text'].apply(preprocess_text)


In [10]:
from collections import Counter
import itertools

# Flatten every cleaned document into one list of whitespace-separated tokens,
# then tally how often each token occurs across the whole corpus.
all_words = list(
    itertools.chain.from_iterable(doc.split() for doc in full_df['clean_text'])
)
word_counts = Counter(all_words)

print("Total unique words:", len(word_counts))

Total unique words: 205333


In [12]:
# Tokens that occur fewer than five times in the whole corpus.
rare_words = [token for token in word_counts if word_counts[token] < 5]
print(f"Number of rare words (appear <5 times): {len(rare_words)}")

Number of rare words (appear <5 times): 161602


In [13]:
import numpy as np

# Token counts ordered from most to least frequent, and the running share of
# all token occurrences covered by the top-k most frequent tokens.
word_freq_values = np.array(sorted(word_counts.values(), reverse=True))
cumulative_freq = word_freq_values.cumsum() / word_freq_values.sum()

# argmax on a boolean array yields the first True position, i.e. the first rank
# whose cumulative share reaches 95%; +1 turns the 0-based index into a count.
max_features = (cumulative_freq >= 0.95).argmax() + 1
print(f"Suggested max_features to cover 95%: {max_features}")

Suggested max_features to cover 95%: 19817


# Pretrained Word2Vec (Google News)


In [23]:
import gensim.downloader as api
# Load Pretrained Word2Vec
# The vectors are fetched by name through gensim's downloader API.
print("Loading Google News Word2Vec model...")
word2vec_model = api.load("word2vec-google-news-300")
# Persist the loaded model object to a local pickle file with joblib.
# NOTE(review): this is presumably a very large file; gensim's own save/load
# may be a better fit for these vectors — confirm before relying on the pickle.
joblib.dump(word2vec_model, 'word2vec_google_news.pkl')

Loading Google News Word2Vec model...


['word2vec_google_news.pkl']

In [24]:
# Hold out 20% of the documents before any embedding work. The split is
# stratified on the label column and seeded so it is reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    full_df['clean_text'],
    full_df['label'],
    test_size=0.2,
    stratify=full_df['label'],
    random_state=42,
)

# Every document is truncated or zero-padded to this many word vectors.
max_len = 700

def get_sentence_embedding(tokens, model, vector_size=300, max_length=None):
    """Build a fixed-size matrix of word vectors for one document.

    In-vocabulary tokens are looked up in ``model`` in order; the first
    ``max_length`` hits fill the rows of the result and any remaining rows
    stay zero (padding). Out-of-vocabulary tokens are skipped.

    Args:
        tokens: Iterable of word tokens. A plain string is split on whitespace
            first; iterating it directly would look up single characters
            rather than words.
        model: Mapping-like embedding lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Dimensionality of each word vector.
        max_length: Number of rows in the result. Defaults to the notebook-level
            ``max_len`` when omitted.

    Returns:
        numpy.ndarray: float32 array of shape ``(max_length, vector_size)``.
    """
    if max_length is None:
        max_length = max_len  # fall back to the notebook-wide sequence length
    if isinstance(tokens, str):
        tokens = tokens.split()

    # Pre-allocating in float32 keeps the dtype consistent whether the document
    # is padded or truncated, and avoids promoting the vectors to float64.
    embedding = np.zeros((max_length, vector_size), dtype=np.float32)
    filled = 0
    for word in tokens:
        if filled == max_length:
            break  # everything past max_length would be truncated anyway
        if word in model:
            embedding[filled] = model[word]
            filled += 1
    return embedding

In [None]:
# Convert text to embeddings.
# X_train_text / X_test_text hold whitespace-joined strings, so each document is
# split into word tokens before lookup; passing the raw string would iterate it
# character by character and embed single letters instead of words.
# float32 is requested explicitly so the stacked array is not promoted to float64.
# NOTE: each result has shape (n_docs, max_len, 300) and is fully held in memory.
X_train = np.array(
    [get_sentence_embedding(text.split(), word2vec_model, 300) for text in X_train_text],
    dtype=np.float32,
)
X_test = np.array(
    [get_sentence_embedding(text.split(), word2vec_model, 300) for text in X_test_text],
    dtype=np.float32,
)

In [None]:
# Build Conv1D Model
# Two convolution stages over the (max_len, 300) embedding matrix, an average
# over the sequence axis, then a dense classifier with one softmax unit per
# distinct training label.
model = Sequential()
model.add(Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(max_len, 300)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(len(set(y_train)), activation='softmax'))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train the model
# NOTE(review): the held-out test split is also passed as validation data, so the
# per-epoch validation metrics are not independent of the later test evaluation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

In [None]:
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy]. Unpack both values: formatting the list itself with
# ':.4f' raises TypeError.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Conv1D Model Accuracy: {test_acc:.4f}")

In [None]:
# predict() returns one softmax probability per class for every sample, while
# classification_report expects class labels, so take the most probable class.
# Assumes y_test holds integer class ids 0..n_classes-1, as the
# sparse_categorical_crossentropy loss used for training requires.
prdictions = model.predict(X_test)
predicted_labels = np.argmax(prdictions, axis=1)
print(classification_report(y_test, predicted_labels))

In [None]:
# Persist the trained network to disk.
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format — confirm it is the intended format.
model.save("conv1d_google_embeddings_model.h5")