In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# Load the preprocessed dataset
file_path = r'../data/preprocessing/preprocessed.csv'
df = pd.read_csv(file_path)

# Replace missing titles with an empty string.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the df['col'] selection is chained assignment,
# which pandas 2.x warns about and which leaves df untouched once
# Copy-on-Write is active, so NaN would reach the vectorizer and make it fail.
df['Processed_Title'] = df['Processed_Title'].fillna('')

# Configure the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000,  # Keep at most the 5000 most frequent terms
                                   ngram_range=(1, 2),  # Unigrams and bigrams
                                   min_df=3,  # Drop terms that occur in fewer than 3 titles
                                   max_df=0.85,  # Drop terms that occur in more than 85% of titles
                                   sublinear_tf=True)  # Use 1 + log(tf) instead of raw term frequency

# Learn the vocabulary and build the sparse document-term matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Processed_Title'])

# Convert the matrix to a DataFrame with one column per term.
# NOTE: toarray() materialises the full (documents x features) dense matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Save the features to a CSV file
output_file = r'../data/feature_extraction/TFIDF_Features.csv'
tfidf_df.to_csv(output_file, index=False)

print(f"TF-IDF features saved to {output_file}")


TF-IDF features saved to ../data/feature_extraction/TFIDF_Features.csv


In [16]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

# Load the preprocessed dataset
file_path = r'../data/preprocessing/preprocessed.csv'
df = pd.read_csv(file_path)

# Replace missing titles with an empty string.
# Assign the result back instead of using fillna(..., inplace=True) on the
# df['col'] selection: that form is chained assignment, which pandas 2.x warns
# about and which does not modify df under Copy-on-Write. A NaN left in the
# column is a float, so the .split() call below would raise AttributeError.
df['Processed_Title'] = df['Processed_Title'].fillna('')

# Whitespace-tokenize every preprocessed title (an empty title gives [])
tokenized_text = [text.split() for text in df['Processed_Title']]

# Train the Word2Vec model on the tokenized titles.
# min_count=1 keeps every token in the vocabulary; workers=4 trains with
# four threads.
word2vec_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Define a function to average word vectors for a text
def document_vector(doc, model=None):
    """Return the mean word vector of the in-vocabulary tokens of ``doc``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.
    model : optional
        Object exposing ``wv`` (supports ``word in wv`` and ``wv[list_of_words]``)
        and ``vector_size``. Defaults to the module-level ``word2vec_model``,
        so existing ``document_vector(doc)`` calls keep working.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``. When no token of ``doc`` is
        in the vocabulary (including an empty ``doc``), a float32 zero vector
        is returned.
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv

    # Drop out-of-vocabulary words; they have no vector to contribute.
    known_words = [word for word in doc if word in keyed_vectors]
    if not known_words:
        # float32 keeps the fallback dtype consistent with the averaged
        # vectors (gensim stores its embeddings as float32), instead of
        # mixing in float64 rows.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[known_words], axis=0)

# Compute one averaged vector per document and hold it in a temporary column
df = df.assign(Doc_Vector=[document_vector(tokens) for tokens in tokenized_text])

# Expand the per-document vectors into one numeric column per dimension
vector_df = pd.DataFrame(df['Doc_Vector'].tolist())

# Append the vector columns to the original columns, leaving out the
# temporary 'Doc_Vector' column
df = pd.concat([df, vector_df], axis=1).drop(columns='Doc_Vector')

# Write the combined frame to CSV
output_file_path = r'../data/feature_extraction/word_vectors.csv'
df.to_csv(output_file_path, index=False)


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch

# Load the preprocessed dataset
file_path = r'../data/preprocessing/preprocessed.csv'
df = pd.read_csv(file_path)

# Replace NaN values in 'Processed_Title' with an empty string.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# df['col'] selection: that form is chained assignment, which pandas 2.x warns
# about and which does not modify df under Copy-on-Write, so NaN (a float)
# would be handed to the tokenizer.
df['Processed_Title'] = df['Processed_Title'].fillna("")

# Initialize the pretrained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode text using BERT
def bert_encode(text):
    """Encode ``text`` with BERT and return the embedding of its first token.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        The text to encode; inputs longer than 512 tokens are truncated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, hidden_size)`` holding the last hidden state at
        position 0 (the ``[CLS]`` token for BERT tokenizers).
    """
    # A single sequence needs no padding: padding every title out to 512
    # tokens only multiplies the cost of the forward pass, while the padded
    # positions are masked out of attention anyway.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: disable autograd so no computation graph is built and
    # kept alive for each encoded title.
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep the hidden state of the first token of each sequence in the batch.
    return outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

# Encode every processed title; each result is the array returned by bert_encode
bert_embeddings = df['Processed_Title'].apply(bert_encode)

# Take the first row of each returned array and stack the rows into a
# DataFrame with one column per embedding dimension
first_rows = [embedding[0] for embedding in bert_embeddings]
bert_embeddings_df = pd.DataFrame(first_rows)

# Place the embedding columns next to the original columns
final_df = pd.concat([df, bert_embeddings_df], axis=1)

# Write the combined frame to a new CSV file
output_file_path = r'../data/feature_extraction/BERT_Features.csv'
final_df.to_csv(output_file_path, index=False)

print(f"BERT features dataset saved to {output_file_path}")