In [3]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

class BestFeatureExtract:
    """
    Rank feature columns with a RandomForestClassifier and keep the top ones.

    The selected column positions are stored in ``important_indices`` and can
    be persisted to / restored from a pickle file named
    ``best_<no_feature>_of_<all_features>.pkl`` in the working directory.
    """

    def __init__(self, no_feature):
        """
        Initialize the BestFeatureExtract class.
        
        Parameters:
        - no_feature (int): The number of features to extract.
        """
        self.no_feature = no_feature
        # Positions of the selected columns; empty until 'training' or
        # 'load_indices' has run.
        self.important_indices = []
    
    def training(self, x_train, y_train, random_state=None):
        """
        Train the RandomForestClassifier to identify important features.
        
        Parameters:
        - x_train (ndarray): The training data.
        - y_train (ndarray): The training labels.
        - random_state (int or None): Forwarded to RandomForestClassifier.
          The default (None) leaves the forest unseeded, so the selected
          indices may differ between runs; pass an int for repeatable output.
        """
        model = RandomForestClassifier(random_state=random_state)
        model.fit(x_train, y_train)
        # argsort is ascending, so the most important columns sit at the end.
        ranked = np.argsort(model.feature_importances_)
        # Use an explicit start offset: a plain [-k:] slice would return every
        # column when k == 0 and an arbitrary tail when k is negative.
        keep = max(0, min(int(self.no_feature), ranked.size))
        self.important_indices = ranked[ranked.size - keep:]
    
    def best_feature_extract(self, x_data):
        """
        Extract the best features from the dataset using the important indices.
        
        Parameters:
        - x_data (ndarray or DataFrame): The dataset from which features are
          extracted. Objects exposing ``.iloc`` are indexed by position.
        
        Returns:
        - Dataset reduced to the selected features (same container kind as
          the input).
        
        Raises:
        - ValueError: If no indices have been computed or loaded yet.
        """
        # The indices may be a list (initial state / old pickles) or a NumPy
        # array (after training); testing `.size` works for both, whereas
        # `not array` is ambiguous for multi-element arrays.
        indices = np.asarray(self.important_indices)
        if indices.size == 0:
            raise ValueError("Important indices are not initialized. Run 'training' or 'load_indices' first.")
        if hasattr(x_data, "iloc"):
            # The indices are column positions, not labels.
            return x_data.iloc[:, indices]
        return x_data[:, indices]
    
    def save_indices(self, all_features):
        """
        Save the important feature indices to a file.
        
        Parameters:
        - all_features (int): Total number of features in the original dataset.
          Only used to build the file name.
        """
        file_name = f'best_{self.no_feature}_of_{all_features}.pkl'
        with open(file_name, 'wb') as file:
            pickle.dump(self.important_indices, file)
        print(f"Indices saved to {file_name}")
    
    def load_indices(self, all_features):
        """
        Load the important feature indices from a file.
        
        Parameters:
        - all_features (int): Total number of features in the original dataset.
          Only used to build the file name.
        
        Raises:
        - FileNotFoundError: If the expected pickle file does not exist.
        """
        file_name = f'best_{self.no_feature}_of_{all_features}.pkl'
        try:
            with open(file_name, 'rb') as file:
                # SECURITY: pickle.load can execute arbitrary code; only load
                # index files that this project wrote itself.
                self.important_indices = pickle.load(file)
            print(f"Indices loaded from {file_name}")
        except FileNotFoundError as exc:
            raise FileNotFoundError(
                f"File '{file_name}' not found. Ensure the indices have been saved first."
            ) from exc


In [5]:
from gensim.models import Word2Vec
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_selection import mutual_info_classif

class NLP_Converter:
    """
    Train a Word2Vec model and turn token sequences into averaged,
    L2-normalised embedding features.
    """

    def __init__(self, vector_size, window, mint_count, worker, sentences):
        """
        Train the underlying Word2Vec model.
        
        Parameters:
        - vector_size (int): Forwarded to Word2Vec as ``vector_size``; also the
          length of the zero vector returned for inputs with no known token.
        - window: Forwarded to Word2Vec as ``window``.
        - mint_count: Forwarded to Word2Vec as ``min_count``.
        - worker: Forwarded to Word2Vec as ``workers``.
        - sentences: Forwarded to Word2Vec as ``sentences``.
        """
        # Initialize the Word2Vec model
        self.model = Word2Vec(
            sentences=sentences, 
            vector_size=vector_size, 
            window=window, 
            min_count=mint_count, 
            workers=worker
        )
        self.vector_size = vector_size

    def get_vector(self, text):
        """
        Average the Word2Vec vectors of the tokens in ``text``.
        
        Parameters:
        - text (iterable): Tokens to look up. Iterating a plain string yields
          single characters -- NOTE(review): confirm the column holds token
          lists if word-level tokens are intended.
        
        Returns:
        - ndarray: Mean of the vectors of all tokens found in the vocabulary,
          or a zero vector of length ``vector_size`` when none are found or
          the input is missing (None / NaN).
        """
        # pandas represents missing cells as None/NaN, which cannot be iterated.
        if text is None or (isinstance(text, float) and np.isnan(text)):
            return np.zeros(self.vector_size)

        vectors = [self.model.wv[word] for word in text if word in self.model.wv]
        if not vectors:
            # Same array type as the normal path, so callers see one return type.
            return np.zeros(self.vector_size)
        return np.mean(vectors, axis=0)

    def create_nlp_extracted_feature(self, df, column="url"):
        """
        Build a feature frame from the embeddings of one column of ``df``.
        
        Parameters:
        - df (DataFrame): Source data; it is not modified.
        - column (str): Column whose values are passed to ``get_vector``.
          Defaults to 'url'.
        
        Returns:
        - DataFrame: Indexed like ``df``, with a 'vector' column holding each
          row's normalised embedding and columns 'vector_1' .. 'vector_<n>'
          holding its individual components.
        """
        # Embed and normalise into a separate Series so the caller's frame is
        # left untouched.
        vectors = df[column].apply(lambda text: normalize([self.get_vector(text)])[0])
        
        # Create column names for individual vector components
        vector_columns = [f'vector_{i+1}' for i in range(self.vector_size)]
        
        # Reuse df's index: with a fresh RangeIndex the column-wise concat
        # below would misalign rows whenever df is not indexed 0..n-1.
        vectorized_data = pd.DataFrame(vectors.to_list(), columns=vector_columns, index=df.index)
        
        # Result holds only the 'vector' column plus its expanded components
        return pd.concat([vectors.rename("vector").to_frame(), vectorized_data], axis=1)


In [4]:
import pandas as pd
# Load the preprocessed dataset that the cells below build features from.
data = pd.read_csv(filepath_or_buffer="./dataset/v1/preprocess.csv")

In [None]:
# Train Word2Vec on the 'preprocess_url' column, then build the embedding
# feature frame. Despite its name, `nlp_preprocesser` ends up holding that
# DataFrame, not the converter object.
nlp_preprocesser = NLP_Converter(
    vector_size=1000,
    window=5,
    mint_count=1,
    worker=4,
    sentences=data["preprocess_url"],
).create_nlp_extracted_feature(data)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Feature matrix: one column per embedding dimension. The 'vector' column
# stores a whole array per row and cannot be fed to the classifier.
# NOTE(review): the Word2Vec model was trained on 'preprocess_url' while the
# embeddings are computed from 'url' -- confirm that this is intended.
vector_columns = [column for column in nlp_preprocesser.columns if column != "vector"]
nlp_feature = nlp_preprocesser[vector_columns].to_numpy()

best_feature_extraction = BestFeatureExtract(200)
best_feature_extraction.training(nlp_feature, data["label"])
best_feature_extraction.save_indices(nlp_feature.shape[1])

# Map the learned column positions back to column names so the selected
# features keep their labels and row index when joined to `data`.
best_columns = [vector_columns[position] for position in best_feature_extraction.important_indices]
best_nlp_feature = nlp_preprocesser[best_columns]

# Drop the array-valued 'vector' column (it would be written as text) and any
# copies of the selected columns left by a previous run, so re-running this
# cell does not duplicate columns.
data = pd.concat(
    [data.drop(columns=["vector", *best_columns], errors="ignore"), best_nlp_feature],
    axis=1,
)
# NOTE(review): this overwrites the file the notebook loads its input from.
data.to_csv("./dataset/v1/preprocess.csv", index=False, header=True, sep=',')