In [14]:
import os
os.chdir('../')
%pwd

'd:\\Python\\Industry level\\EndtoEnd combination'

In [15]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class FeatureEngineeringConfig:
    root_dir: Path
    train_data_path: Path
    test_data_path: Path

In [16]:
from resumeScreening.constants import *
from resumeScreening.utils.common import read_yaml,create_directories

In [17]:
class CofigurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_feature_engineering_config(self) -> FeatureEngineeringConfig:
        config = self.config.feature_engineering

        create_directories([config.root_dir])

        feature_engineering_config = FeatureEngineeringConfig(
            root_dir= config.root_dir,
            train_data_path = config.train_data_path,
            test_data_path=config.test_data_path
        )

        return feature_engineering_config

In [18]:


import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from resumeScreening import logger
from pathlib import Path

Note: you may need to restart the kernel to use updated packages.


In [24]:
class FeatureEngineering:
    def __init__(self, config: FeatureEngineeringConfig):
        self.config = config

    def run(self):
        logger.info("Reading train and test data...")
        train_df = pd.read_csv(self.config.train_data_path)
        test_df = pd.read_csv(self.config.test_data_path)

        X_train_texts = train_df["Cleaned_Resume"]
        y_train_labels = train_df["Category"]
        X_test_texts = test_df["Cleaned_Resume"]
        y_test_labels = test_df["Category"]

         # === ML: TF-IDF ===
        logger.info("Fitting TF-IDF Vectorizer...")
        tfidf = TfidfVectorizer(max_features=3000)
        X_train_tfidf = tfidf.fit_transform(X_train_texts)
        X_test_tfidf = tfidf.transform(X_test_texts)

         # Save TF-IDF vectorizer

        with open(Path(self.config.root_dir)/'vectorizer.pkl','wb') as f:
            pickle.dump(tfidf,f)

        # === Label Encoding ===
        logger.info("Encoding labels...")
        label_encoder = LabelEncoder()
        y_train_encoded = label_encoder.fit_transform(y_train_labels)
        y_test_encoded = label_encoder.transform(y_test_labels)

        #Save label encoder
        with open(Path(self.config.root_dir)/'labelEncoder.pkl','wb') as f:
            pickle.dump(label_encoder,f)

         # Save ML features
        with open(Path(self.config.root_dir) / "X_train_tfidf.pkl", "wb") as f:
            pickle.dump(X_train_tfidf, f)
        with open(Path(self.config.root_dir) / "X_test_tfidf.pkl", "wb") as f:
            pickle.dump(X_test_tfidf, f)
        np.save(Path(self.config.root_dir )/ "y_train_ml.npy", y_train_encoded)
        np.save(Path(self.config.root_dir) / "y_test_ml.npy", y_test_encoded)

        # === DL: Tokenizer + Padding ===
        logger.info("Fitting tokenizer for DL...")
        tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
        tokenizer.fit_on_texts(X_train_texts)

        X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
        X_test_seq = tokenizer.texts_to_sequences(X_test_texts)

        X_train_pad = pad_sequences(X_train_seq, maxlen=300)
        X_test_pad = pad_sequences(X_test_seq, maxlen=300)

        y_train_dl = to_categorical(y_train_encoded)
        y_test_dl = to_categorical(y_test_encoded)

        # Save DL features
        with open(Path(self.config.root_dir )/ "tokenizer.pkl", "wb") as f:
            pickle.dump(tokenizer, f)
        np.save(Path(self.config.root_dir)/ "X_train_pad.npy", X_train_pad)
        np.save(Path(self.config.root_dir)/ "X_test_pad.npy", X_test_pad)
        np.save(Path(self.config.root_dir)/ "y_train_dl.npy", y_train_dl)
        np.save(Path(self.config.root_dir)/ "y_test_dl.npy", y_test_dl)

        logger.info("[✓] Feature engineering completed and files saved.")

In [22]:
%pwd
os.chdir('d:\\Python\\Industry level\\EndtoEnd combination\\AI-Resume-Screening-with-ML-ops')

In [25]:
try:
    config = CofigurationManager()
    get_feature_engineering_config = config.get_feature_engineering_config()
    feature_engineering = FeatureEngineering(config= get_feature_engineering_config)
    feature_engineering.run()
except Exception as e:
    raise e

[2025-08-06 00:06:13,457: INFO: yaml file: config\config.yaml loaded successfully]
[2025-08-06 00:06:13,470: INFO: yaml file: params.yaml loaded successfully]
[2025-08-06 00:06:13,473: INFO: Created directory at: artifacts]
[2025-08-06 00:06:13,479: INFO: Created directory at: artifacts/feature_engineering]
[2025-08-06 00:06:13,479: INFO: Reading train and test data...]
[2025-08-06 00:06:13,617: INFO: Fitting TF-IDF Vectorizer...]
[2025-08-06 00:06:13,875: INFO: Encoding labels...]
[2025-08-06 00:06:13,890: INFO: Fitting tokenizer for DL...]
[2025-08-06 00:06:14,156: INFO: [✓] Feature engineering completed and files saved.]


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Ankit\AppData\Local\Programs\Python\Python311\Lib\logging\__init__.py", line 1113, in emit
    stream.write(msg + self.terminator)
  File "C:\Users\Ankit\AppData\Local\Programs\Python\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2713' in position 33: character maps to <undefined>
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Python\Industry level\EndtoEnd combination\AI-Resume-Screening-with-ML-ops\resume\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\Python\Industry level\EndtoEnd combination\AI-Resume-Screening-with-ML-ops\resume\Lib\site-packages\traitlets\config\appli