In [59]:
import re
import string
import pickle
import logging

import pandas as pd
from scipy.stats import skew
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.corpus import abc
from nltk import pos_tag
import pandas.core.series as pdSeries

from config import *


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataReader():
    def __init__(self):
        self.mandatory_cols = config["mandatory_cols"]
        self.native_cols = config["data_source_native_cols"]
        self.data_source_path = config["source_data"]
        self.seperator = config["seperator"]

    def retrieve_data_source(self) -> pd.DataFrame:
        """
        """
        pdf = self.read_dataFile()
        self.validate_datasource(pdf)
        return pdf


    def read_dataFile(self):
        """
        """
        try:
            pdf = pd.read_csv(self.data_source_path, delimiter = self.seperator, names=self.native_cols)
        except Exception:
            raise Exception("cannot read the data_handling source!")
        return pdf

    def validate_datasource(self, pdf:  pd.DataFrame):
        """
        """
        if set(self.mandatory_cols).issubset(set(pdf.columns)):
            pass
        else:
            logger.ERROR("!!! Mandatory cols are not present in the data_handling source provided !!")
            raise Exception("Mandatory cols are not present in the data_handling source provided ")



class FeatEngineering(object):

    def __init__(self):
        self.run_mode = config["run_mode"]

        #self.punctuation = set(string.punctuation)
        #self.stop_words = set(stopwords.words('english'))
        #self.lambda_not_number = lambda x: not any(re.match("^\d*$", x.replace(punc, "")) for punc in self.punctuation)
        #self.lambda_no_punc = lambda x: self.no_punc(x)
        #self.accepted_POS = ["NN", "VB", "CD"]

        self.skewness_threshold = config["skewness_threshold"]
        self.fill_nan_approach = config["fill_nan_approach"]
        self.feature_cols = config["feature_col"]
        self.target_col = config["native_target_col"]
        self.final_feature_col = config['final_feature_col']
        self.pickle_folder = config["model_folder"]
        if self.run_mode == RUN_MODE.TRAIN.value:
            self.test_train_split = config["test_train_split_ratio"]
        logger.info(f"** doing feature engineering")



    def do_feature_engineering(self, pdf: pd.DataFrame):
        """

        """
        logger.info("** start feature engineering! **")
        if self.run_mode == RUN_MODE.TRAIN.value:
            self.checkSkewness(pdf=pdf)
            pdf = self.dealWithNan(pdf=pdf)
            pdf = self.uniteFeatureColumns(pdf=pdf)
            x_train, y_train, x_test, y_test = self.split_test_train(pdf=pdf)
            res = [x_train, y_train, x_test, y_test]
        elif self.run_mode == RUN_MODE.PREDICT.value:
            res = x
        else:
            raise Exception(" Selected RUN_MODE is not correct!...")


        logger.info(f"** FINISHED feature engineering!")

        return res


    def checkSkewness(self, pdf: pd.DataFrame):
        """
        To balance the target categories. after check the skewness of the target column, downsample
         is performed to balance the data_handling.
        """
        pdf_count   = pdf.groupby([self.target_col]).size().to_frame(name="counts").reset_index()
        skewness    = skew(pdf_count.values[:, 1])
        if_skewed 	= True if abs(skewness) > self.skewness_threshold else False

        if if_skewed:
            logger.warning("** Data is skewed. down-sampling or up-sampling is suggested")


    def dealWithNan(self, pdf: pd.DataFrame):
        """

        """
        if self.fill_nan_approach == "empty_string":
            pdf = pdf.fillna(value='')
        elif self.fill_nan_approach == "drop":
            pdf = pdf.dropna()
        else: 
            logger.ERROR("** selected Nan handdling approach is not valid")
        return pdf


    def uniteFeatureColumns(self, pdf: pd.DataFrame):
        pdf[self.final_feature_col] = pdf[self.feature_cols].agg(' '.join, axis=1)
        return pdf


    def split_test_train(self, pdf: pd.DataFrame):
        """
        Returns: x_train, y_train, x_test, y_test
        """

        x_train, x_test, y_train, y_test = train_test_split(pdf[[self.final_feature_col]], pdf[self.target_col], test_size=self.test_train_split, random_state=0)
        return x_train, y_train, x_test, y_test



class Model(object):

    def __init__(self):
        self.run_mode = config["run_mode"]
        self.model_folder = config["model_folder"]
        self.model_selection = config["model_selection"]
        #self.model_parameters = config['model_patameters']
        self.model_file_name = f"{self.model_folder}/model.pkl"
        self.final_feature_col = config['final_feature_col']

        self.model = None
        
        if self.run_mode == RUN_MODE.PREDICT.value:
            self.prediction_col = config["prediction_col"]
            self.output_file_name = config["output_file_name"]
            with open(f"{self.model_folder}/{config['label_encoder']}.pkl", "rb") as fp:
                self.le = pickle.load(fp)
        logger.info(f"** do model {self.run_mode}, active model is {self.model_selection} !!")


    def construct_preProcessing_pipline(self):
        descriptive_features_pipeline = Pipeline(steps=
            [
                ('CountVectorizer', CountVectorizer()),
                ('Tfidf', TfidfTransformer())
            ]
        )

        preprocessing_pipeline = ColumnTransformer(transformers=
            [
                ('num',descriptive_features_pipeline, self.final_feature_col)
            ]
        )
        return preprocessing_pipeline

    def construct_classifier_pipline(self):
        if self.model_selection == 'svm': 
            classifier_pipline = SGDClassifier()
        elif self.model_selection == 'naive_bayes': 
            classifier_pipline = MultinomialNB()
        elif self.model_selection == 'knn': 
            classifier_pipline = KNeighborsClassifier()
        elif self.model_selection == 'logistic_regression': 
            classifier_pipline = LogisticRegression()

        return classifier_pipline

    def construct_model_pipline(self, preprocessing_pipeline, classifier_pipline):
        pipe = Pipeline(steps=
            [
                ('preprocessor', preprocessing_pipeline),
                ('classifier', classifier_pipline)
            ]
        )
        return pipe

    
    def run(self, data):
        if self.run_mode == RUN_MODE.TRAIN.value:
            x_train, y_train, x_test, y_test = data
            print(y_test.head())
            pp   = self.construct_preProcessing_pipline()
            clf  = self.construct_classifier_pipline()
            pipe = self.construct_model_pipline(pp,clf)
            self.model = pipe.fit(x_train, y_train)
            Model.save_model(model=self.model, file_name=self.model_file_name)
            logger.info(f"saved model: {self.model_selection} to {self.model_file_name}")

            self.evaluate_model(x_test, y_test, self.model)
                    
        if self.run_mode == RUN_MODE.PREDICT.value:
            self.model = Model.load_model(model=self.model, file_name=self.model_file_name)
            x = data
            y = self.model.predict(x)
            logger.info(f"predicted values are: {y}")

            
    
                
    def evaluate_model(self, x_test: pdSeries.Series, y_test: pdSeries.Series, pipe:Pipeline):
        predicted = pipe.predict(x_test)
        logger.info("model accuracy is: %.3f \n" % pipe.score(x_test, y_test) )
        logger.info(metrics.classification_report(y_test, predicted, target_names=y_test.unique()))

    @staticmethod
    def save_model(model, file_name:str):
        """
        """
        if file_name != "":
            with open(file_name, 'wb') as fp:
                pickle.dump(model, fp)


    @staticmethod
    def load_model(file_name: str):
        try:
            with open(file_name, 'rb') as fp:
                return pickle.load(fp)
        except FileNotFoundError:
            raise FileNotFoundError("model doesn't exist! need to train first!")

In [60]:
data_handler = DataReader()
data = data_handler.retrieve_data_source()

feature_handler = FeatEngineering()
pdf = featureObj.do_feature_engineering(data)

model = Model()
model.run(pdf)


INFO:__main__:** doing feature engineering
INFO:__main__:** start feature engineering! **
INFO:__main__:** FINISHED feature engineering!
INFO:__main__:** do model train, active model is svm !!
INFO:__main__:saved model: svm to saved_models/model.pkl


3069           BICYCLES
1675     CONTACT LENSES
6363    WASHINGMACHINES
543     WASHINGMACHINES
3214    WASHINGMACHINES
Name: productgroup, dtype: object


INFO:__main__:model accuracy is: 0.998 

INFO:__main__:                 precision    recall  f1-score   support

       BICYCLES       1.00      1.00      1.00       611
 CONTACT LENSES       1.00      1.00      1.00       626
WASHINGMACHINES       1.00      1.00      1.00       602
     USB MEMORY       1.00      1.00      1.00       562

       accuracy                           1.00      2401
      macro avg       1.00      1.00      1.00      2401
   weighted avg       1.00      1.00      1.00      2401

