In [18]:
# Library functions
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
import multiprocessing
import os
import joblib
from tqdm import tqdm

# Our functions
from utils import TextPreprocessor, FeatureGenerator, remove_nan_questions, get_param_grid
# patch_sklearn()  # to speed up scikit-learn

In [19]:
# Seed reused by the train/test splits and the seeded estimators in this notebook.
SEED = 123
# Directory (relative to the working directory) where fitted pipelines are saved.
MODELS_DIR = "models"
# Folder that holds the Quora Question Pairs CSV files.
_path_folder_quora = "Datasets/QuoraQuestionPairs"

In [20]:
# Make sure the output directory for persisted models exists, and report which
# of the two cases applied.
if os.path.exists(MODELS_DIR):
    print(f"Folder '{MODELS_DIR}' already exists.")
else:
    os.makedirs(MODELS_DIR)
    print(f"Folder '{MODELS_DIR}' created successfully.")

Folder 'models' created successfully.


In [21]:
# Read the raw training CSV from the Quora dataset folder.
train_df = pd.read_csv(os.path.join(_path_folder_quora, "quora_train_data.csv"))

# Features are the two question columns; the target is the "is_duplicate" column.
# remove_nan_questions comes from utils — presumably it drops pairs with a missing
# question and keeps x/y aligned; verify against its definition.
x_train, y_train = remove_nan_questions(
    train_df.loc[:, ["question1", "question2"]],
    train_df.loc[:, "is_duplicate"],
)

# First split: hold out 5% of the cleaned data as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.05,
    random_state=SEED,
)
# Second split: hold out 5% of the remaining data as the validation set.
x_train, x_validation, y_train, y_validation = train_test_split(
    x_train,
    y_train,
    test_size=0.05,
    random_state=SEED,
)

In [22]:
# Baseline ("simple") solution: lower-cased text -> 'cv' features aggregated with
# 'stack' -> liblinear logistic regression.
# NOTE(review): 'cv' is presumably a count-vectorizer extractor; confirm in utils.FeatureGenerator.
pipe = Pipeline(
    steps=[
        ("preprocessor", TextPreprocessor(to_lower=True)),
        (
            "generator",
            FeatureGenerator(exts=("cv",), aggs=("stack",), extra_features=()),
        ),
        (
            "classifier",
            LogisticRegression(solver="liblinear", max_iter=1000, random_state=SEED),
        ),
    ],
    verbose=True,  # print the elapsed time of each step while fitting
)

pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.7s
[Pipeline] ......... (step 2 of 3) Processing generator, total=  16.9s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  34.5s


In [23]:
# Persist the fitted baseline pipeline under MODELS_DIR.
# The file name is a plain string: it has no placeholders, so no f-string is needed.
# NOTE(review): the ".pk1" suffix looks like a typo for ".pkl"; it is kept unchanged
# because other code may load the file by this exact name — confirm before renaming.
joblib.dump(pipe, os.path.join(MODELS_DIR, "simple_solution.pk1"))

['models\\simple_solution.pk1']

In [24]:
# Master switch for the rest of the notebook:
#   True  -> run the multi-model grid search and save the best estimator;
#   False -> fit and save only the hand-configured LogisticRegression pipeline.
GRID_SEARCH: bool = False

In [25]:
if GRID_SEARCH:
    # Candidate classifiers for the grid search, keyed by the name that is later
    # passed to get_param_grid and used in the saved file names.
    # Estimators that accept a random_state are seeded with SEED so repeated runs
    # are reproducible, matching how LogisticRegression is configured.
    models = {
            "AdaBoostClassifier": AdaBoostClassifier(random_state=SEED),
            "RandomForestClassifier": RandomForestClassifier(random_state=SEED),
            "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
            "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
            "BernoulliNB": BernoulliNB(),
            "GaussianNB": GaussianNB(),
            "KNeighborsClassifier": KNeighborsClassifier(),
            "SVC": SVC(),
            "LogisticRegression": LogisticRegression(max_iter=1000, random_state=SEED),
            "GradientBoostingClassifier": GradientBoostingClassifier(random_state=SEED),
            # Optional third-party models; the matching imports are commented out too.
            #"XGBClassifier": XGBClassifier(n_jobs=multiprocessing.cpu_count() - 1),
            #"CatBoostClassifier": CatBoostClassifier(silent=True),
        }

In [26]:
if not GRID_SEARCH:
    # Without the grid search only one classifier is trained.
    # NOTE(review): the captured output of the fit that uses this estimator shows
    # "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. 1000 iterations were not enough
    # for convergence — consider raising max_iter or changing the solver.
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=SEED),
    }

In [27]:
if not GRID_SEARCH:
    # "Improved" solution: fuller text normalisation, two feature extractors
    # ('cv_2w', 'tf_idf_2w') aggregated with 'stack' and 'absolute', and the
    # LogisticRegression instance taken from `models`.
    pipe = Pipeline(
        steps=[
            (
                "preprocessor",
                TextPreprocessor(
                    remove_stop_words=True,
                    remove_punctuation=True,
                    to_lower=True,
                    apply_stemming=True,
                    british=False,
                ),
            ),
            (
                "generator",
                FeatureGenerator(
                    exts=("cv_2w", "tf_idf_2w"),
                    aggs=("stack", "absolute"),
                ),
            ),
            ("classifier", models["LogisticRegression"]),
        ],
        verbose=True,  # print the elapsed time of each step while fitting
    )
    pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 5.8min
[Pipeline] ......... (step 2 of 3) Processing generator, total= 1.8min
[Pipeline] ........ (step 3 of 3) Processing classifier, total=17.3min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
if not GRID_SEARCH:
    # Persist the fitted improved pipeline. The path is built with os.path.join,
    # like the other model dumps in this notebook, instead of a hard-coded "/".
    joblib.dump(pipe, os.path.join(MODELS_DIR, "improved_solution.joblib"))

In [29]:
if GRID_SEARCH:
    fitted_models = {}  # model name -> best pipeline found by its grid search
    scores = {}         # model name -> best mean cross-validated ROC AUC

    # Leave one core free for the rest of the system, but never request 0 workers:
    # cpu_count() - 1 is 0 on a single-core machine and joblib rejects n_jobs=0.
    n_workers = max(1, multiprocessing.cpu_count() - 1)

    for name, model in tqdm(models.items()):
        # define pipeline given a model
        pipe = Pipeline(
            [('preprocessor', TextPreprocessor()),
             ('generator', FeatureGenerator()),
             ('classifier', model)],
            verbose=True)
        # get grid of parameters to search
        grid = get_param_grid(name, SEED)
        grid_search = GridSearchCV(
            pipe,
            param_grid=grid,
            scoring="roc_auc",
            cv=2,
            verbose=10,
            n_jobs=n_workers,
            error_score="raise",  # fail fast instead of silently scoring NaN
        )

        # fit grid search with pipeline and grid
        grid_search.fit(x_train, y_train)

        # keep the best estimator and its score for the final comparison
        fitted_models[name] = grid_search.best_estimator_
        scores[name] = grid_search.best_score_

        # persist the whole fitted search object (not only the best estimator)
        joblib.dump(grid_search, os.path.join(MODELS_DIR, f"fitted_{name}.pk1"))

In [30]:
if GRID_SEARCH:
    # Pick the model with the highest CV score. Iterating a dict yields its keys,
    # so a bare max(scores) would return the alphabetically greatest model name
    # rather than the best-scoring one; key=scores.get compares by score.
    best_model_name = max(scores, key=scores.get)
    best_model = fitted_models[best_model_name]
    print(f"Best model found in the grid search is {best_model_name}, with a CV score of {scores[best_model_name]:.4f}")
    fitted_pipe = best_model
    # Save the winning pipeline under the same file name as the non-grid-search path.
    joblib.dump(fitted_pipe, os.path.join(MODELS_DIR, "improved_solution.joblib"))