##### A stemmed CountVectorizer

In [1]:
import copy
from sklearn.feature_extraction.text import CountVectorizer

# Overide CountVectorizer to integrate stemming
class StemmedCountVectorizer(CountVectorizer):
    def __init__(self, stemmer, **kwargs):
        super(StemmedCountVectorizer, self).__init__(**kwargs)
        self.stemmer = stemmer

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (self.stemmer.stem(w) for w in analyzer(doc))

    def get_params(self, deep=True):
        params = super().get_params(deep=deep)
        cp = copy.copy(self)
        cp.__class__ = CountVectorizer
        params.update(CountVectorizer.get_params(cp, deep))
        return params

In [7]:
# -*- coding: utf-8 -*-

"""FakeNewsClassifier Trainer

This script conduct the following steps (overview):

1. Load the processes dataset from {PROJECT_DIR}/data/processed as dataframe
2. Concatenate title and text in one column
3. Split the dataset into train and test

4. Set up a sklearn pipeline with preprocessor and classifier for the combined column (title+text)
   and perform hyperparam search 
5. Extract best score and best_params for and log it
6. Predict on testset and log the score

7. Pickle best pipe to disk (final model)

"""
# %%
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()

import os
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score, precision_score
from dotenv import find_dotenv, load_dotenv
import nltk
from nltk.stem.snowball import GermanStemmer
from sklearn.feature_extraction.text import CountVectorizer
import dagshub
from joblib import dump


load_dotenv(find_dotenv())
#%% Load and split dataset
# INPUTFILE = os.path.join(os.getenv('PROJECT_DIR'), 'data', 'processed', 'fake_news_processed.csv')
INPUTFILE = os.path.join('data', 'datasets_merged.csv')
df = pd.read_csv(INPUTFILE, sep=';')
logger.info('Distribution of fake news in entire dataset: \n%s' % df.fake.value_counts(normalize=True))

X = df['title']+' '+df['text']
y = df['fake']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                    test_size=0.2,
                    stratify=y,
                    random_state=42)

# Download and init stopwordlist
nltk.download('stopwords')
STOPWORD_LIST = nltk.corpus.stopwords.words('german')

#print(X_train[100])

# %% Test with CountVectorizer
"""
count_vec = StemmedCountVectorizer(token_pattern=r'\b[a-zA-Z]{2,}\b',
                                max_features=100,
                                stop_words=STOPWORD_LIST,
                                lowercase=True,
                                stemmer=GermanStemmer(ignore_stopwords=True),
                                ngram_range=(2,2)
                        )

word_counts = count_vec.fit_transform(X_train).todense()

df_body = pd.DataFrame(word_counts, columns=count_vec.get_feature_names())
df_body_trans = pd.DataFrame(df_body.T.sum(axis=1), columns=['count'])
df_body_trans.sort_values(by='count', ascending=False).head(20)

"""

#%% Define param grid for tuning
param_grid = {'vectorizer__max_features':[500],
        'clf__n_estimators': [400],
        'clf__learning_rate': [0.9],
        'clf__max_depth': [5]}

# Constants for training
MODEL_PATH = os.getenv('MODEL_PATH')
CV_SCORING = 'f1'
CV_FOLDS = 3

# %%
with dagshub.dagshub_logger() as dagslog:
        # Pipeline
        pipe = Pipeline(steps=[
        ('vectorizer', StemmedCountVectorizer(stop_words=STOPWORD_LIST,
                                   token_pattern=r'\b[a-zA-Z]{2,}\b',
                                   stemmer=GermanStemmer(ignore_stopwords=True),
                                   lowercase=True)),
        ('clf', CatBoostClassifier(allow_writing_files=False))
        ])
        
        # pipe_title_tune.get_params().keys()
        logger.info('Start training estimator pipeline')
        cv_grid = GridSearchCV(pipe, param_grid, scoring=CV_SCORING, cv=CV_FOLDS, n_jobs=2)
        cv_grid.fit(X_train, y_train)
        #cv_grid.best_estimator_.named_steps['clf'].get_all_params()

        # Log and assign best score and params to var
        logger.info('Best params from GridSearchCV: %s' % str(cv_grid.best_params_))
        dagslog.log_hyperparams({'model_path': MODEL_PATH})
        dagslog.log_hyperparams({'cv_score': CV_SCORING})
        dagslog.log_hyperparams({'best_cv_params': cv_grid.best_params_})

        logger.info('Best %s from GridSearchCV: %s' % (CV_SCORING, str(cv_grid.best_score_)))
        dagslog.log_metrics({'best_cv_score': cv_grid.best_score_})
        
        # Predict on testdata
        y_pred = cv_grid.best_estimator_.predict(X_test)
        logger.info('f1_score for testdata: %s' % str(f1_score(y_test, y_pred)))
        dagslog.log_metrics({'f1_score_on_testdata': f1_score(y_test, y_pred)})
        logger.info('precision_score for testdata: %s' % str(precision_score(y_test, y_pred)))
        dagslog.log_metrics({'precision_score_on_testdata': precision_score(y_test, y_pred)})
        logger.info('recall_score for testdata: %s' % str(recall_score(y_test, y_pred)))
        dagslog.log_metrics({'recall_score_on_testdata': recall_score(y_test, y_pred)})
        logger.info('Classification report for testdata: \n%s' % classification_report(y_test, y_pred))
        logger.info('Confusion matrix for testdata: \n%s' % confusion_matrix(y_test, y_pred))

        # Dump model to disk
        dump(cv_grid.best_estimator_, MODEL_PATH)

INFO:root:Distribution of fake news in entire dataset: 
0    0.922928
1    0.077072
Name: fake, dtype: float64
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fjun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
INFO:root:Start training estimator pipeline


0:	learn: 0.1790363	total: 157ms	remaining: 1m 2s
1:	learn: 0.1292465	total: 171ms	remaining: 33.9s
2:	learn: 0.1143598	total: 184ms	remaining: 24.3s
3:	learn: 0.1028559	total: 196ms	remaining: 19.4s
4:	learn: 0.0949349	total: 209ms	remaining: 16.5s
5:	learn: 0.0891569	total: 221ms	remaining: 14.5s
6:	learn: 0.0860463	total: 233ms	remaining: 13.1s
7:	learn: 0.0828056	total: 245ms	remaining: 12s
8:	learn: 0.0804890	total: 258ms	remaining: 11.2s
9:	learn: 0.0802443	total: 268ms	remaining: 10.5s
10:	learn: 0.0773814	total: 281ms	remaining: 9.92s
11:	learn: 0.0743589	total: 294ms	remaining: 9.49s
12:	learn: 0.0742056	total: 304ms	remaining: 9.04s
13:	learn: 0.0712867	total: 317ms	remaining: 8.73s
14:	learn: 0.0710549	total: 327ms	remaining: 8.4s
15:	learn: 0.0693764	total: 339ms	remaining: 8.14s
16:	learn: 0.0692207	total: 350ms	remaining: 7.88s
17:	learn: 0.0672085	total: 362ms	remaining: 7.69s
18:	learn: 0.0662156	total: 374ms	remaining: 7.5s
19:	learn: 0.0660709	total: 385ms	remaining: 

176:	learn: 0.0272094	total: 2.2s	remaining: 2.77s
177:	learn: 0.0270789	total: 2.21s	remaining: 2.76s
178:	learn: 0.0270125	total: 2.22s	remaining: 2.75s
179:	learn: 0.0269998	total: 2.23s	remaining: 2.73s
180:	learn: 0.0269821	total: 2.25s	remaining: 2.72s
181:	learn: 0.0269769	total: 2.25s	remaining: 2.7s
182:	learn: 0.0268652	total: 2.27s	remaining: 2.69s
183:	learn: 0.0266784	total: 2.28s	remaining: 2.67s
184:	learn: 0.0266743	total: 2.29s	remaining: 2.66s
185:	learn: 0.0265352	total: 2.3s	remaining: 2.65s
186:	learn: 0.0265061	total: 2.31s	remaining: 2.63s
187:	learn: 0.0265027	total: 2.32s	remaining: 2.62s
188:	learn: 0.0264620	total: 2.33s	remaining: 2.6s
189:	learn: 0.0263202	total: 2.34s	remaining: 2.59s
190:	learn: 0.0263165	total: 2.35s	remaining: 2.58s
191:	learn: 0.0261984	total: 2.37s	remaining: 2.56s
192:	learn: 0.0260912	total: 2.38s	remaining: 2.55s
193:	learn: 0.0259652	total: 2.39s	remaining: 2.54s
194:	learn: 0.0259588	total: 2.4s	remaining: 2.52s
195:	learn: 0.025

340:	learn: 0.0168844	total: 4.07s	remaining: 704ms
341:	learn: 0.0168831	total: 4.08s	remaining: 691ms
342:	learn: 0.0168805	total: 4.09s	remaining: 679ms
343:	learn: 0.0168684	total: 4.1s	remaining: 667ms
344:	learn: 0.0168546	total: 4.11s	remaining: 655ms
345:	learn: 0.0167984	total: 4.12s	remaining: 643ms
346:	learn: 0.0167353	total: 4.13s	remaining: 631ms
347:	learn: 0.0167231	total: 4.14s	remaining: 619ms
348:	learn: 0.0166949	total: 4.15s	remaining: 607ms
349:	learn: 0.0166856	total: 4.16s	remaining: 595ms
350:	learn: 0.0166549	total: 4.17s	remaining: 583ms
351:	learn: 0.0166390	total: 4.19s	remaining: 571ms
352:	learn: 0.0165855	total: 4.2s	remaining: 559ms
353:	learn: 0.0165495	total: 4.21s	remaining: 547ms
354:	learn: 0.0165172	total: 4.22s	remaining: 535ms
355:	learn: 0.0164860	total: 4.23s	remaining: 523ms
356:	learn: 0.0164603	total: 4.24s	remaining: 511ms
357:	learn: 0.0164589	total: 4.25s	remaining: 499ms
358:	learn: 0.0163962	total: 4.26s	remaining: 487ms
359:	learn: 0.

INFO:root:Best params from GridSearchCV: {'clf__learning_rate': 0.9, 'clf__max_depth': 5, 'clf__n_estimators': 400, 'vectorizer__max_features': 500}
INFO:root:Best f1 from GridSearchCV: 0.8555504998353983


389:	learn: 0.0151265	total: 4.65s	remaining: 119ms
390:	learn: 0.0150010	total: 4.66s	remaining: 107ms
391:	learn: 0.0149992	total: 4.67s	remaining: 95.3ms
392:	learn: 0.0149916	total: 4.68s	remaining: 83.4ms
393:	learn: 0.0149502	total: 4.7s	remaining: 71.5ms
394:	learn: 0.0149196	total: 4.71s	remaining: 59.6ms
395:	learn: 0.0148455	total: 4.72s	remaining: 47.7ms
396:	learn: 0.0148343	total: 4.73s	remaining: 35.8ms
397:	learn: 0.0147797	total: 4.75s	remaining: 23.8ms
398:	learn: 0.0147034	total: 4.76s	remaining: 11.9ms
399:	learn: 0.0146892	total: 4.77s	remaining: 0us


INFO:root:f1_score for testdata: 0.8681434599156118
INFO:root:precision_score for testdata: 0.8878101402373247
INFO:root:recall_score for testdata: 0.849329205366357
INFO:root:Classification report for testdata: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     11599
           1       0.89      0.85      0.87       969

    accuracy                           0.98     12568
   macro avg       0.94      0.92      0.93     12568
weighted avg       0.98      0.98      0.98     12568

INFO:root:Confusion matrix for testdata: 
[[11495   104]
 [  146   823]]


ValueError: Second argument should be a filename or a file-like object, None (type <class 'NoneType'>) was given.

In [9]:
key = 'HOME'
value = os.getenv(key)
  
# Print the value of 'HOME'
# environment variable
print("Value of 'HOME' environment variable :", value) 
  
# Get the value of 'JAVA_HOME'
# environment variable
key = 'MODEL_PATH'
value = os.getenv(key)
  
# Print the value of 'JAVA_HOME'
# environment variable
print("Value of 'JAVA_HOME' environment variable :", value) 

Value of 'HOME' environment variable : None
Value of 'JAVA_HOME' environment variable : None
