# First classic ML models training
---

## Notebook contents ##  
* Simple linear models training
* Selection of the best model, method
* Simple improvements of model, GridSearch for example
* Conclusions

`NOTE`: the macro-averaged `f1_score` is selected as the target metric; other metrics can be used as additional ones.

---

In [None]:
# import ast 
import re
import warnings 
# import pymorphy2

# import kagglehub
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import pandas as pd

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score
from scipy.sparse import hstack, csr_matrix


# suppress every warning category for the rest of the session:
warnings.filterwarnings("ignore")

# seed for the random operations in this notebook (e.g. row sampling):
random_state = 42

Data taken from `text_domain_features_0.ipynb`

In [None]:
# path to the feature dump and its load; the first CSV column becomes the index:
df_path = 'dump_features_include_numeric.csv'
df = pd.read_csv(filepath_or_buffer=df_path, index_col=0)

Now we take dataset_id 0 and 1 for training, and build the test set from the other datasets (all of dataset 7 plus a random sample from datasets 2-5): 

In [None]:
# select data from the cleanest datasets (0, 1) to train; rows without a label are unusable.
# dropna is chained (instead of inplace=True on a slice of df) so that a derived
# frame is never mutated in place and no chained-assignment warning is triggered:
df_train = df[df['dataset_id'].isin([0, 1])].dropna(subset=['is_toxic'])

# test part 1: data from the same source (vk), with the best results that BERT classifier predicted.
# unlabeled rows are dropped here too, otherwise metric computation fails on NaN targets:
df_test = df[df['dataset_id'] == 7].dropna(subset=['is_toxic'])

# test part 2: a random sample of labeled rows from datasets 2-5:
n_test_add = 60000
df_test_pool = df[df['dataset_id'].isin([2, 3, 4, 5])].dropna(subset=['is_toxic'])
df_test = pd.concat(
    [
        df_test,
        df_test_pool.sample(
            # never request more rows than the pool holds (sample() would raise):
            n=min(n_test_add, len(df_test_pool)),
            random_state=random_state,
        ),
    ]
)

print(f'df_train shape: {df_train.shape}')
print(f'df_test shape: {df_test.shape}')


In [None]:
# bar chart of the is_toxic class counts in the train set (rows without a label excluded):
plt.figure(figsize=(6, 4))
sns.countplot(
    data=df_train.dropna(subset='is_toxic'),
    x='is_toxic',
    palette=['mediumseagreen', 'salmon'],
)
plt.title('Distribution of is_toxic variable in the chosen train')
plt.show()

In [None]:
# bar chart of the is_toxic class counts in the test set:
plt.figure(figsize=(6, 4))
sns.countplot(
    data=df_test,
    x='is_toxic',
    palette=['mediumseagreen', 'salmon'],
)
plt.title('Distribution of is_toxic variable in the chosen test')
plt.show()

In [None]:
def train_and_test_models(
        train_texts: pd.Series,
        train_targets: pd.Series,
        train_num_features: pd.DataFrame,

        test_texts: pd.Series,
        test_targets: pd.Series,
        test_num_features: pd.DataFrame,

        # for results params:
        use_num_features: bool = True,
        texts_col: str = '',
        downsample: bool = False,
        random_state: int = 42
    ):

    """Train baseline linear models on TF-IDF features and evaluate them on the test set.

    The main goal is to find the best baseline model.

    Args:
        train_texts, test_texts: text columns that are vectorized with TF-IDF
            (the vectorizer is fitted on the train texts only).
        train_targets, test_targets: target labels for the train / test rows.
        train_num_features, test_num_features: additional numeric feature frames;
            only used when ``use_num_features`` is True.
        use_num_features: whether to append the numeric features to the TF-IDF matrix.
        texts_col: name of the text column used; only stored in the result records.
        downsample: whether the data was downsampled; only stored in the result records.
        random_state: seed passed to the models that use randomness
            (LinearSVC, SGDClassifier), so repeated runs give the same metrics.

    Returns:
        A list with one dict per model holding the keys ``model``, ``macro_f1``,
        ``micro_f1``, ``use_num_features``, ``texts_col``, ``downsampled_to_equals``
        and ``class_<label>_precision`` / ``class_<label>_recall`` for every
        numeric class label found in the classification report.
    """

    # apply tf-idf (unigrams + bigrams, rare and near-ubiquitous terms dropped):
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )
    X_train_tfidf = vectorizer.fit_transform(train_texts)
    X_test_tfidf = vectorizer.transform(test_texts)

    if use_num_features:
        # make the test columns match the train columns (same set, same order);
        # columns missing in test are filled with 0:
        train_num_features, test_num_features = train_num_features.align(
            test_num_features, axis=1, join="left", fill_value=0
        )

        # tf-idf is sparse, so convert numeric features into sparse format too:
        train_num_sparse = csr_matrix(train_num_features.values)
        test_num_sparse = csr_matrix(test_num_features.values)

        # concat vectorized text and numeric features; request CSR directly
        # because hstack returns COO by default:
        X_train = hstack([X_train_tfidf, train_num_sparse], format="csr")
        X_test = hstack([X_test_tfidf, test_num_sparse], format="csr")
    else:
        X_train = X_train_tfidf
        X_test = X_test_tfidf

    # baseline models with default hyperparameters (no class weighting);
    # the best one is selected later and tuned separately.
    # LogisticRegression's default solver is deterministic, so it needs no seed:
    models = {
        "Logistic Regression": LogisticRegression(),
        "Linear SVM (LinearSVC)": LinearSVC(random_state=random_state),
        "SGD Classifier": SGDClassifier(random_state=random_state)
    }

    # metric results:
    results = []

    # train and eval:
    for name, model in models.items():
        model.fit(X_train, train_targets)
        preds = model.predict(X_test)

        macro_f1 = f1_score(test_targets, preds, average="macro")
        micro_f1 = f1_score(test_targets, preds, average="micro")

        # compose cls report and take the per-class metrics from it;
        # non-numeric keys ('accuracy', 'macro avg', ...) are skipped:
        report = classification_report(test_targets, preds, output_dict=True)
        class_metrics = {}
        for cls in sorted(report.keys()):
            if cls.replace('.', '', 1).isdigit():
                # labels may be rendered as '0.0' / '1.0', normalize them to 0 / 1:
                cls_text = int(float(cls))
                class_metrics[f"class_{cls_text}_precision"] = report[cls]["precision"]
                class_metrics[f"class_{cls_text}_recall"] = report[cls]["recall"]

        # update the record by values and add to the list:
        record = {
            "model": name,
            "macro_f1": macro_f1,
            "micro_f1": micro_f1,
            "use_num_features": use_num_features,
            "texts_col": texts_col,
            "downsampled_to_equals": downsample
        }
        record.update(class_metrics)
        results.append(record)

    return results


In [None]:
# init results to compose the metrics:
results = []

target_col = 'is_toxic'
composed_features = [
    'count_spp', 'count_rpp', 'punct_after_space', 'has_emoji', 'has_emoticon',
    'has_capslock', 'is_all_lower', 'has_punctuation_spp', 'has_punctuation_rpp',
    'has_fence_ironic_style', 'count_profanity', 'has_pronouns', 'starts_with_cap',
    'has_url', 'has_number', 'has_mention', 'has_hashtag', 'ends_with_dot',
    'has_emotional_sym', 'has_repeating_letters_3plus'
]


def _downsample_negatives(frame, label_col, seed):
    """Return `frame` with the label==0 rows randomly reduced to the number of label==1 rows."""
    negatives = frame[frame[label_col] == 0]
    positives = frame[frame[label_col] == 1]
    # never request more rows than exist (sample() would raise otherwise):
    n_keep = min(len(positives), len(negatives))
    return pd.concat([negatives.sample(n=n_keep, random_state=seed), positives], axis=0)


for downsample in [False, True]:
    if downsample:
        # as a temporary solution, balance both sets by downsampling the is_toxic == 0 rows:
        df_train_copy = _downsample_negatives(df_train, target_col, random_state)
        df_test_copy = _downsample_negatives(df_test, target_col, random_state)
    else:
        df_train_copy = df_train.copy()
        df_test_copy = df_test.copy()

    for use_num in [True, False]:  # select whether to choose numerical features or not

        for texts_col in ['text_raw', 'text_encoded_profanity', 'text_del_stop_words', 'text_without_tokens']:

            # prepare data: a row is usable only if its features, text and target are all present
            required_cols = composed_features + [texts_col, target_col]
            df_train_clean = df_train_copy.dropna(subset=required_cols)
            df_test_clean = df_test_copy.dropna(subset=required_cols)

            # select appropriate columns:
            train_texts = df_train_clean[texts_col]
            train_targets = df_train_clean[target_col]
            train_num_features = df_train_clean[composed_features]

            test_texts = df_test_clean[texts_col]
            test_targets = df_test_clean[target_col]
            test_num_features = df_test_clean[composed_features]

            # run models:
            model_reports = train_and_test_models(
                train_texts,
                train_targets,
                train_num_features,
                test_texts,
                test_targets,
                test_num_features,

                # params to register results:
                use_num_features=use_num,
                texts_col=texts_col,
                downsample=downsample
            )
            results.extend(model_reports)

# convert to df once, after all experiments have finished:
results_df = pd.DataFrame(results)

Let's take a look at the results and find the best model and method so far.

In [None]:
# rank all runs by macro F1 (best first) and display the top_n of them;
# further metric columns could be appended to `by` as tie-breakers:
top_n = 8
results_df.sort_values(by='macro_f1', ascending=False, inplace=True)
results_df.head(top_n)

In [None]:
# heatmap of the pairwise phik matrix between experiment settings and metrics.
# NOTE(review): `DataFrame.phik_matrix` is not a pandas method; it is presumably
# registered by the third-party `phik` package on import, and no such import is
# visible in this file -- confirm it is imported before this cell runs.
sns.heatmap(results_df.phik_matrix())
plt.suptitle('The impact of experimental approaches on metrics')
# render explicitly, like the other plot cells, instead of echoing the Text object:
plt.show()

Short conclusions are: 
* The two approaches used, `use_num_features` (adding the numerical features that were composed empirically in the text domain) and `downsampled_to_equals` (downsampling the `is_toxic == 0` rows to balance the set), affect precision and recall more strongly than they affect the macro_f1 score.
* `downsampled_to_equals` and `use_num_features` affect recall more than precision
* The most dangerous metric is recall

The best configuration: 
* Uses the Linear SVM (LinearSVC) model
* Adds the numerical features to the train and test sets
* Does not use downsampling for train or test

and it has metrics: 

In [None]:
# first row of the sorted results table:
results_df.iloc[:1]