# Features engineering.v0
---
## Notebook contents ##
This notebook shows an example of data **normalization** and **feature composition** in the text domain. After it is executed, the results are:
* New columns in the merged data.csv: preprocessed versions of the initial data and the newly created features. 
* Conclusions about how the created features correlate with the target variable and how we can use them. 
---

In [None]:
# import ast 
import re
import pymorphy2

# import kagglehub
import numpy as np
import pandas as pd 
import seaborn as sns 
from tqdm import tqdm
import matplotlib.pyplot as plt 
# import great_expectations as gx
# import great_expectations.expectations as gxe 


## 1. Normalize text and encode some patterns
The base data for training classical ML algorithms is fully cleaned, simplified and shortened text, split into words and (optionally) lemmatized. 

In [None]:
# Path to the raw input CSV that is loaded in the next cell
# (relative to the notebook's working directory).
df_common_csv = 'data/raw/df_common.csv'

In [None]:
# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv(df_common_csv, index_col=0)
print('Initial shape', df.shape)

# drop_duplicates returns a new frame, so the result has to be re-assigned;
# otherwise the duplicated texts stay in df and both printed shapes are equal.
df = df.drop_duplicates(subset='text_raw')
print('Shape after drop duplicates', df.shape)
df.head()

### Process the existing values in the square brackets
Raw datasets may contain their own encoding or tokenizing styles,  
so analyze and transform it into the unified format

In [None]:
from collections import Counter

def calc_num_of_square_br(column: pd.Series, top_n: int = 20):
    """Count the most frequent values enclosed in square brackets.

    Args:
        column: Series of texts; missing values are skipped.
        top_n: How many of the most frequent bracketed values to return
            (20 by default, as before).

    Returns:
        A list of ``(value, count)`` tuples ordered from the most to the
        least frequent; ``value`` is the text between ``[`` and ``]``.
    """

    # Innermost, non-empty bracket content only: nested brackets are excluded
    pattern = re.compile(r'\[([^\[\]]+)\]')

    # Feed the matches straight into the counter instead of materialising
    # an intermediate Series of lists and a flattened copy of it
    counter = Counter()
    for text in column.dropna():
        counter.update(pattern.findall(text))

    return counter.most_common(top_n)

In [None]:
# Inspect the most frequent bracketed values in the raw texts before any replacement is applied
calc_num_of_square_br(df['text_raw'])

In [None]:
# replace 'Ссылка' на [LINK]: 
patterns_to_replace = [
    r'\[ссылка заблокирована по решению администрации проекта\]',
    r'\[Ссылка\]',
    r'\[ссылка\]'
]

replace_count = 0
for pattern in patterns_to_replace:
    count = df['text_raw'].str.count(pattern).sum()
    replace_count += count
    df['text_raw'] = df['text_raw'].str.replace(pattern, '[URL]', regex=True)

In [None]:
# Defines which token replaces each kind of non-informative substring in the texts.
# The link placeholders found in the raw data were already converted to [URL] above.

# Only the "url", "num", "mention", "hashtag" and "email" keys are read by
# map_noninformatives; "repeat_punct" is not used by it.
mapping_dict = {
    "url": "[URL]",
    "num": "[NUM]", # integers, decimals and NUMBER placeholders
    "mention": "[MNT]",
    "hashtag": "[HSG]",
    "email": "[EML]",
    "repeat_punct": "[RPP]",
    # "emoticon": "[EMOTICON]",
    # "emoji": "[EMOJI]"
}

### Process non-informatives: URLs, hashtags, numbers etc.
In this part of the code, we clean the non-informative substrings that make the texts noisy.   
These substrings can be deleted without information loss   
(but this will also be checked by the correlation analysis in p.2).

In [None]:
import html

def map_noninformatives(text, mapping_dict: dict):
    """Replace non-informative substrings with the tokens from ``mapping_dict``.

    Processing order (the order matters):
      1. HTML clean-up: ``<br>`` becomes a space, other tags are removed,
         entities are decoded, whitespace is collapsed and stripped.
         It runs first so that numeric entities such as ``&#39;`` are decoded
         before the hashtag/number regexes can corrupt them.
      2. URLs -> ``mapping_dict['url']``.
      3. Emails -> ``mapping_dict['email']``. They are handled before mentions,
         otherwise ``@domain`` inside an address is taken for a mention.
      4. Mentions (``@name`` and ``id123|name``) -> ``mapping_dict['mention']``.
      5. Hashtags -> ``mapping_dict['hashtag']``.
      6. Numbers and ``NUMBER`` placeholders -> ``mapping_dict['num']``.

    Args:
        text: Raw text.
        mapping_dict: Must contain the keys ``url``, ``email``, ``mention``,
            ``hashtag`` and ``num``.

    Returns:
        The cleaned text with the substrings above replaced by tokens.

    Note:
        The regular expressions were generated by an LLM; review them before
        extending. A URL that only appears inside a tag attribute
        (``<a href="...">``) is removed together with the tag.
    """

    def clean_html(text):
        """Strip HTML markup, decode entities and normalise whitespace."""
        # Replace <br> and its variations with a space
        text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)

        # Remove all other HTML tags
        text = re.sub(r'<[^>]+>', '', text)

        # Decode HTML entities (e.g. &quot;, &amp;, &#39;)
        text = html.unescape(text)

        # Remove extra spaces and trim
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    # Markup first: none of the replacements below adds or removes whitespace,
    # so the text stays collapsed and stripped until the end.
    text = clean_html(text)

    # Replace urls:
    text = re.sub(r"http\S+|www\S+", mapping_dict.get('url'), text)

    # Replace emails (before mentions, see the docstring):
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', mapping_dict.get('email'), text)

    # Replace mentions:
    text = re.sub(r"[@]\w+", mapping_dict.get('mention'), text)
    text = re.sub(r'id\d+\|[^\s]+', mapping_dict.get('mention'), text)  # "id123|name" style mentions

    # Replace hashtags:
    text = re.sub(r"[#]\w+", mapping_dict.get('hashtag'), text)

    # Replace numbers (integer or decimal) and NUMBER placeholders:
    text = re.sub(r'\b\d+(\.\d+)?\b|NUMBER|number', mapping_dict.get('num'), text, flags=re.IGNORECASE)

    return text

# Register progress_apply on pandas objects with a readable progress-bar label
tqdm.pandas(desc="Mapping non-informative symbols")

# New column: the raw text with URLs, mentions, hashtags, numbers, emails and HTML markup normalised
df['text_encoded_noninfrm'] = df['text_raw'].progress_apply(lambda r: map_noninformatives(r, mapping_dict))

In [None]:
df.head()

In [None]:
# As a result, the frequent mentions and other specific patterns are now replaced by tokens;
# re-run the bracket statistics on the new column to check it
calc_num_of_square_br(df['text_encoded_noninfrm'])

### Process emojis, emoticons
Here we are encoding some special substrings that may be informative and can be used as features, like emoji or emoticons.

In [None]:
import json

def save_dict_to_json(data_dict, filename):
    """Write ``data_dict`` to ``filename`` as UTF-8 JSON.

    The output uses a 4-space indent and keeps non-ASCII characters as they
    are (no ``\\uXXXX`` escaping).
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(data_dict, ensure_ascii=False, indent=4))

In [None]:
import emoji

# Encoding dict to replace its values in the text: 
encoding_emoji = dict()

# Encoding dict of emoticons to replace its values in the text: 
encoding_emoticon = dict()

def map_emoji_emoticons(text: str,
                        mapping_dicts: tuple[dict, dict],
                        emoji_regrow=r'',
                        token_emoji='EMJ',
                        token_emoticon='EMT'):
    """Replace emojis and emoticons in ``text`` with numbered tokens.

    Args:
        text: Text to encode.
        mapping_dicts: ``(encoding_emoji, encoding_emoticon)``. Both dicts map
            the original substring to its token and are updated in place, so
            that the same emoji/emoticon gets the same token in every text.
        emoji_regrow: Regular expression matching emoticons. An empty pattern
            (the default) disables emoticon encoding.
        token_emoji: Token prefix for emojis (``[EMJ_n]``).
        token_emoticon: Token prefix for emoticons (``[EMT_n]``).

    Returns:
        The text with emojis and emoticons replaced by tokens.
    """
    encoding_emoji, encoding_emoticon = mapping_dicts

    def encode_emoji(encoding_text: str):
        found = {item['emoji'] for item in emoji.emoji_list(encoding_text)}
        # Longest first, so that a short emoji is never replaced inside a longer
        # sequence it is a part of (e.g. a base emoji inside its skin-tone
        # variant); the secondary key makes the order deterministic.
        for e in sorted(found, key=lambda item: (-len(item), item)):
            if e not in encoding_emoji:
                encoding_emoji[e] = f"[{token_emoji}_{len(encoding_emoji)}]"
            encoding_text = encoding_text.replace(e, encoding_emoji[e])
        return encoding_text

    def encode_emoticon(encoding_text: str, regrow=emoji_regrow):
        # An empty pattern matches the empty string everywhere; replacing ''
        # would insert a token between every pair of characters.
        if not regrow:
            return encoding_text

        emoticon_pattern = re.compile(regrow)

        def to_token(match):
            emo = match.group(0)
            if not emo:  # zero-width match of a user-supplied pattern
                return emo
            if emo not in encoding_emoticon:
                encoding_emoticon[emo] = f"[{token_emoticon}_{len(encoding_emoticon)}]"
            return encoding_emoticon[emo]

        # Split on already existing [TOKEN]s: with a capturing group the
        # odd-indexed items are the tokens themselves. They are kept as is,
        # otherwise e.g. "83" inside "[EMJ_83]" would be taken for an emoticon.
        segments = re.split(r'(\[[A-Za-z0-9_]+\])', encoding_text)

        # One left-to-right pass per segment: ids follow the order of
        # appearance instead of the arbitrary iteration order of a set.
        return ''.join(
            segment if index % 2 else emoticon_pattern.sub(to_token, segment)
            for index, segment in enumerate(segments)
        )

    text = encode_emoji(text)
    text = encode_emoticon(text)
    return text


In [None]:
# Emoticon regexp (LLM-generated, then corrected by hand).
# The caret is escaped everywhere: a bare ^ is a start-of-string anchor, which made
# "^_^" match only as "_^" and "^‿^" never match at all. Longer alternatives come
# first, so "^_^" is not cut down to its "_^" tail.
reg_row = r"""(?x)  # verbose mode
(?:                             # group of emoticons
    (?::|;|=|8)                 # leading eyes for Western style
    (?:-)?                      # optional nose
    (?:\)|\(|D|P|p|O|o|3|/|\\|\||\*|\$|@)   # mouth / expression
        |
    (?:\^_\^|\^‿\^|\^\^|_\^|˘‿˘)   # simple happy eyes/mouth combos
        |
    (?:T_T|;_;|;\-\;|>_<|>\.<|>_>|<_<)  # sad/embarrassed/frustrated
        |
    (?:<3|♥|♡)                # heart symbols
        |
    (?:¯\\_\(ツ\)_/¯|¯\\_\(ಠ_ಠ\)_/¯)  # shrug / disapproval special ones
        |
    (?:uwu|OwO|UwU|owo)        # internet‑emoticon style
)
"""

tqdm.pandas(desc="Mapping emojis, emoticons")

# Compose the new column; encoding_emoji / encoding_emoticon are filled in place
# by map_emoji_emoticons while the texts are processed
df['text_encoded_emoj_emotic'] = df['text_encoded_noninfrm'].progress_apply(
    lambda r: map_emoji_emoticons(r, (encoding_emoji, encoding_emoticon), reg_row)
)
print('Found unique emoji:', len(encoding_emoji))
print('Found unique emoticons:', len(encoding_emoticon))

# Persist the substring -> token mappings; they are read back in the
# types-of-feature correlation part below
save_dict_to_json(encoding_emoji, 'encoding_emoji.json')
save_dict_to_json(encoding_emoticon, 'encoding_emoticon.json')

### Process punctuation
Punctuation processing includes the following steps: 
1. Find the unique variations of repeated punctuation (`...`, `!!!`, etc.). This step also processes the residual (or rare) emoticons. 
2. Encode them and add them to the mapping dicts


In [None]:
# Punctuation encoding part 
encoding_rep_punct = dict()
encoding_sep_punct = dict()


def map_punctuation(
        text: str, 
        mapping_rep_dict: dict, 
        mapping_sep_dict: dict, 
        rep_regexp=r"([!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]{2,})",
        nonrep_regexp=r"[!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]",
        token_sep_punct='SPP', # separate punctuatuin 
        token_seq_punct='RPP' # repeating punctuation
    ) -> str:
    """
    Encode both repeating and single punctuation marks with mapping dictionaries.
    Sequences inside any [ ... ] (even unclosed) are ignored.
    Already existing [TOKEN]-like patterns are ignored too.
    """


    def is_inside_token(pos: int) -> bool:
        """Check if position is inside [TOKEN]-like region."""
        return any(start <= pos < end for start, end in token_spans)
    
    def is_protected(pos: int) -> bool:
        """Return True if position is inside any protected region (token or [ ... ])."""
        
        return is_inside_token(pos) or any(start <= pos < end for start, end in bracket_spans)

    def repl_repeat(match):
        """Handle repeating punctuation"""

        seq = match.group(1)
        start = match.start()

        if is_protected(start):
            return seq

        # Unique punctuation marks, preserving order
        unique_chars = ''.join(sorted(set(seq), key=seq.index))
        normalized = unique_chars[0] if len(unique_chars) == 1 else unique_chars

        if normalized not in mapping_rep_dict:
            token = f"[{token_seq_punct}_{len(mapping_rep_dict)}]"
            mapping_rep_dict[normalized] = token
        else:
            token = mapping_rep_dict[normalized]

        return token

    def repl_single(match):
        """Handle separate punctuation"""

        ch = match.group(0)
        start = match.start()

        if is_protected(start):
            return ch

        if ch not in mapping_sep_dict:
            token = f"[{token_sep_punct}_{len(mapping_sep_dict)}]"
            mapping_sep_dict[ch] = token
        else:
            token = mapping_sep_dict[ch]

        return token
    
    # Compile patterns: 
    rep_pattern = re.compile(rep_regexp)
    sep_pattern = re.compile(nonrep_regexp)

    # Find existing [TOKEN]-like regions: 
    token_spans = []
    for match in re.finditer(r"\[[A-Za-z0-9_]+\]", text):
        token_spans.append((match.start(), match.end()))

    # Detect generic bracket regions (for [ ... ] ), excluding [TOKEN] ones: 
    bracket_spans = []
    open_pos = None
    for i, ch in enumerate(text):
        if is_inside_token(i):
            continue  # Skip positions inside known tokens entirely
        if ch == '[':
            if open_pos is None:
                open_pos = i
        elif ch == ']' and open_pos is not None:
            bracket_spans.append((open_pos, i + 1))
            open_pos = None
    if open_pos is not None:  # text ends with unclosed '['
        bracket_spans.append((open_pos, len(text)))

    # Apply replacements: 
    new_text = rep_pattern.sub(repl_repeat, text)
    new_text = sep_pattern.sub(repl_single, new_text)

    return new_text


tqdm.pandas(desc="Mapping punctuations")

# Encode repeating and non-repeating punctuation; both mapping dicts are filled in place: 
df['text_encoded_punct'] = df['text_encoded_emoj_emotic'].progress_apply(
    lambda r: map_punctuation(r, encoding_rep_punct, encoding_sep_punct)
)

print('Found unique signs sequences:', len(encoding_rep_punct))
print('Found separated unique signs:', len(encoding_sep_punct))

# Persist the punctuation -> token mappings for the correlation analysis below
save_dict_to_json(encoding_rep_punct, 'encoding_rep_punct.json')
save_dict_to_json(encoding_sep_punct, 'encoding_sep_punct.json')

In [None]:
# pip install slaviclean==0.1.1

### Process profanity words

<!-- `WARNING`: 
The `map_slang_and_rude` preprocessing relies on lemmatization, and it works slower than the full BERT-based model inference.  
But the main benefit of classical ML algorithms is their better and faster performance, and we don't want to lose it. 

So there are two variants to resolve this problem: 
1. Faster inference, worse model quality (by default).
2. Slower inference, better model quality (optional) -->

In [None]:
# Read the previously composed set of bad-word lemmas (one lemma per line):
profanity_file = 'bad_words_lemmas.txt'
profanities = list()
with open(profanity_file, 'r', encoding="utf-8") as f:
    # A real set: map_profanity checks every Cyrillic word of every text against it,
    # so membership has to be O(1) rather than a scan over a list.
    # Blank lines are skipped, they can never be equal to a lemma.
    profanities_set = {line.strip() for line in f if line.strip()}
print('Founf N profanities: ', len(profanities_set))

# Init base pymorphy analyzer (used by map_profanity to lemmatize words): 
morph = pymorphy2.MorphAnalyzer()

# Lemma -> token mapping, filled in place by map_profanity
encoding_profanities = dict()

def map_profanity(
        text: str, 
        profanities: list,
        mapping_dict=None, 
        token_profanity='PRF' # profanity 
        ) -> str:
    """
    Replace Russian profanity in ``text`` with numbered tokens.

    Every Cyrillic word is lower-cased and lemmatized with the module-level
    ``morph`` analyzer (first parse variant); if the lemma is in
    ``profanities``, the word is replaced by the ``[PRF_n]`` token of that lemma.

    Args:
        text: Text to process.
        profanities: Collection of profanity lemmas. Only membership is tested,
            so passing a set keeps the lookup cheap.
        mapping_dict: Lemma -> token mapping, updated in place so that ids are
            shared between calls. With the default ``None`` a throw-away dict
            is used and the numbering starts from 0 for this call only.
        token_profanity: Token prefix.

    Returns:
        The text with profanities replaced by tokens.
    """

    # The default used to be dereferenced as is, which raised TypeError
    # ("argument of type 'NoneType' is not iterable") on the first profanity found.
    if mapping_dict is None:
        mapping_dict = {}

    def repl(match) -> str:
        word = match.group(0)
        lemma = morph.parse(word.lower())[0].normal_form

        if lemma not in profanities:
            return word

        if lemma not in mapping_dict:
            mapping_dict[lemma] = f"[{token_profanity}_{len(mapping_dict)}]"
        return mapping_dict[lemma]

    # Cyrillic words only (apostrophe allowed inside)
    pattern = re.compile(r"\b[А-Яа-яЁё']+\b")
    
    return pattern.sub(repl, text)


tqdm.pandas(desc="Mapping profanities")
# New column: profanities replaced by [PRF_n] tokens; encoding_profanities is filled in place
df['text_encoded_profanity'] = df['text_encoded_punct'].progress_apply(
    lambda r: map_profanity(text=r, profanities=profanities_set, mapping_dict=encoding_profanities)
)
print('Found unique profanities:', len(encoding_profanities))

# Persist the lemma -> token mapping for the correlation analysis below
save_dict_to_json(encoding_profanities, 'encoding_profanities.json')

In [None]:
df.head()

### Process 1st, 2nd-person pronouns (todo)

### Delete stop words (as the last, optional column)


In [None]:
from stop_words import get_stop_words

# A set, because membership is tested for every word of every text
russian_stopwords = set(get_stop_words('russian'))
# The text is split on whitespace only, so a stop word that is glued to a token
# (e.g. directly followed by a punctuation token) is compared together with
# that token and therefore kept.
df['text_del_stop_words'] = df['text_encoded_profanity'].apply(
    lambda x: ' '.join(
        word for word in x.split() if word.lower() not in russian_stopwords
    )
)

In [None]:
df.head()

### Plot wordcloud after processing 

In [None]:
# Delete all tokens. A token is replaced by a space, not by an empty string:
# words that were separated only by punctuation ("word[SPP_0]word") would
# otherwise be glued into one. Extra spaces are collapsed afterwards.
df['text_no_brackets'] = (
    df['text_del_stop_words']
    .str.replace(r'\[.*?\]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df.head()

### 
As the last part, ensure that the most frequent words make sense (code taken from `dataset_eda.ipynb`): 

In [None]:
from wordcloud import WordCloud

# One lower-cased string built from all token-free texts of the dataset
text = ' '.join(df['text_no_brackets']).lower()

# Token removal and whitespace clean-up are already done in the text_no_brackets column:
# text_cleaned = re.sub(r'\[.*?\]', '', text)

# Delete spaces: 
# text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()

# NOTE(review): despite the variable name, the cloud is built from every text,
# no filter by is_toxic is applied here.
wordcloud_toxic = WordCloud(width=800, height=400, background_color='white',
                              colormap='Reds', max_words=100).generate(text)
_, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.imshow(wordcloud_toxic, interpolation='bilinear')
axes.axis('off')
axes.set_title('Word Cloud', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Checkpoint: save the frame with all preprocessed columns (the index is written as the first column)
df.to_csv('dump_features_0.csv')

All composed tokens that we can use in feature composition are: 
* `[URL]` - url
* `[NUM]` - number 
* `[MNT]` - mention
* `[HSG]` - hashtag
* `[EML]` - email 
* `[SPP_n]` - n-th type of separate (single) punctuation 
* `[RPP_n]` - n-th type of repeating punctuation 
* `[EMJ_n]` - n-th type of emoji
* `[EMT_n]` - n-th type of emoticon 
* `[PRF_n]` - n-th type of profanity (its lemma)

## 2. Define handcrafted features and calculate their correlations with the target

The handcrafted features are built on top of the previous steps and may have the following variations: 


**Punctuation:**
1. Count of separate punctuations in phrase 
2. Count of repeating punctuations in phrase 
3. Is punctuation after spaces in phrase  
(for example: '*Some phrase , another part ,  etc*')

**Emoji, emoticons:**  
1. Is an emoji included  
2. Is an emoticon included  


**Tonalities in the text structure:**
1. Is capslock included (like a screaming tone)   
Note: tokens like [SOME_TOKEN] must be excluded from this check  
2. Are all parts in lowercase  
'*this is example. i'm talking a bit tired maybe. the end.*'
3. Is punctuation used  
'*this is another example talking so fast maybe*'
4. Is the '*FeNcE IrOnIc StYlE*' (alternating letter case) included 

**Tonalities in the lexical content:**
1. Count of profanity words included
2. Are pronouns included (column **todo**)  
Explanation: if the phrase contains both a second-person pronoun and some profanity, the message is likely to be offensive => toxic 

In [None]:
# Restart from the checkpoint written in part 1; the first CSV column is the saved index
df = pd.read_csv('dump_features_0.csv', index_col=0)

In [None]:
# Feature composing part 

def is_all_lower(text):
    """Return 1 if the text without ``[...]`` tokens is entirely lowercase, else 0.

    A text that is empty after the tokens are removed, or that has no cased
    characters at all, gives 0.
    """
    stripped = re.sub(r'\[.*?\]', '', text)
    if not stripped:
        return 0
    return 1 if stripped.islower() else 0

def starts_with_cap(text):
    """Return 1 if more than half of the sentences start with a capital letter, else 0.

    ``[...]`` tokens are removed first; sentences are the non-blank pieces
    between ``.``, ``!`` and ``?``. A text without any sentence gives 0, and
    so does a tie.
    """
    cleaned = re.sub(r'\[.*?\]', '', text).strip()

    capitalised = 0
    total = 0
    for piece in re.split(r'[.!?]', cleaned):
        sentence = piece.strip()
        if not sentence:
            continue
        total += 1
        if sentence[0].isupper():
            capitalised += 1

    if total == 0:
        return 0

    # Majority vote between capitalised and non-capitalised sentence starts
    return int(capitalised > total - capitalised)

def has_caps(word_list):
    """Return True if any word is fully uppercase and does not start with a ``[...]`` token."""
    token_like = re.compile(r'\[.*?\]')
    return any(
        word.isupper() and token_like.match(word) is None
        for word in word_list
    )

 
df_features = df.copy()

# Empty texts are read back from CSV as NaN; an empty string keeps the .str
# accessors, the apply() helpers and .astype(int) below from failing on them.
df_col_base = df_features['text_encoded_profanity'].fillna('')

# The same texts with every [TOKEN] blanked out, for the checks that must look
# at the words only (token names such as URL or SPP_0 are uppercase themselves).
df_col_plain = df_col_base.str.replace(r'\[.*?\]', ' ', regex=True)

# Alternating letter case inside a word ("FeNcE", "fEnCe"), Latin or Cyrillic:
# at least two consecutive upper-lower (or lower-upper) pairs.
# Heuristic: names like "McDonald" also match.
fence_style_regex = (
    r'(?:[A-ZА-ЯЁ][a-zа-яё]){2,}'
    r'|(?:[a-zа-яё][A-ZА-ЯЁ]){2,}'
)

# 1. Count of separate punctuations (SPP)
df_features['count_spp'] = df_col_base.str.count(r'\[SPP_\d+\]')

# 2. Count of repeating punctuations (RPP)
df_features['count_rpp'] = df_col_base.str.count(r'\[RPP_\d+\]')

# 3. Is punctuation after spaces (check SPP after space)
df_features['punct_after_space'] = df_col_base.str.contains(r' \[SPP_\d+\]').astype(int)

# Emoji, emoticons
# 1. Is emoji included: 
df_features['has_emoji'] = df_col_base.str.contains(r'\[EMJ_\d+\]').astype(int)

# 2. Is emoticon included: 
df_features['has_emoticon'] = df_col_base.str.contains(r'\[EMT_\d+\]').astype(int)

# Tonalities in the text structure
# 1. Is capslock included. The words are taken from the token-free text: \w+ drops the
# brackets, so on the encoded text every token name would be counted as a caps word.
df_features['has_capslock'] = df_col_plain.apply(
    lambda x: has_caps(re.findall(r'\b\w+\b', x))
).astype(int)

# 2. Is all lowercase
df_features['is_all_lower'] = df_col_base.apply(is_all_lower)

# 3. Is punctuation used: 
df_features['has_punctuation_spp'] = df_col_base.str.contains(r'\[SPP_\d+\]').astype(int)
df_features['has_punctuation_rpp'] = df_col_base.str.contains(r'\[RPP_\d+\]').astype(int)

# 4. Is 'FeNcE IrOnIc StYlE' included. The asterisks in the feature description are
# markdown emphasis, not a part of the pattern, so alternating case is what is detected.
df_features['has_fence_ironic_style'] = df_col_plain.str.contains(fence_style_regex).astype(int)

# Tonalities in lexical content
# 1. Count of profanity words (PRF): 
df_features['count_profanity'] = df_col_base.str.count(r'\[PRF_\d+\]')

# 2. Are pronouns included (assuming pronouns are encoded as tokens, e.g. [PRON_n]).
# NOTE(review): no visible step produces [PRON_n] tokens yet (todo), so this column is all zeros.
df_features['has_pronouns'] = df_col_base.str.contains(r'\[PRON_\d+\]').astype(int)

# Is starting with cap.
# NOTE(review): '.', '!' and '?' are already encoded as tokens in this column, so
# starts_with_cap mostly sees the whole text as a single sentence - confirm this is intended.
df_features['starts_with_cap'] = df_col_base.apply(starts_with_cap)

# Neutral tokens checking: 
df_features['has_url'] = df_col_base.str.contains(r'\[URL\]').astype(int)
df_features['has_number'] = df_col_base.str.contains(r'\[NUM\]').astype(int)
df_features['has_mention'] = df_col_base.str.contains(r'\[MNT\]').astype(int)
df_features['has_hashtag'] = df_col_base.str.contains(r'\[HSG\]').astype(int)
df_features['has_email'] = df_col_base.str.contains(r'\[EML\]').astype(int)

df_features.head()

In [None]:
# Calculate the correlations of the composed features with the target: 
composed_features = [
    'count_spp',
    'count_rpp',
    'punct_after_space',
    'has_emoji',
    'has_emoticon',
    'has_capslock',
    'is_all_lower',
    'has_punctuation_spp',
    'has_punctuation_rpp',
    'has_fence_ironic_style',
    'count_profanity',
    'has_pronouns',
    'starts_with_cap',
    'has_url',
    'has_number',
    'has_mention',
    'has_hashtag',
    'has_email'
]

# Keep labelled rows only. NaN never compares equal to anything (itself included),
# so the former `!= np.nan` mask was True for every row and filtered nothing.
df_corr = df_features[df_features['is_toxic'].notna()]
correlations = df_corr[composed_features + ['is_toxic']].corr()['is_toxic'].drop('is_toxic')

# Plot the features ordered by their correlation: 
correlations_sorted = correlations.sort_values(ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=correlations_sorted.values, y=correlations_sorted.index, palette="viridis")
plt.title('Correlation of composed features with is_toxic')
plt.xlabel('Correlation with is_toxic')
plt.ylabel('Features')
plt.xlim(-0.3, 0.3)  # bars beyond +-0.3 are clipped by this range
plt.tight_layout()
plt.show()

* `count_profanity`, `starts_with_cap`, `is_all_lower`, `has_punctuation_spp`, `has_mention`, `has_number` features have quite high correlations with `is_toxic`

### Types-of-feature correlations
Also, separate tables are created to measure the following types of correlations: 
1. [`raw_text_id` - `Types_of_emoji_used`].csv vs `target`
2. [`raw_text_id` - `Types_of_emoticon_used`].csv vs `target`
3. [`raw_text_id` - `Types_of_repeating_punctuation_used`].csv vs `target`
4. [`raw_text_id` - `Types_of_separate_punctuation_used`].csv vs `target`

If some strong correlation will be found for the specific types of the emojis/punct etc.,  
we can use it as the additional text features during the inference.

In [None]:
import pandas as pd
import json

# Mapping files written in part 1 (original substring -> token): 
json_files = {
    'profanities': 'encoding_profanities.json',
    'rep_punct': 'encoding_rep_punct.json',
    'sep_punct': 'encoding_sep_punct.json',
    'emoji': 'encoding_emoji.json',
    'emoticon': 'encoding_emoticon.json'
}

# Read dictionaries: 
encodings = {}
for key, file in json_files.items():
    with open(file, 'r', encoding='utf-8') as f:
        encodings[key] = json.load(f)

# Texts to count in; NaN (empty text after the CSV round trip) counts as an empty string
encoded_texts = df_features['text_encoded_profanity'].fillna('')

# Compose one table per token category: raw_text_id + one count column per value.
df_category = {}
for category, token_dict in encodings.items():
    # The dict keys are the ORIGINAL substrings (emoji, punctuation, lemma) and the
    # values are the tokens that replaced them. The encoded text contains the tokens,
    # so the token is what has to be counted; counting the key gave zeros for emojis
    # and the brackets/underscores of every token for punctuation.
    # The columns are still named after the original substring.
    counts = {
        raw_value: encoded_texts.str.count(re.escape(token))
        for raw_value, token in token_dict.items()
    }

    # Build the frame in one go instead of inserting the columns one by one
    df_temp = pd.DataFrame({'raw_text_id': df_features['raw_text_id'], **counts})
    df_category[category] = df_temp
    df_temp.to_csv(f'df_{category}.csv', index=False)


In [None]:
def compute_top_correlations(features_csv, original_csv, top_n=10):
    """
    Calculate the correlations between is_toxic and the additional created features.

    Args:
        features_csv: CSV with a raw_text_id column and one numeric column per feature.
        original_csv: CSV that has the raw_text_id and is_toxic columns.
        top_n: Number of features to return.

    Returns:
        A Series (feature name -> correlation with is_toxic) with the top_n
        correlations by absolute value, strongest first. Rows without a label
        are ignored.
    """

    df_features_new = pd.read_csv(features_csv)

    # Only the two columns needed for the merge are loaded: the original dump
    # also holds every text column, which is useless here and costs memory
    df_labels = pd.read_csv(original_csv, usecols=['raw_text_id', 'is_toxic'])

    # merge on raw_text_id and keep the labelled rows: 
    df_merged = df_features_new.merge(df_labels, on='raw_text_id', how='left')
    df_merged = df_merged[df_merged['is_toxic'].notna()]

    # calculate correlations with is_toxic
    token_cols = df_features_new.columns.drop('raw_text_id')
    correlations = df_merged[token_cols].corrwith(df_merged['is_toxic'])

    # get top-N correlations by absolute value, keeping their sign
    ranking = correlations.abs().sort_values(ascending=False).index

    return correlations.reindex(ranking).head(top_n)


In [None]:
import gc
gc.collect()

In [None]:
import gc

# Print the strongest per-type correlations for every category table written above.
# NOTE(review): df_profanities.csv is written by the table-building cell as well,
# but 'profanities' is not in this list - confirm whether it is skipped on purpose.
for category in ['emoticon', 'emoji', 'rep_punct', 'sep_punct']:
        
    top_corr = compute_top_correlations(f'df_{category}.csv', 'dump_features_0.csv', top_n=10)
    print(f"\nTop correlations for {category}:")
    print(top_corr)

    # Collect garbage after every category; the frames loaded inside the function are no longer referenced
    gc.collect()

## 3. Conclusions

There are a few features that could be added (or considered for adding) to the inputs of classical ML models   
(they can be concatenated with TF-IDF numerical features, for example): 
* `count_profanity`, `starts_with_cap`, `is_all_lower`, `has_punctuation_spp`, `has_mention`, `has_number` 
* the rest of `composed_features` are additional and can be evaluated experimentally   
<!-- * `[URL]`, `[NUM]`, `[MNT]`, `[HSG]`,`[EML]`, substrings can be deleted from text -->

The negative conclusions are: 
* There are not enough emojis in the dataset to calculate the correlations reliably (or they are not frequent/informative enough)
* The pairwise correlations for the individual types `emoji_n`/`punctuation_n`/`profanity_n` are lower than those of their aggregated counterparts (`has_emoji`, `count_profanity`, `count_spp` etc.), so there is no need to include them in inference