# Feedback Prize: EDA

## Requirements:

In [27]:
!pip install pandas
!pip install text-unidecode

Collecting text-unidecode
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.2 MB/s eta 0:00:011
[?25hInstalling collected packages: text-unidecode
Successfully installed text-unidecode-1.3


## Libraries

In [29]:
import codecs
import re
from collections import Counter
from typing import Dict, List, Tuple

import pandas as pd
import spacy
from text_unidecode import unidecode

## Configs

In [3]:
class CFG:
    """Notebook-wide settings: dataset file locations and the shared spaCy pipeline."""

    # Dataset files, all located under a single root directory.
    common_path = '/srv/dataset/datafbp'
    train_path = common_path + '/train.csv'
    test_path = common_path + '/test.csv'
    sub_path = common_path + '/sample_submission.csv'

    # spaCy pipeline, loaded once at class-definition time with the listed
    # components switched off.
    # NOTE(review): 'ru_core_news_sm' is a Russian pipeline, while the essays
    # displayed in this notebook are English -- confirm this is intended
    # (an English pipeline such as 'en_core_web_sm' may have been meant).
    tokenizer = spacy.load(
        'ru_core_news_sm',
        disable=['tok2vec', 'parser', 'attribute_ruler', 'ner'],
    )

## Load dataframe

In [36]:
# Read the train and test CSVs (in that order) from the paths configured on CFG.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (CFG.train_path, CFG.test_path)
)

In [5]:
# Preview the freshly loaded training frame; the rendered output elides the middle rows.
train_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


## Text preprocessing (Lite)

In [34]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler for encoding: substitute the UTF-8 bytes of the failing span.

    Args:
        error: The encode error raised by the codec; ``error.object`` is the
            string being encoded and ``error.start``/``error.end`` delimit the
            characters that could not be encoded.

    Returns:
        A ``(replacement, resume_position)`` pair: the failing characters
        encoded as UTF-8, and the index at which encoding should continue.
    """
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler for decoding: decode the failing span as cp1252 instead.

    Args:
        error: The decode error raised by the codec; ``error.object`` is the
            byte string being decoded and ``error.start``/``error.end`` delimit
            the bytes that could not be decoded.

    Returns:
        A ``(replacement, resume_position)`` pair: the failing bytes decoded as
        cp1252, and the index at which decoding should continue.
    """
    return error.object[error.start : error.end].decode("cp1252"), error.end


# The handlers are referenced by *name* through the ``errors=`` argument below,
# so they must be registered with the codec machinery first. Without this, the
# first text that actually triggers an encode/decode error fails with
# ``LookupError: unknown error handler name``. Re-running the cell simply
# re-registers the same names.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters.

    The text is round-tripped through several codecs to undo mixed
    UTF-8 / cp1252 mojibake, then transliterated with ``unidecode``.

    Args:
        text: The raw essay text.

    Returns:
        The repaired text after the codec round-trip and ``unidecode``.
    """
    text = (
        # Turn the string into bytes, keeping non-Latin-1 characters as escapes.
        text.encode("raw_unicode_escape")
        # Read those bytes as UTF-8; spans that are not valid UTF-8 are read as cp1252.
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        # Back to bytes as cp1252; characters outside cp1252 are written as UTF-8.
        .encode("cp1252", errors="replace_encoding_with_utf8")
        # Final read as UTF-8, again falling back to cp1252 for invalid spans.
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text


def preprocess(text: str) -> str:
    """Apply light clean-up to an essay.

    Inserts the missing space after a full stop that is directly followed by
    ``The``, ``I`` or ``For``, then collapses every run of spaces into one.

    Args:
        text: The essay text.

    Returns:
        The cleaned text.
    """
    # Put a space between the full stop and the word that follows it.
    for sentence_start in ('The', 'I', 'For'):
        text = text.replace('.' + sentence_start, '. ' + sentence_start)

    # Remove extra spaces.
    return re.sub(' +', ' ', text)

In [37]:
# Repair encodings first, then run the light clean-up, on the training essays.
train_df['full_text'] = (
    train_df['full_text']
    .apply(resolve_encodings_and_normalize)
    .apply(preprocess)
)

# Same two steps, in the same order, for the test essays.
test_df['full_text'] = (
    test_df['full_text']
    .apply(resolve_encodings_and_normalize)
    .apply(preprocess)
)

In [38]:
# Display the training frame again, now that `full_text` has been normalized and preprocessed.
train_df

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5
