<a href="https://colab.research.google.com/github/EleonoraBaim/NPS_Dialogue_system/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
# -*- coding: utf-8 -*-
import pickle
import re
from pathlib import Path
import pandas as pd
import pymorphy2
import json
from sklearn.model_selection import train_test_split
import numpy as np
import emoji
from skmultilearn.model_selection import IterativeStratification
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

import os

pd.set_option("display.max_colwidth", 100)

# Classes whose row count does not exceed this value are folded into the
# catch-all class 'nan -> nan -> nan' when the targets are built.
min_target_size = 100
# Project root. Can be overridden with the NPS_PROJECT_PATH environment
# variable so the notebook is not tied to one machine; the default is the
# original location.
project_path = os.environ.get("NPS_PROJECT_PATH") or "C:/Users/baimbetova_49768/PycharmProjects/nps/"
# Other cells build file names with plain string concatenation
# (project_path + "model/" + ...), so a trailing separator is required.
if not project_path.endswith(("/", "\\")):
    project_path += "/"
# Sub-folder of project_path that holds the Excel exports. The variable name
# keeps its original spelling because other cells reference it.
datatset_folder = 'dataset/'

# Functions

In [None]:
def iterative_train_test_split(df, text, label, train_size):
    """Split a multi-label frame into train/test parts with iterative stratification.

    Uses ``IterativeStratification`` (order=1, two folds) so that each label
    keeps a balanced representation in both parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column and the label columns.
    text : str
        Name of the text column.
    label : list-like
        Names of the label columns.
    train_size : float
        Fraction of samples requested for the training part; the remainder
        goes to the test part.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X parts are Series
        taken from ``df[text]`` and the y parts are DataFrames taken from the
        label columns. The original index labels of ``df`` are preserved.
    """
    stratifier = IterativeStratification(
        n_splits=2,
        order=1,
        sample_distribution_per_fold=[1.0 - train_size, train_size],
    )
    features = df[text]
    targets = df.loc[:, label]
    train_indices, test_indices = next(stratifier.split(features, targets))
    # The stratifier yields positional indices, so select with .iloc. Label
    # based lookup only happened to work when df had a fresh 0..n-1 index.
    X_train, y_train = features.iloc[train_indices], targets.iloc[train_indices]
    X_test, y_test = features.iloc[test_indices], targets.iloc[test_indices]
    return X_train, X_test, y_train, y_test


def normalization(text):
    """Normalize raw comments for TF-IDF vectorization.

    Each item is converted to ``str``, stripped, lowercased, passed through
    ``typos_corr``, reduced to Latin/Cyrillic letters and spaces, and then
    lemmatized word by word with the module-level ``morph`` analyzer (the
    first parse of every word is used).

    Parameters
    ----------
    text : iterable
        Raw comments; items are converted with ``str()``.

    Returns
    -------
    list of str
        One normalized string per input item. Every lemma is followed by a
        single space, so non-empty results end with a trailing space.
    """
    import tqdm

    normalized = []
    for raw in tqdm.tqdm(text):
        # typos_corr receives text that is already lowercased.
        cleaned = typos_corr(str(raw).strip().lower())
        # Keep letters and spaces only. "ё"/"Ё" lie outside the а-я / А-Я
        # ranges and were previously deleted from the middle of words.
        cleaned = re.sub(r"[^a-zA-Zа-яА-ЯёЁ ]+", r'', cleaned)
        lemmas = [morph.parse(word)[0].normal_form for word in cleaned.split()]
        # The trailing space after the last lemma is kept so the output format
        # stays the same as before.
        normalized.append(''.join(lemma + ' ' for lemma in lemmas))

    return normalized

# (pattern, replacement) pairs applied in order by typos_corr().
# Abbreviations are matched as whole tokens only, so e.g. "кр" inside
# "кредит" is left alone.
_TYPO_RULES = (
    (r"(?<!\w)(?:зп|кг)(?!\w)", "kaspigold"),
    # Kaspi RED
    (r"(?<!\w)(?:кр|каспи ред|kaspi red|kaspired|каспи рэд|каспи red)(?!\w)", "kaspired"),
    # Whitespace after the abbreviation is consumed and one space is put back,
    # so the expansion never fuses with the following word.
    (r"(?<!\w)хор\.обсл\.\s*", "хорошее обслуживание "),
    (r"(?<!\w)тк(?!\w)", "торговыйкредит"),
    # Padded with spaces so the word does not fuse with its neighbours.
    (r"%", " проценты "),
    (r"(?<!\w)км(?!\w)", "каспимагазин"),
)


def typos_corr(data):
    """Expand known abbreviations and spelling variants to canonical tokens.

    Matching is case-insensitive. The previous implementation mixed upper and
    lower case literals with ``str.replace`` while its caller lowercases the
    text first, so the upper-case abbreviations were never replaced.

    Parameters
    ----------
    data : str
        Text of one comment.

    Returns
    -------
    str
        The text with every rule from ``_TYPO_RULES`` applied in order.
    """
    for pattern, replacement in _TYPO_RULES:
        data = re.sub(pattern, replacement, data, flags=re.IGNORECASE)
    return data

def tf_idf_transform(data, tf_idf_model):
    """Vectorize texts with a fitted TF-IDF model and return a labelled frame.

    Parameters
    ----------
    data : iterable of str
        Texts passed to ``tf_idf_model.transform``.
    tf_idf_model : object
        Fitted vectorizer exposing ``transform`` and either
        ``get_feature_names_out`` or the legacy ``get_feature_names``.

    Returns
    -------
    pandas.DataFrame
        Dense matrix with one column per feature, named ``'T <feature>'``.
        Missing values, if any, are replaced by the string ``'NaN'``.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both.
    if hasattr(tf_idf_model, "get_feature_names_out"):
        feature_names = tf_idf_model.get_feature_names_out()
    else:
        feature_names = tf_idf_model.get_feature_names()

    dense = tf_idf_model.transform(data).toarray()
    # Build the frame in one step; inserting columns one at a time fragments
    # the frame and is slow for large vocabularies.
    column_names = ['T ' + str(name) for name in feature_names]
    frame = pd.DataFrame(dense, columns=column_names)
    return frame.fillna('NaN')


# Shared pymorphy2 analyzer, created once at module level; normalization()
# uses it to look up the normal form of every word.
morph = pymorphy2.MorphAnalyzer()

# 1.0 Data importing

In [None]:
# Every Excel workbook (*.xls, *.xlsx, ...) below the dataset folder.
# Sorted so the row order of the combined frame does not depend on the
# order in which the file system lists the files.
paths = sorted(str(x) for x in Path(project_path + datatset_folder).glob("**/*.xls*"))
class_name = []

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; read all workbooks first and concatenate once.
frames = [pd.read_excel(file) for file in paths]
# pd.concat raises on an empty list, so fall back to an empty frame.
datatset = pd.concat(frames) if frames else pd.DataFrame()

print('Original dataset shape: ' + str(datatset.shape))

Original dataset shape: (230648, 16)


In [None]:
# Work on a copy; `datatset` keeps the frame exactly as it was loaded.
dataset = datatset.copy()

# 2.0 Data preparation

In [None]:
# Working with NA data
print(dataset.isnull().sum())
# Fill per column and assign the result back. Calling
# dataset[col].fillna(..., inplace=True) mutates a column selection, which
# pandas warns about and which no longer updates the frame under
# copy-on-write.
dataset = dataset.fillna({
    'Статус звонка': 0,
    'Оценка': 0,
    'Продукт NPS': 'NA',
    'Тональность': 'NA',
})
# Keep rows whose call status is 0 (or was missing) and whose score,
# NPS product and sentiment are all present.
keep_rows = (
    (dataset['Статус звонка'] == 0)
    & (dataset['Оценка'] != 0)
    & (dataset['Продукт NPS'] != 'NA')
    & (dataset['Тональность'] != 'NA')
)
dataset = dataset[keep_rows]
print('-----------------------------------')
print('Training dataset shape: ' + str(dataset.shape))

Training dataset shape: (184070, 16)


In [None]:
# Restrict the frame to the columns listed below.
needed_columns = [
    'Уникальный идентификатор',
    'Почему',
    'Продукт опроса',
    'Канал',
    'Оценка',
    'Тональность',
    'Продукт NPS',
    'Тематика',
    'Детализация',
]
dataset = dataset[needed_columns]

In [None]:
# Counting num of classes for every comment
id_column = 'Уникальный идентификатор'
# Name the index and the count column explicitly. pandas 2.0 changed the
# names produced by value_counts() (the counts are called 'count' and the
# index carries the column name), so the previous rename mapping renamed the
# identifier column itself and the merge below failed.
num_of_class = (
    dataset[id_column]
    .value_counts()
    .rename_axis(id_column)
    .reset_index(name='Количество классов')
)
dataset = dataset.merge(num_of_class, on=id_column, how='left')

# 3.0 Classification labels

In [None]:
# Defining classes, rare classes go to 'nan -> nan -> nan'
# drop=True keeps the cell re-runnable: without it every run inserts another
# copy of the old index as a column.
dataset = dataset.reset_index(drop=True)

# Class name = sentiment -> NPS product -> topic -> detail. Built with
# vectorized string concatenation instead of a Python loop over iterrows();
# missing parts become the literal string "nan", as str() did before.
label_parts = ['Тональность', 'Продукт NPS', 'Тематика', 'Детализация']
part_columns = [dataset[column].astype(str) for column in label_parts]
dataset["target_name"] = part_columns[0].str.cat(part_columns[1:], sep=" -> ")
class_name = dataset["target_name"].tolist()

# Number of rows per class; only classes above the threshold are kept.
target_stat = dataset.groupby(['target_name']).size()
target_stat = target_stat[target_stat > min_target_size]

print('\nClasses for learning (small classes will join to nan -> nan -> nan): ')
print(target_stat.sort_values(ascending = False))



Classes for learning (small classes will join to nan -> nan -> nan): 
target_name
Позитив -> Переводы -> Условия -> Удобно переводить деньги                                          11333
Позитив -> Отделения -> Сотрудник -> Хорошо обслужил                                                10122
Позитив -> Платежи на Kaspi.kz -> Услуги к оплате -> Удобно совершать оплату услуг                   4882
Позитив -> Отделения -> Сотрудник -> Быстро решил вопрос клиента                                     4798
Позитив -> Магазин на Kaspi.kz -> Получение -> Нравится доставка...                                  4641
Позитив -> Кредит Наличными -> Получение -> Нравится высокая вероятность одобрения кредита           4492
Позитив -> Kaspi Red -> Условия -> Удобно пользоваться клубной картой                                4364
Позитив -> Кредит Наличными -> Получение -> Нравится быстро и легко получать кредиты                 4323
Позитив -> Kaspi Gold -> Покупки -> Оплата картой через POS – термина

In [None]:
# Export the number of rows per class name to an Excel sheet.
class_sizes = dataset["target_name"].value_counts()
class_sizes.to_frame().to_excel('classes.xlsx')

In [None]:
# Class name -> consecutive integer id, and the reverse mapping.
categories = {name: idx for idx, name in enumerate(target_stat.index.unique())}
categories_exp = {idx: name for name, idx in categories.items()}

# The catch-all class takes the next free id. Using len() instead of the
# loop variable left over from enumerate() also works when no class passed
# the size threshold (the old code raised NameError in that case).
nan_col = len(categories)
categories['nan -> nan -> nan'] = nan_col
categories_exp[nan_col] = 'nan -> nan -> nan'

# Make sure the output folder exists before writing the id -> name mapping.
Path(project_path + "model/").mkdir(parents=True, exist_ok=True)
cat_str = json.dumps(categories_exp, ensure_ascii=False)
with open(project_path + "model/" + 'categories.json', 'w', encoding='utf-8') as cat_file:
    cat_file.write(cat_str)

print('Saved json file: ' + str(project_path + "model/" + 'categories.json'))

# Names that are not in `categories` (the rare classes) map to NaN and are
# assigned the catch-all id. When that happens the column is float-typed.
target = dataset["target_name"].map(categories).fillna(nan_col)
dataset['target'] = target


Saved json file: C:/Users/baimbetova_49768/PycharmProjects/nps/model/categories.json


# 4.0 Binarization of targets

In [None]:
# Only the comment text, the surveyed product and the class id are needed.
needed_mask = dataset.columns.isin(['Почему', 'Продукт опроса','target'])

needed_df = dataset.iloc[:,needed_mask]

# Delete duplicates: keep one copy of every (text, product, target) row.
# keep=False removed *all* copies of a repeated row, which silently discarded
# every comment that occurred more than once with the same class.
needed = needed_df.drop_duplicates(keep='first').reset_index()

# Grouping: collect all class ids that share the same text and product.
dataset_m = needed.groupby(['Почему', 'Продукт опроса'])['target'].agg(list)
dataset_m = dataset_m.reset_index()

# Creating the binary indicator matrix (one column per class id).
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
multi_target = mlb.fit_transform(dataset_m.target)

# Text and product columns followed by the indicator columns, named after
# the class ids in mlb.classes_.
dataset_multi = pd.concat(
    [
        dataset_m[['Почему', 'Продукт опроса']],
        pd.DataFrame(multi_target, columns=mlb.classes_, index=dataset_m.index),
    ],
    axis=1,
)

In [None]:
# From here on `dataset` is the multi-label frame: text, product and one
# indicator column per class id. The copy leaves `dataset_multi` unchanged.
dataset = dataset_multi.copy()
# Preview of the first rows.
dataset.head()

Unnamed: 0,Почему,Продукт опроса,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,...,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0
0,"(каз) 5, переплате меньше",АКБ,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"(каз) 5. Везде спасает: когда нужно вовремя берем КН и товарный, вчера ПДП сделал КН, но теперь...",9999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,(каз) 5. МП доступно и легко,СТО,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"(каз) Кг хорошая. Легко все, все можно платить с КГ.",ЗП,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"(каз) Ни в коем случае не порекомендую. На 4к сделала покупку, добавили 3750 тг плюсом. (бросил...",Kaspi Red,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# NOTE(review): no visible cell creates dataset['text'], so this cell raised
# KeyError on a fresh kernel. The cleaning below is reconstructed from the
# recorded output of this cell (lowercased comment with everything except
# letters and single spaces removed) - confirm against the lost cell.
if 'text' not in dataset.columns:
    dataset['text'] = (
        dataset['Почему']
        .astype(str)
        .str.lower()
        .str.replace(r"[^a-zа-яё ]+", "", regex=True)
        .str.split()
        .str.join(" ")
    )

# Model input = cleaned comment followed by the surveyed product. The product
# is cast to str so non-string values do not break the concatenation.
dataset['CONTEXT'] = dataset['text'] + " " + dataset['Продукт опроса'].astype(str)
dataset = dataset.drop(columns=['Почему', 'Продукт опроса', 'text'])
# Move CONTEXT to the front, keeping the indicator columns in their order.
dataset = dataset[['CONTEXT'] + [column for column in dataset.columns if column != 'CONTEXT']]
dataset

Unnamed: 0,CONTEXT,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,...,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0
0,каз переплате меньше АКБ,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,каз везде спасает когда нужно вовремя берем кн и товарный вчера пдп сделал кн но теперь хочу ещ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,каз мп доступно и легко СТО,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,каз кг хорошая легко все все можно платить с кг ЗП,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,каз ни в коем случае не порекомендую на к сделала покупку добавили тг плюсом бросила трубку Kas...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66698,я хотела уточнить всегда беру кредит когда надо досрочно закрываю я хочу что вы скидки сделали д...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
66699,я хочу кр отключить мне неудобно когда я хочу оплату совершить в мп картой кг выскакивает кр и и...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
66700,я часто пользуюсь и мне нравится всегда быстро Переводы на Kaspi.kz,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
66701,я че то больше просила но мало дали ну быстро как бы КН ИП,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 5.0 Data Splitting

In [None]:
# Down-sample the catch-all class: remove a random half (fixed seed) of the
# rows whose indicator in column `nan_col` equals 1.
catch_all_rows = dataset.loc[dataset[nan_col] == 1]
rows_to_drop = catch_all_rows.sample(frac=0.5, random_state = 15).index
dataset = dataset.drop(index = rows_to_drop)

# Name of the text column and the names of all label columns.
text = 'CONTEXT'
labels = dataset.columns[dataset.columns!='CONTEXT']

# Fresh 0..n-1 index after the rows were removed.
dataset = dataset.reset_index(drop = True)

In [None]:
# Stage 1: 70% of the rows go to train, the remaining 30% are held out.
X_train, X_, y_train, y_ = iterative_train_test_split(dataset, text, labels, train_size=0.7)

# Rebuild the held-out part as one frame with a fresh index.
X_, y_ = pd.DataFrame(X_), pd.DataFrame(y_)
dataset_ = X_.join(y_).reset_index(drop=True)

# Stage 2: the held-out part is halved into validation and test.
X_val, X_test, y_val, y_test = iterative_train_test_split(dataset_, text, labels, train_size=0.5)

# One frame per split: text column joined with its label columns.
train_df, val_df, test_df = (
    pd.DataFrame(features).join(pd.DataFrame(targets)).reset_index(drop=True)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
train_size = 0.7
val_size = 0.15
test_size = 0.15

# Occurrences of every single label (order=1 combinations) in each split.
counts = {}
for split_key, split_targets in (
    ("train_counts", y_train),
    ("val_counts", y_val),
    ("test_counts", y_test),
):
    combination_rows = get_combination_wise_output_matrix(np.array(split_targets), order=1)
    counts[split_key] = Counter(
        str(combination) for row in combination_rows for combination in row
    )

# Scale the validation and test counts up to the size of the train split so
# the three rows of the table below are directly comparable.
for split_key, split_fraction in (("val_counts", val_size), ("test_counts", test_size)):
    split_counter = counts[split_key]
    for k in split_counter.keys():
        split_counter[k] = int(split_counter[k] * (train_size/split_fraction))

# View distributions: one row per split, one column per label.
pd.DataFrame({
    "train": counts["train_counts"],
    "val": counts["val_counts"],
    "test": counts["test_counts"]
}).T.fillna(0)

Unnamed: 0,"(11,)","(14,)","(17,)","(38,)","(12,)","(2,)","(1,)","(35,)","(9,)","(19,)",...,"(13,)","(25,)","(18,)","(3,)","(30,)","(27,)","(22,)","(31,)","(34,)","(5,)"
train,572,2323,594,9081,1016,2246,1327,492,745,965,...,2341,787,638,617,1845,2017,865,519,375,475
val,569,2324,592,9081,1017,2249,1325,490,742,961,...,2342,784,634,620,1843,2020,863,522,373,457
test,574,2324,592,9081,1017,2244,1330,494,746,966,...,2342,788,639,616,1848,2016,868,518,378,494


# 6.0 Text preprocessing

In [None]:
# Normalize the client text of every split.
print('\nStart text normalization:')

# Name of the column that receives the normalized text.
norm_text = 'normalized'

normalized_columns = []
for banner, frame in (
    ('Train text ... ', train_df),
    ('Test text ... ', test_df),
    ('Val text ... ', val_df),
):
    print(banner)
    normalized_columns.append(normalization(frame[text]))

norm_train_text, norm_test_text, norm_val_text = normalized_columns

# Attach the results only after all three splits were processed.
train_df[norm_text] = norm_train_text
test_df[norm_text] = norm_test_text
val_df[norm_text] = norm_val_text

In [None]:
# Save the three splits as CSV files in the current working directory.
for split_frame, file_name in (
    (train_df, 'train_dataset.csv'),
    (val_df, 'val_dataset.csv'),
    (test_df, 'test_dataset.csv'),
):
    split_frame.to_csv(file_name)