In [None]:
# Install the third-party packages this notebook imports further down.
! pip install html2text
! pip install transformers==4.3.3
! pip install -U imbalanced-learn
! pip install wandb
# Fetch NVIDIA apex and install it from the cloned sources with the
# --cpp_ext / --cuda_ext build options.
! git clone https://github.com/NVIDIA/apex
# NOTE(review): each `!` line is run as its own shell command, so this `cd`
# presumably does not carry over to the next line; the install below does not
# rely on it because it uses an absolute path.
! cd apex
# NOTE(review): this path is under /kaggle/working, while the data cells read
# from /content/drive — confirm which environment the notebook targets.
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /kaggle/working/apex/
! pip install simpletransformers


In [None]:
import torch
# Choose the torch device for this session: CUDA when torch reports it as
# available, otherwise the CPU. The choice is printed either way.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
import os
# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): presumably meant to make CUDA errors easier to trace; an earlier
# cell has already queried CUDA, so confirm the setting still takes effect here.
os.environ.update({'CUDA_LAUNCH_BLOCKING': "1"})

In [None]:
# Mount Google Drive at /content/drive; the input CSVs and the output files
# used later in this notebook are addressed under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
import pandas as pd 
from html2text import html2text
import re
import warnings
import itertools
import torch
from tqdm.notebook import tqdm
import chardet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score, classification_report
from spacy.tokenizer import Tokenizer
from spacy.lang.tr import Turkish
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from simpletransformers.classification import ClassificationModel
import pickle
from sklearn.model_selection import train_test_split


In [None]:
# CSV parsing options shared by all four product exports.
encoding = "utf-8"
sep = ";"


# Directory on the mounted Drive that holds the product exports.
migros_path = "/content/drive/MyDrive/Data/A_Data/"

# Read products1Yeni.csv .. products4Yeni.csv with identical options instead of
# repeating the same read_csv call four times.
__frames = [
    pd.read_csv(fr"{migros_path}products{part}Yeni.csv",
                sep=sep, encoding=encoding)
    for part in range(1, 5)
]
# Keep the per-file names available for any cell that inspects a single export.
(migros_raw_product_1, migros_raw_product_2,
 migros_raw_product_3, migros_raw_product_4) = __frames

# ignore_index=True gives the combined frame unique row labels. Without it every
# file contributes its own 0..n-1 labels, and any label-based row operation
# (for example dropping rows by index label) would also hit unrelated rows from
# the other files that happen to share a label.
# random_state pins the shuffle so a fresh run produces the same row order.
df = shuffle(pd.concat(__frames, ignore_index=True), random_state=42)





In [None]:
# Download NLTK's stop-word corpus; the Turkish list below is read from it.
nltk.download('stopwords')
__WPT = nltk.WordPunctTokenizer()

# NLTK's Turkish stop words plus extra tokens (units, sizes, suffix fragments,
# packaging words) that remove_stopwords() should also discard.
__stop_words_list = nltk.corpus.stopwords.words('turkish')
__stop_words_list.extend([
    "g", "kg", "ml", "l", "cm",
    "x", "li", "lı", "ay", "lü", "lu",
    "ad", "adet", "gr", "boy", "orta", "küçük", "büyük",
    "paket", "adetx", "köy", "doğal",
])

# Matches '<', one or more characters other than '>', then '>'.
__TAG_RE = re.compile(r'<[^>]+>')


def html_totext(text):
    """Return the result of running ``text`` through ``html2text``."""
    converted = html2text(text)
    return converted


def remove_tags(text):
    """Replace every match of the module-level ``__TAG_RE`` in ``text`` with a space."""
    return re.sub(__TAG_RE, ' ', text)


def remove_undefine_words(text):
    """Remove leftover entity fragments, newlines and slashes from ``text``.

    The rules are plain substring replacements applied one after another in
    the order listed, so an earlier rule can change what a later rule sees.

    Args:
        text: the string to clean.

    Returns:
        The string after all replacements have been applied.
    """
    substitutions = (
        ('&;nbsp;', ' '),
        ('\n', ' '),
        ('8&;#39', "'"),
        ('&lt', ''),
        ('p&gt;', ''),
        ('strong&gt;', ' '),
        ('&;amp;', '&'),
        ('&;#39;', r'\ '),
        ('&;', ''),
        ('nbsp;', ' '),
        ('//', ' '),
        (' /', ' '),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text


def remove_html(text):
    """Strip HTML remnants from ``text`` and trim surrounding whitespace.

    Applies, in order: ``remove_undefine_words``, ``html_totext`` and
    ``remove_tags``, each working on the previous step's output.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    txt = remove_undefine_words(text)
    # Pass the already-cleaned string forward. Previously the raw ``text`` was
    # passed here, which silently threw away the result of the step above.
    txt = html_totext(txt)
    txt = remove_tags(txt)
    return txt.strip()


def lower_letters(txt):
    """Lower-case ``txt`` after first turning every "İ" into a plain "i"."""
    without_dotted_capital = "i".join(txt.split("İ"))
    return without_dotted_capital.lower()


def resub_comma(txt):
    """Return ``txt`` with every comma replaced by a single space."""
    pieces = txt.split(",")
    return " ".join(pieces)


def remove_stopwords(txt):
    """Drop stop words and single-character tokens from ``txt``.

    ``txt`` is split on whitespace; tokens found in the module-level
    ``__stop_words_list`` or shorter than two characters are discarded and
    the rest are re-joined with single spaces.
    """
    kept = []
    for token in txt.split():
        if len(token) <= 1 or token.strip() in __stop_words_list:
            continue
        kept.append(token)
    return ' '.join(kept)


def remove_punctuation(txt):
    """Replace each character that is neither a word character nor whitespace with a space."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub(' ', txt)


def remove_repeatedLetter(txt):
    """Collapse every run of identical consecutive characters in ``txt`` to one character."""
    collapsed = []
    for char in txt:
        # Keep a character only when it differs from the one kept just before it.
        if not collapsed or collapsed[-1] != char:
            collapsed.append(char)
    return ''.join(collapsed)
 

def remove_integer(txt):
    """Return ``txt`` without the characters for which ``str.isdigit`` is true."""
    return ''.join(itertools.filterfalse(str.isdigit, txt))


def just_one_word(txt):
    """Remove repeated words from ``txt``, keeping each word's first occurrence.

    ``txt`` is split on single spaces, so consecutive spaces yield empty
    "words"; those are de-duplicated like any other word.

    Args:
        txt: space-separated text.

    Returns:
        The words in first-seen order, joined with single spaces.
    """
    # dict.fromkeys preserves insertion order and de-duplicates in one linear
    # pass, replacing the quadratic "scan the result list for every word" loop.
    return ' '.join(dict.fromkeys(txt.split(' ')))


def clean_text(txt):
    """Run the full text-cleaning pipeline on ``txt`` and return the result.

    Each step receives the previous step's output. The order is: HTML to
    text, entity/slash cleanup, tag removal, lower-casing, comma removal,
    punctuation removal, digit removal, repeated-character collapsing,
    duplicate-word removal and finally stop-word removal.
    """
    pipeline = (
        html_totext,
        remove_undefine_words,
        remove_tags,
        lower_letters,
        resub_comma,
        remove_punctuation,
        remove_integer,
        remove_repeatedLetter,
        just_one_word,
        remove_stopwords,
    )
    for step in pipeline:
        txt = step(txt)
    return txt



In [None]:
# Keep only rows where both 'uMarka' and 'urunAdi' are present.
df = df.dropna(subset=['uMarka', 'urunAdi'])

In [None]:
# Discard the 'Unnamed: 8' column.
df = df.drop(columns=['Unnamed: 8'])

In [None]:
# Remove rows whose 'urunKategorileri' is one of the unwanted values below.
# NOTE(review): 'Oreal', 'Guy' and 'Shoulders' look like fragments of names
# rather than real categories — confirm against the data.
#
# A boolean mask is used instead of df.drop(<index labels>): row labels are not
# guaranteed to be unique once several files have been concatenated, and
# dropping by a duplicated label also removes unrelated rows that share it.
# .copy() makes the result an independent frame so later column assignments
# do not operate on a slice of the previous one.
df = df[~df["urunKategorileri"].isin(
    ['Oreal', 'Guy', 'Shoulders', 'Bugün Eklenenler']
)].copy()

In [None]:
# Normalise brand names with str.capitalize().
df = df.assign(uMarka=df["uMarka"].str.capitalize())

In [None]:
# Remove duplicated rows.
# NOTE(review): keep=False discards every copy of a duplicated row, including
# the first one — confirm that is intended rather than keeping one copy.
df = df.drop_duplicates(keep=False)

In [None]:
def get_last_category(df2):
    """Return the last non-missing value among positions 7..1 of a row.

    Positions are scanned from 7 down to 1 (position 0 is never looked at) and
    the first value whose string form is not "nan" is returned.

    Args:
        df2: one row, either a pandas Series (as supplied by
            ``DataFrame.apply(axis=1)``) or any sequence that supports
            integer indexing.

    Returns:
        The selected value as ``str``, or ``""`` when positions 7..1 are all
        missing.
    """
    # Access by position explicitly: for a Series with non-integer labels,
    # plain ``row[i]`` relies on a deprecated positional fallback. Objects
    # without ``.iloc`` (lists, tuples) are indexed directly.
    positional = getattr(df2, "iloc", df2)
    for i in range(7, 0, -1):
        value = positional[i]
        if str(value) != "nan":
            return str(value)
    return ""

In [None]:
# Derive one category value per row with get_last_category and store it as 'new_kat'.
df = df.assign(new_kat=df.apply(get_last_category, axis=1))

In [None]:
# Run every product name through the cleaning pipeline.
df["urunAdi"] = df["urunAdi"].apply(clean_text)
# Select the derived 'new_kat' column rather than the raw 'urunKategorileri':
# the following rename maps 'new_kat' to 'cat', and the rest of the notebook
# reads train_df.cat, which does not exist unless 'new_kat' is selected here.
train_df = df[["new_kat","urunAdi"]]

In [None]:
# Rename 'new_kat' -> 'cat' and 'urunAdi' -> 'text'.
train_df = train_df.rename(columns=dict(new_kat='cat', urunAdi='text'))

In [None]:
# Display the training frame as this cell's output.
train_df

In [None]:
# Count the distinct category values (cat_size is later passed to the
# classifier as its number of labels), then print the values and the count.
cat_size = len(train_df.cat.unique())
print(train_df.cat.unique())
print("Total categories",cat_size)


In [None]:
# Encode each category as an integer code and store it in a 'labels' column.
train_df = train_df.assign(labels=pd.factorize(train_df.cat)[0])

# Preview the first rows.
train_df.head()

In [None]:
# Split train_df into train and test parts with a fixed random_state.
# NOTE(review): test_size=0.0001 leaves almost every row in the training part —
# confirm that such a small hold-out set is intended.
train, test = train_test_split(train_df, test_size=0.0001, random_state=42)

In [None]:
# Show the (rows, columns) shape of both parts of the split.
train.shape, test.shape

In [None]:
# Build a simpletransformers classifier on 'bert-base-multilingual-uncased' with
# one label per category; the args dict sets 7 training epochs and allows both
# re-processing of the input data and overwriting of the output directory.
model = ClassificationModel(
    'bert',
    'bert-base-multilingual-uncased',
    num_labels=cat_size,
    args=dict(
        reprocess_input_data=True,
        overwrite_output_dir=True,
        num_train_epochs=7,
    ),
)

In [None]:
# Train the classifier on the training part of the split.
model.train_model(train)

In [None]:
# Load the products that should be classified.
test_data = pd.read_csv(r"/content/drive/MyDrive/testData.csv",encoding="utf-8")

# List of distinct category values, used to turn a predicted integer label back
# into a category name.
# NOTE(review): this assumes unique() yields categories in the same order that
# pd.factorize used when the 'labels' column was built — confirm.
_category = train_df.cat.unique().tolist()
def test_ml(txt):
    """Predict the category name for a single raw product name.

    The text is cleaned with ``clean_text``, passed to the module-level
    ``model`` as a one-element batch, and the predicted label is looked up in
    the module-level ``_category`` list.

    Args:
        txt: raw product name.

    Returns:
        The predicted category as ``str``.
    """
    cleaned = clean_text(txt)
    # predict() also returns the raw model outputs, which are not needed here.
    predictions, _raw_outputs = model.predict([cleaned])
    return str(_category[predictions[0]])
     
# Predict a category for every product name and store it in 'new_cat'.
test_data["new_cat"] = test_data["name"].apply(test_ml)
# Write the result to Drive. The folder is spelled "MyDrive", matching every
# other Drive path in this notebook; the previous "Mydrive" spelling names a
# different directory on a case-sensitive filesystem.
test_data.to_csv(r"/content/drive/MyDrive/sonuc.csv", index = False)

In [None]:
# Persist the trained model to Drive with pickle. The two statements that were
# fused on one line (a syntax error) are separated, and the `with` block closes
# the file even if pickling fails.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open(r"/content/drive/MyDrive/new_ML.dat", "wb") as handle:
    pickle.dump(model, handle)