In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from transformers import TFRobertaModel
from transformers import AutoTokenizer

import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report

In [4]:
filename = "/kaggle/input/uznews-prepared/uznews_preprocessed.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,text,clean_text,sentiment
0,O'zbekiston futbol assotsiatsiyasi intizom qo'...,ozbekiston futbol assotsiatsiyasi intizom qomi...,1
1,Rustam Yusupov Toshkent obodonlashtirish Bosh ...,rustam yusupov toshkent obodonlashtirish bosh ...,1
2,Xitoyning Yutong kompaniyasi yil oxirigacha Sa...,xitoyning yutong kompaniyasi yil oxirigacha sa...,1
3,Samarqand viloyatida 3 gektar o'rmon xo'jaligi...,samarqand viloyatida gektar ormon xojaligi yer...,1
4,Andijon viloyati Xonobodda bir kishi davlat or...,andijon viloyati xonobodda bir kishi davlat or...,0


## Class Balancing by RandomOverSampler

In [5]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['clean_text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1));
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['clean_text', 'sentiment']);

In [6]:
train_os['sentiment'].value_counts()

1    2565
0    2565
Name: sentiment, dtype: int64

## Train - Validation - Test split

In [7]:
X = train_os['clean_text'].values
y = train_os['sentiment'].values

In [8]:
seed=42

# split the original data into train and TEST sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

# split the remaining 90% of the data into train and VALIDATION sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=seed)

## One hot encoding

In [9]:
y_train_le = y_train.copy()
y_valid_le = y_valid.copy()
y_test_le = y_test.copy()

In [10]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = ohe.fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()
y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [11]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_valid.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 4155
VALIDATION DATA: 462
TESTING DATA: 513


# UzRoberta

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('rifkat/uztext-3Gb-BPE-Roberta')

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/964k [00:00<?, ?B/s]

In [13]:
MAX_LEN=512

In [14]:
# tokenize the training data
train_inputs = tokenizer(X_train.tolist(), max_length=MAX_LEN, padding='max_length', truncation=True)
train_input_ids = np.array(train_inputs['input_ids'])
train_attention_masks = np.array(train_inputs['attention_mask'])

# tokenize the validation data
val_inputs = tokenizer(X_valid.tolist(), max_length=MAX_LEN, padding='max_length', truncation=True)
val_input_ids = np.array(val_inputs['input_ids'])
val_attention_masks = np.array(val_inputs['attention_mask'])

# tokenize the test data
test_inputs = tokenizer(X_test.tolist(), max_length=MAX_LEN, padding='max_length', truncation=True)
test_input_ids = np.array(test_inputs['input_ids'])
test_attention_masks = np.array(test_inputs['attention_mask'])

# Modeling

In [15]:
def create_model(roberta_model, max_len=MAX_LEN):
    
    opt = tf.keras.optimizers.Adam(
        learning_rate=1e-5, 
        beta_1=0.9, 
        beta_2=0.999, 
        epsilon=1e-7
    )
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,),dtype='int32')
    output = roberta_model([input_ids,attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(2, activation=tf.nn.softmax)(output)
    model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(opt, loss=loss, metrics=accuracy)
    return model

In [16]:
roberta_model = TFRobertaModel.from_pretrained('rifkat/uztext-3Gb-BPE-Roberta', from_pt=True)

Downloading pytorch_model.bin:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream 

In [17]:
model = create_model(roberta_model, MAX_LEN)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 512)]        0           []                               
                                                                                                  
 tf_roberta_model (TFRobertaMod  TFBaseModelOutputWi  83449344   ['input_1[0][0]',                
 el)                            thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [18]:
model.fit([train_input_ids,train_attention_masks], y_train, validation_data=([val_input_ids,val_attention_masks], y_valid), epochs=4, batch_size=30)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# Results

In [23]:
result = model.predict([test_input_ids,test_attention_masks])



In [24]:
y_pred =  np.zeros_like(result)
y_pred[np.arange(len(y_pred)), result.argmax(1)] = 1

In [25]:
print('\tClassification Report:\n\n',classification_report(y_test,y_pred, target_names=['Negative', 'Positive']))

	Classification Report:

               precision    recall  f1-score   support

    Negative       0.80      0.91      0.85       261
    Positive       0.89      0.77      0.83       252

   micro avg       0.84      0.84      0.84       513
   macro avg       0.85      0.84      0.84       513
weighted avg       0.85      0.84      0.84       513
 samples avg       0.84      0.84      0.84       513



In [22]:
# # save the model
# model.save('news_sentiment_uzbek_model.h5')

## Test with user input