In [71]:
# %pip install pysrt
# %pip install catboost

In [72]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# import required module
import os
import pysrt
import re

import nltk
from nltk.corpus import stopwords 

nltk.download('stopwords')

from pymystem3 import Mystem
from sklearn.metrics import confusion_matrix

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lampq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
stop_words = set(stopwords.words('english'))
kaggle_df = pd.read_csv('database/kaggle/cefr_leveled_texts.csv')

In [74]:
kaggle_df.head()

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


In [75]:
movies = pd.read_excel('database/english_score/movies_labels.xlsx')

In [76]:
# assign directory
directory_A2 = 'database/english_score/Subtitles_all/A2'
directory_B1 = 'database/english_score/Subtitles_all/B1'
directory_B2 = 'database/english_score/Subtitles_all/B2'
directory_C1 = 'database/english_score/Subtitles_all/C1'
directory_test = 'database/english_score/Subtitles_all/Subtitles'

path_dict = {}
path_dict['A2'] = directory_A2
path_dict['B1'] = directory_B1
path_dict['B2'] = directory_B2
path_dict['C1'] = directory_C1
path_dict['test'] = directory_test

In [77]:
def get_text_from_file(file):
    text = []

    for sub in file:
        text.append(sub.text)
    
    return ''.join(text)

In [78]:
def get_text_from_folder(folder):
    folder_text = []
    for filename in os.scandir(folder):
        if filename.is_file():
            file_srt = pysrt.open(filename.path, encoding='iso-8859-1')
            folder_text.append(get_text_from_file(file_srt))

    return folder_text        

In [79]:
df_dict = {}
for folder in path_dict:
    print(folder)
    print(path_dict[folder])
    df_dict[folder] = get_text_from_folder(path_dict[folder])

A2
database/english_score/Subtitles_all/A2
B1
database/english_score/Subtitles_all/B1


B2
database/english_score/Subtitles_all/B2
C1
database/english_score/Subtitles_all/C1
test
database/english_score/Subtitles_all/Subtitles


In [80]:
df = pd.DataFrame(df_dict.items(), columns=['target', 'text'])

In [81]:
df.head()

Unnamed: 0,target,text
0,A2,[( bugs chittering )( brakes squeak )- ( engin...
1,B1,"[I need a father who's a role model,\nnot some..."
2,B2,"[[match snaps, fizzles][fire crackling][puffs]..."
3,C1,[[TELEGRAPH MACHINE BEEPING][TRAIN WHISTLE BLO...
4,test,"[<font color=""#ffff80""><b>Fixed & Synced by bo..."


In [82]:
df = df.explode('text')

In [83]:
movies.head()

Unnamed: 0,id,Movie,Level
0,0,10_Cloverfield_lane(2016),B1
1,1,10_things_I_hate_about_you(1999),B1
2,2,A_knights_tale(2001),B2
3,3,A_star_is_born(2018),B2
4,4,Aladdin(1992),A2/A2+


In [84]:
def clear_text(text):
    re_text = re.sub(r'[^a-zA-Z]', ' ', text)
    split_text = re_text.split()
    join_text = " ".join(split_text)
    return join_text.lower()

In [85]:
# def remove_stopwords(text):
#     words = text.split()
#     return ' '.join([word.lower() for word in words if not word.lower() in stop_words])

In [86]:
df["text"] = df["text"].apply(clear_text)

In [87]:
# df['text'] = df['text'].apply(remove_stopwords)

In [88]:
df['text']

0    bugs chittering brakes squeak engine stops tru...
0    birds chirping bugs chittering boy mom right h...
0    thunder rumbling merle that s right you heard ...
0    birds chirping what nothing it s not nothing i...
0    walkie talkie squawks rick morgan i don t know...
                           ...                        
4    i what am i doing i i with my life i i i m so ...
4    music no one s complained music there s the la...
4    i oh my god i i it s full on double rainbow al...
4    lucy i okay there are two things that i i i re...
4    fear treachery bloodlust thousands of years ag...
Name: text, Length: 278, dtype: object

In [89]:
target = ['A2', 'B1', 'B2', 'C1']
df_train = df.loc[df['target'].isin(target)]
df_test = df.loc[df['target'] == 'test']

In [90]:
df_train['target'].unique(), df_test['target'].unique()

(array(['A2', 'B1', 'B2', 'C1'], dtype=object), array(['test'], dtype=object))

In [91]:
X_subs = df_train['text']
y_subs = df_train['target']

In [92]:
movies['Level'].unique()

array(['B1', 'B2', 'A2/A2+', 'C1', 'B1, B2', 'A2/A2+, B1', 'A2'],
      dtype=object)

In [93]:
def normalize_target(x):    
    if x == 'A2/A2+': 
        return 'A2'
    if x == 'B1, B2': 
        return 'B2'
    if x == 'A2/A2+, B1': 
        return 'B1'
    else:
        return x

In [94]:
def change_target(x):
    if x == 'A1': 
        return 0
    if x == 'A2': 
        return 1
    if x == 'B1': 
        return 2
    if x == 'B2': 
        return 3
    if x == 'C1': 
        return 4
    if x == 'C2': 
        return 5

In [95]:
test_movies = []

for filename in os.scandir(directory_test):
    if filename.is_file():
        test_movies.append(os.path.basename(filename).split('.')[0].lower())

In [96]:
def df_lower(x):
    return x.lower()

In [97]:
movies['Movie'] = movies['Movie'].apply(df_lower)

In [98]:
test_targets = []

for test_movie in test_movies:
    level = movies[movies['Movie'] == test_movie]['Level'].values

    if len(level) > 0:
        test_targets.append(level[0])
    else:
        test_targets.append('')    

In [99]:
X_test = df_test['text'].to_frame()

In [100]:
df_test['target'] = test_targets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['target'] = test_targets


In [101]:
df_test = df_test[df_test['target'] != '']

In [102]:
X_test = df_test['text'].to_frame()
y_test = df_test['target']

In [103]:
df_test.head()

Unnamed: 0,target,text
4,B1,font color ffff b fixed synced by bozxphd enjo...
4,B1,hey i ll be right with you so cameron here you...
4,A2/A2+,i oh i come from a land from a faraway place i...
4,A2/A2+,captioning made possible by mgm home entertain...
4,A2/A2+,indistinct conversation all laughing mama tany...


In [104]:
len(X_test), len(y_test)

(106, 106)

In [105]:
y_test.unique()

array(['B1', 'A2/A2+', 'B2', 'C1', 'B1, B2', 'A2/A2+, B1'], dtype=object)

# ДАТАСЕТ KAGGLE

In [106]:
kaggle_df = pd.read_csv('database/kaggle/cefr_leveled_texts.csv')

In [107]:
kaggle_df.head()

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


In [108]:
kaggle_df['label'].value_counts()

A1    288
B2    286
A2    272
C1    241
B1    205
C2    202
Name: label, dtype: int64

In [109]:
X_kaggle= kaggle_df['text'].to_frame()
y_kaggle = kaggle_df['label']

In [110]:
X_kaggle['text'] = X_kaggle['text'].apply(clear_text)

In [111]:
X_subs = df_train['text']
y_subs = df_train['target']

In [112]:
X_kaggle.shape, X_subs.shape

((1494, 1), (163,))

In [113]:
X = pd.concat([X_kaggle, X_subs.to_frame()])
y = pd.concat([y_kaggle, y_subs]).apply(normalize_target)
y_nun = y.apply(change_target)

In [114]:
y.value_counts()

B2    393
A1    288
A2    278
C1    274
B1    222
C2    202
dtype: int64

In [115]:
X.shape, y.shape

((1657, 1), (1657,))

In [116]:
X_kaggle_train, X_kaggle_val, y_kaggle_train, y_kaggle_val =  train_test_split(X, y, test_size=0.25, random_state=1)

In [117]:
len(X_kaggle_train), len(X_kaggle_val), len(y_kaggle_train), len(y_kaggle_val)

(1242, 415, 1242, 415)

BERT

In [150]:
bert_prep_text = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoding = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4")

In [151]:
def get_text_embedding(text):
    prep_text = bert_prep_text(text)
    return bert_encoding(prep_text)

In [153]:
# text_train_embeddings = get_text_embedding(X_kaggle_train["text"])["sequence_output"][:,0,:].numpy()
# text_val_embeddings = get_text_embedding(X_kaggle_val["text"])["sequence_output"][:,0,:].numpy()
text_test_embeddings = get_text_embedding(X_test["text"])["sequence_output"][:,0,:].numpy()

ResourceExhaustedError: Exception encountered when calling layer "keras_layer_8" "                 f"(type KerasLayer).

Graph execution error:

OOM when allocating tensor with shape[1242,16,128,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node transformer/layer_0/self_attention/einsum/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_restored_function_body_155669]

Call arguments received by layer "keras_layer_8" "                 f"(type KerasLayer):
  • inputs={'input_word_ids': 'tf.Tensor(shape=(1242, 128), dtype=int32)', 'input_type_ids': 'tf.Tensor(shape=(1242, 128), dtype=int32)', 'input_mask': 'tf.Tensor(shape=(1242, 128), dtype=int32)'}
  • training=None

In [None]:
text_train_embeddings.shape, text_val_embeddings.shape, text_test_embeddings.shape

((1242, 256), (415, 256), (106, 256))

TFIDF

In [None]:
# stop_words = set(stopwords.words('english'))
# tf_idf = TfidfVectorizer(stop_words=list(stop_words))

# X_kaggle_tfidf_train = tf_idf.fit_transform(X_kaggle_train['text'])
# X_kaggle_tfidf_val = tf_idf.transform(X_kaggle_val['text'])
# X_tfidf_test = tf_idf.transform(X_test['text'])

In [None]:
# X_kaggle_tfidf_train.shape, X_kaggle_tfidf_val.shape

In [None]:
# model = CatBoostRegressor(
#                 random_seed=1,
#                 iterations=700,    
#                 task_type='GPU',
#                 data_partition='DocParallel', 
#                 od_type='Iter',
#                 # od_wait=20,
#                 # text_features=['text'],
#                 metric_period=500,
#                 learning_rate=0.2,
#                 rsm=1,
#                 loss_function="RMSE",
#                 eval_metric='RMSE'
# )

In [None]:
# model.fit(
#     X_kaggle_tfidf_train, y_kaggle_train,
#     eval_set=(X_kaggle_val, y_kaggle_val),
#     cat_features=None,
#     verbose=True,
#     # plot=True
# )

In [None]:
model_cls = CatBoostClassifier(
                random_seed=1,
                iterations=5000,    
                task_type='GPU',
                data_partition='DocParallel', 
                # od_type='Iter',
                # od_wait=20,
                # text_features=['text'],
                metric_period=500,
                learning_rate=0.001,
                rsm=1,
                loss_function="MultiClass",
                eval_metric='Accuracy',
)

In [None]:
model_cls.fit(
    text_train_embeddings, y_kaggle_train,
    eval_set=(text_val_embeddings, y_kaggle_val),
    verbose=True,
    # plot=True
)



0:	learn: 0.4983897	test: 0.3421687	best: 0.3421687 (0)	total: 26.8ms	remaining: 2m 13s
500:	learn: 0.6264090	test: 0.4337349	best: 0.4337349 (500)	total: 6.44s	remaining: 57.9s
1000:	learn: 0.6594203	test: 0.4385542	best: 0.4385542 (1000)	total: 12.7s	remaining: 50.8s
1500:	learn: 0.6876006	test: 0.4457831	best: 0.4457831 (1500)	total: 19s	remaining: 44.3s
2000:	learn: 0.7206119	test: 0.4530120	best: 0.4530120 (2000)	total: 25.2s	remaining: 37.8s
2500:	learn: 0.7552335	test: 0.4578313	best: 0.4578313 (2500)	total: 31.4s	remaining: 31.4s
3000:	learn: 0.7818035	test: 0.4626506	best: 0.4626506 (3000)	total: 37.8s	remaining: 25.2s
3500:	learn: 0.8140097	test: 0.4650602	best: 0.4650602 (3500)	total: 44s	remaining: 18.9s
4000:	learn: 0.8397746	test: 0.4698795	best: 0.4698795 (4000)	total: 50.3s	remaining: 12.6s
4500:	learn: 0.8639291	test: 0.4795181	best: 0.4795181 (4500)	total: 56.8s	remaining: 6.29s
4999:	learn: 0.8792271	test: 0.4843373	best: 0.4843373 (4999)	total: 1m 2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b9dc2a5b40>

# РЕЗУЛЬТАТЫ

для классификатора:

In [None]:
y_pred_cls = model_cls.predict(text_test_embeddings)
accuracy_score(y_test, y_pred_cls)

0.16037735849056603

In [None]:
ff

NameError: name 'ff' is not defined

In [None]:
y_pred = model.predict(X_tfidf_test)

CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.

In [None]:
def num_target_to_cat(x):    
    if x > 0 and x <= 0.5: 
        return 'A1'
    if x > 0.5 and x < 2: 
        return 'A2'
    if x >= 2 and x < 2.5: 
        return 'B1'
    if x >= 2.5 and x < 3.5: 
        return 'B2'
    if x >= 3.5 and x < 4.5: 
        return 'C1'
    else: 
        return 'C2'

In [None]:
y_pred = pd.Series(y_pred).apply(num_target_to_cat)
y_pred.value_counts()

B2    70
B1    26
A2     5
C1     5
dtype: int64

In [None]:
y_test = df_test['target'].apply(normalize_target)
y_test.value_counts()

B1    38
B2    37
A2    25
C1     6
Name: target, dtype: int64

In [None]:
accuracy_score(y_test, y_pred)   

0.4716981132075472