### Классификация фильмов по уровню владения английского языка

Класссификация фильмов по уровню подготовки нужна для мотивации учащиегося сохранить интерес к изучению языка. Фильм должен содержать слова по уровню ученика.

Целью проекта будет разработанная модель соотносящая фильмы к определенному уровню владения английского языка.

В ходе работы планируем выполнить следующие шаги:
 1. Загрузка и подготовка данных.
 2. Выбор и обучение моделей, итоговая оценка качества предсказания лучшей модели.
 3. Вывод.

In [1]:
%load_ext watermark
%watermark -a 'Yandex' -u -d -v -p numpy,pandas,matplotlib,sklearn,nltk

Author: Yandex

Last updated: 2023-07-01

Python implementation: CPython
Python version       : 3.11.4
IPython version      : 8.14.0

numpy     : 1.25.0
pandas    : 2.0.2
matplotlib: 3.7.1
sklearn   : 0.0.post5
nltk      : 3.8.1



In [2]:
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

## 1. Загрузка и подготовка данных

In [3]:
# выведем необходимые библиотеки для работы

import time
import warnings
from datetime import datetime

import pandas as pd
import numpy as np

import pyprind
import pickle
import chardet
import string
import re, sys
import os, glob

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report

RANDOM_STATE = 12345

from tabula import read_pdf
from PyPDF2 import PdfReader
import pysrt


In [4]:
# дополнительные артикли, предлоги и местоимения для фильтра субтитров
stop_additional = ['I', 'm', 'me', 'you', 'he', 'him', 'she', 'her', 'we', 'us', 'they', 'them', 'thus', 
        'your', 'our', 'its', 's', 'their', 'mine', 'ours', 'theirs', 'yours', 'hers', 'yes', 'his',
        'this', 'that', 'these', 'those', 'such', 'much', 'many', 'to', 'be', 'myself', 'yourself',
        'ourselves', 'yourselves', 'himself', 'herself', 'itself', 'themselves', 'for', 'mmm', 'hey',
        'who', 'what', 'where', 'when', 'why', 'which', 'whose', 'no', 'nothing', 'nobody', 'no',
        'none', 'neither', 'some', 'any', 'a', 'an', 'the', 'is', 'are', 'on', 'in', 'at', 'under',
        'above', 'near', 'there', 'about', 'against', 'of', 'by', 'from', 'since', 'till', 'untill', 
        'within', 'between', 'below', 'behind', 'outside', 'into', 'up', 'through', 'down', 'along', 
        'across', 'out', 'hi', 'oh', 'yeah', 'ah', 'this', 'it', 'do', 'll', 'not', 'shh', 'am', 'so',
        're', 'don', 'my', 'all', 'off', 've', 'with', 'mm', 'hmm', 'did', 'do', 'didn', 'will', 'and'
        'still', 'if', 'em', 'then', 'adw', 'and', 'still', 'or', 'bit', 'over', 'as', 'uh', 'huh',
        'okay', 'can', 'um', 'lot', 'alb', 'but', 'here', 'cb', 'couldn', 't', 'whoa', 'hyah', 'hijo',
        'eww', 'ain', 'ed', 'than', 'sir', 'thank', 'elder_man', 'mike', 'have', 'just', 'know', 'ross',
        'n', 'ni','louis', 'was', 'one', 'now', 'well', 'tell', 'want', 'get', 'like', 'gonna', 'right',
        'âª âª', 'i', '‘em', 'x', 'id', 'im', 'ive', '♪', 'carl', 'aint', 'ï»¿', 'ii', 'lynx',
        'itll', 'iim', 'iin', 'ithis', 'ii', 'â', 'ª', 'colorffffeldermanfont', 'â ª', 'colorp®fontb',
        'durã¡n', 'colorffkrishnatejafontb', 'ÿþ', 'e020', 'ffff00', 'mr']



In [5]:
# откроем файл с названием фильмов и уровнем английского
df = pd.read_excel('/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/movies_labels.xlsx', index_col='id')
df.head(3)

Unnamed: 0_level_0,Movie,Level
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10_Cloverfield_lane(2016),B1
1,10_things_I_hate_about_you(1999),B1
2,A_knights_tale(2001),B2


In [6]:
# функция для фильтра стоп-словами
def clean_subs(subs):
    subs = [a for a in subs if a not in stop_additional]  
    return subs

In [7]:
# загрузим в список имена файлов из папки с субтитрами A2
films_name = os.listdir(path='/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/A2')
films_filtr = set(films_name) & set(df['Movie'] + '.srt')


In [8]:
a_2 = []

# загружаем и отфильтровываем субтитры 
for film in films_filtr:
    try: 
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/A2/{film},')
    except:
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/A2/{film}', encoding='latin-1')
    # вызов функии для очистки текста
    subs = re.findall('[a-zA-Z]+\w+', subs.text)
    for i in range(len(subs)):
        subs[i] = subs[i].lower()
    cln_subs = clean_subs(subs)
    a_2.append(cln_subs)
    

    

In [9]:
print(a_2)

[['thunder', 'rumbling', 'merle', 'heard', 'bitch', 'got', 'problem', 'bring', 'man', 'enough', 'take', 'chain', 'pussy', 'heard', 'pussy', 'ass', 'noncom', 'bitch', 'deaf', 'take', 'damn', 'chain', 'command', 'kiss', 'lily', 'white', 'ass', 'laughs', 'said', 'heard', 'idiot', 'takes', 'swing', 'laughing', 'hysterically', 'should', 'seen', 'look', 'face', 'punched', 'front', 'teeth', 'five', 'pow', 'pow', 'god', 'months', 'stockade', 'teeth', 'cost', 'hard', 'time', 'god', 'worth', 'every', 'minute', 'see', 'prick', 'spit', 'teeth', 'ground', 'worth', 'every', 'minute', 'wind', 'blowing', 'god', 'god', 'grunts', 'god', 'jesus', 'merciful', 'christ', 'god', 'help', 'god', 'god', 'jesus', 'please', 'jesus', 'please', 'help', 'come', 'help', 'zombies', 'growling', 'banging', 'softly', 'god', 'crying', 'jesus', 'jesus', 'please', 'behave', 'being', 'punished', 'deserve', 'deserve', 'been', 'bad', 'help', 'show', 'way', 'go', 'god', 'grunting', 'never', 'mind', 'silly', 'christ', 'boy', 'be

In [10]:
# создаем датафрейм с субтитрами уровня А2
data_a_2 = {'subs': a_2}
df_a_2 = pd.DataFrame(data_a_2)
df_a_2['level'] = 'A2'
print(df_a_2)

                                                subs level
0  [thunder, rumbling, merle, heard, bitch, got, ...    A2
1  [bugs, chittering, brakes, squeak, engine, sto...    A2
2  [walkie, talkie, squawks, rick, morgan, hear, ...    A2
3  [birds, chirping, bugs, chittering, boy, mom, ...    A2
4  [people, yelling, radio, chatter, ma, ma, plea...    A2
5  [birds, chirping, always, something, dad, teac...    A2


In [11]:
# загрузим в список имена файлов из папки с субтитрами B1
films_name = os.listdir(path='/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B1')
films_filtr = set(films_name) & set(df['Movie'] + '.srt')



In [12]:
b_1 = []

#загружаем и отфильтровываем субтитры 
for film in films_filtr:
    try: 
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B1/{film},')
    except:
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B1/{film}', encoding='latin-1')
    # вызов функии для очистки текста
    subs = re.findall('[a-zA-Z]+\w+', subs.text)
    for i in range(len(subs)):
        subs[i] = subs[i].lower()
    cln_subs = clean_subs(subs)
    b_1.append(cln_subs)
    

In [13]:
# создаем датафрейм с субтитрами уровня B1
data_b_1 = {'subs': b_1}
df_b_1 = pd.DataFrame(data_b_1)
df_b_1['level'] = 'B1'
print(df_b_1)

                                                 subs level
0   [dismount, herman, horse, sick, chaps, wander,...    B1
1   [spirit, stallion, cimarron, story, found, boo...    B1
2   [asia, largest, earth, continents, stretches, ...    B1
3   [africa, continent, earth, today, has, spectac...    B1
4   [million, years, ago, planet, looked, very, di...    B1
5   [going, come, angela, mind, sheep, dada, hear,...    B1
6   [wake, mornin, each, every, day, sit, table, h...    B1
7   [need, father, role, model, horny, geek, boy, ...    B1
8   [southern, tip, south, america, andes, mountai...    B1
9   [australia, island, continent, cast, adrift, d...    B1
10  [billion, human, lives, ended, august, th, sur...    B1
11  [santos, good, see, saved, usual, table, santo...    B1
12  [europe, home, million, people, continent, has...    B1
13  [sleepy, angelenos, good, good, morning, buddy...    B1
14  [goddamn, bugs, shit, crap, nick, dick, surpri...    B1
15  [continent, planet, changes, more, d

In [14]:
# загрузим в список имена файлов из папки с субтитрами B2
films_name = os.listdir(path='/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B2')
films_filtr = set(films_name) 



In [15]:
b_2 = []

#загружаем и отфильтровываем субтитры 
for film in films_filtr:
    try: 
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B2/{film},')
    except:
        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/B2/{film}', encoding='latin-1')
    # вызов функии для очистки текста
    subs = re.findall('[a-zA-Z]+\w+', subs.text)
    for i in range(len(subs)):
        subs[i] = subs[i].lower()
    cln_subs = clean_subs(subs)
    b_2.append(cln_subs)
    

In [16]:
# создаем датафрейм с субтитрами уровня B2
data_b_2 = {'subs': b_2}
df_b_2 = pd.DataFrame(data_b_2)
df_b_2['level'] = 'B2'
print(df_b_2)


                                                  subs level
0    [previously, suits, suing, entire, firm, fraud...    B2
1    [god, sorry, easier, ways, kill, worried, rat,...    B2
2    [previously, suits, less, every, people, appli...    B2
3    [previously, suits, harvey, specter, best, clo...    B2
4    [come, put, table, go, take, wrap, hello, unbu...    B2
..                                                 ...   ...
102  [ask, daughters, come, see, ma, girls, too, so...    B2
103  [husband, deeply, touched, welcome, ma, how, l...    B2
104  [coughing, coughing, continues, spits, exhales...    B2
105  [majesty, always, sovereign, begins, meal, abs...    B2
106  [woke, wished, dead, aching, head, lay, motion...    B2

[107 rows x 2 columns]


In [17]:
# загрузим в список имена файлов из папки с субтитрами C1
#films_name = os.listdir(path='/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1')
#films_filtr = set(films_name) 
#c_1 = []


In [18]:
#for film in films_filtr:
#    try: 
#        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/{film},')
#    except:
#        subs = pysrt.open(f'/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/{film}', encoding='latin-1')
#    # вызов функии для очистки текста
#    subs = re.findall('[a-zA-Z]+\w+', subs.text)
#    for i in range(len(subs)):
#        subs[i] = subs[i].lower()
#    cln_subs = clean_subs(subs)
#    c_1.append(cln_subs)
    

Предыдущий метод при последовательной обработки данных уровня C1 ломает данные, будем открывать файлы вручную.

In [19]:
path1= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E01 - Episode 1.eng.SDH.srt'
path2= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E02 - Episode 2.eng.SDH.srt'
path3= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E03 - Episode 3.eng.SDH.srt'
path4= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E04 - Episode 4.eng.SDH.srt'
path5= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E05 - Episode 5.eng.SDH.srt'
path6= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E06 - Episode 6.eng.SDH.srt'
path7= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Downton Abbey - S01E07 - Episode 7.eng.SDH.srt'

path01= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E01 EngSub.srt'    
path02= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E02 EngSub.srt'
path03= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E03 EngSub.srt'    
path04= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E04 EngSub.srt'    
path05= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E05 EngSub.srt'    
path06= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E06 EngSub.srt'    
path07= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E07 EngSub.srt'   
path08= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E08 EngSub.srt'    
path09= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E09 EngSub.srt'    
path10= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E10 EngSub.srt'    
path11= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E11 EngSub.srt'    
path12= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E12 EngSub.srt'   
path13= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E13 EngSub.srt'
path14= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E14 EngSub.srt'
path15= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E15 EngSub.srt'
path16= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits S04E16 EngSub.srt'

path21= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E01.480pHDTV.x264-mSD.srt'
path22= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E02.720pHDTV.x264-mSD.srt'
path23= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E03.480pHDTV.x264-mSD.srt'
path24= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E04.480pHDTV.x264-mSD.srt'
path25= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E05.480p.HDTV.x264-mSD.srt'
path26= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E06.720p.HDTV.x264-mSD.srt'
path27= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E07.HDTV.x264-mSD.srt'
path28= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E08.480p.HDTV.x264-mSD.srt'
path29= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E09.480p.HDTV.x264-mSD.srt'
path30= '/Users/urvanov_aleksandr/Documents/Yandex/17DS+/Мастерская/english_level/English_scores/Subtitles_all/C1/Suits.S03E10.HDTV.x264-mSD.srt'


In [20]:
# функция для открытия и обработки файлов
def c1(path):
    filename = pysrt.open(path)
    filename = re.findall('[a-zA-Z]+\w+', filename.text)
    for i in range(len(filename)):
        filename[i] = filename[i].lower()
    filename = [a for a in filename if a not in stop_additional]  
    return filename

In [21]:
# вносим документы субтитров C1 в переменные
path1 = c1(path1)
path2 = c1(path2)
path3 = c1(path3)
path4 = c1(path4)
path5 = c1(path5)
path6 = c1(path6)
path7 = c1(path7)

path01 = c1(path01)
path02 = c1(path02)
path03 = c1(path03)
path04 = c1(path04)
path05 = c1(path05)
path06 = c1(path06)
path07 = c1(path07)
path08 = c1(path08)
path09 = c1(path09)
path10 = c1(path10)
path11 = c1(path11)
path12 = c1(path12)
path13 = c1(path13)
path14 = c1(path14)
path15 = c1(path15)
path16 = c1(path16)

path21 = c1(path21)
path22 = c1(path22)
path23 = c1(path23)
path24 = c1(path24)
path25 = c1(path25)
path26 = c1(path26)
path27 = c1(path27)
path28 = c1(path28)
path29 = c1(path29)
path30 = c1(path30)

In [22]:
# создаем список списков
c_1 = path1, path2, path3, path4, path5, path6, path7, path01, path2, path03, path04, path05, path06, path07, path08, path09, path10, path11, path12, path13, path14, path15, path16, path21, path22, path23, path24, path25, path26, path27, path28, path29, path30


In [23]:
# создаем датафрейм с субтитрами уровня С1
data_c_1 = {'subs': c_1}
df_c_1 = pd.DataFrame(data_c_1)
df_c_1['level'] = 'C1'

In [24]:
print(df_c_1.head())

                                                subs level
0  [telegraph, machine, beeping, train, whistle, ...    C1
1  [ma, crawley, house, good, ill, sighs, see, re...    C1
2  [bates, came, morning, said, would, isn, quite...    C1
3  [hammering, does, open, tomorrow, afternoon, l...    C1
4  [gasps, made, jump, daisy, matter, thumbs, sor...    C1


In [25]:
# создадим датафрейм с субтитрами и уровнем английского 
frames = df_a_2, df_b_1, df_b_2, df_c_1

df = pd.concat(frames)

In [26]:
print(df) 

                                                 subs level
0   [thunder, rumbling, merle, heard, bitch, got, ...    A2
1   [bugs, chittering, brakes, squeak, engine, sto...    A2
2   [walkie, talkie, squawks, rick, morgan, hear, ...    A2
3   [birds, chirping, bugs, chittering, boy, mom, ...    A2
4   [people, yelling, radio, chatter, ma, ma, plea...    A2
..                                                ...   ...
28  [lost, ava, company, assume, deal, edward, dea...    C1
29  [previously, suits, going, wall, tomorrow, way...    C1
30  [ava, hessington, acquitted, darby, backs, man...    C1
31  [previously, suits, bonding, father, speaking,...    C1
32  [previously, suits, copy, letter, wrote, copy,...    C1

[162 rows x 2 columns]


In [27]:
# перемешаем данные
df = df.sample(frac=1).reset_index(drop=True)
print(df)

                                                  subs level
0    [previously, suits, woodall, coming, after, se...    C1
1    [dismount, herman, horse, sick, chaps, wander,...    B1
2    [chloe, mother, said, wasn, eating, thought, c...    B2
3    [hills, alive, sound, music, songs, sung, thou...    B2
4    [clothes, wear, people, make, clothes, impact,...    B2
..                                                 ...   ...
157  [people, vocalizing, rhythmically, more, voice...    B2
158  [previously, suits, suing, entire, firm, fraud...    B2
159  [ma, crawley, house, good, ill, sighs, see, re...    C1
160  [goddamn, bugs, shit, crap, nick, dick, surpri...    B1
161  [bugs, chittering, brakes, squeak, engine, sto...    A2

[162 rows x 2 columns]


In [28]:
df['subs'] = df['subs'].apply(' '.join)

In [29]:
print(df)

                                                  subs level
0    previously suits woodall coming after sean cah...    C1
1    dismount herman horse sick chaps wander passag...    B1
2    chloe mother said wasn eating thought could he...    B2
3    hills alive sound music songs sung thousand ye...    B2
4    clothes wear people make clothes impact having...    B2
..                                                 ...   ...
157  people vocalizing rhythmically more voices joi...    B2
158  previously suits suing entire firm fraud looki...    B2
159  ma crawley house good ill sighs see refuse mec...    C1
160  goddamn bugs shit crap nick dick surprise comi...    B1
161  bugs chittering brakes squeak engine stops tru...    A2

[162 rows x 2 columns]


## Вывод 

Данные обработаны, данных недостаточно для корректного прогноза, будем исходить из того что есть. При попытке использовать леммитизацию ломаеются данные. База NTLK выдает ошибку. Отфильтровал собственными стоп-словами. Готовы переходить к следующему этапу.

## 2. Выбор и обучение моделей, итоговая оценка качества предсказания лучшей модели.

In [30]:
# выделяем целевой признак и признаки:
target = df["level"]
features = df["subs"]

In [31]:
# выделяем 75% данных для обучающей выборки, остальные данные для тестовой
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size = 0.25, random_state = RANDOM_STATE)

In [32]:
# создаем pipiline  для преобразования документов в вектор признаков и оценке релевантности слов
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB(fit_prior=False))])


In [33]:
# обучаем модель многочисленнным классифкатором наивного Байеса
text_clf = text_clf.fit(X_train, y_train)


In [34]:
# выводим предсказания
predicted = text_clf.predict(X_test)

In [35]:
nb = np.mean(predicted == y_test)

In [36]:
print('Accuracy для модели MultinomialNB :', nb)

Accuracy для модели MultinomialNB : 0.8048780487804879


In [37]:
# подберем гиперпараметры для модели MultinomialNB с помощью GridSearchCV
grid_param = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

gs_clf = GridSearchCV(text_clf, grid_param, cv =2, n_jobs=-1)

In [38]:
# обучаем модель
gs_clf = gs_clf.fit(X_train, y_train)

In [39]:
# выводим лучшую модель
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [40]:
nb_gr = gs_clf.best_score_

In [41]:
print('Accuracy для модели MultinomialNB с подбором через GridSearchCV :', nb_gr)

Accuracy для модели MultinomialNB с подбором через GridSearchCV : 0.9087431693989071


----------

In [42]:
# попробуем использовать модель (SGD) машины опорных векторов 
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', 
                                                   alpha=1e-3, random_state=RANDOM_STATE))])
text_clf_svm.fit(X_train, y_train)


In [43]:
# выводим предсказания
predicted_svm = text_clf_svm.predict(X_test)

In [44]:
sgd = np.mean(predicted_svm == y_test)

In [45]:
print('Accuracy для модели SGDClassifier :', sgd)

Accuracy для модели SGDClassifier : 0.9512195121951219


In [46]:
# подберем гиперпараметры для модели SGDClassifier  с помощью GridSearchCV
grid_param = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
               'clf-svm__alpha': (1e-2, 1e-3)}
gs_clf_svm = GridSearchCV(text_clf_svm, grid_param, cv=2, n_jobs=-1)

In [47]:
# обучаем модель
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

In [48]:
# выводим лучшую модель
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [49]:
sgd_gr = gs_clf_svm.best_score_

In [50]:
print('Accuracy для модели SGDClassifier с подбором через GridSearchCV :', sgd_gr)
      

Accuracy для модели SGDClassifier с подбором через GridSearchCV : 0.8920765027322404


### Вывод

Мы увидели, что для нашего набора данных оба алгоритма были почти одинаково согласованы при оптимизации. 

## 3. Вывод

In [51]:
# сводная таблица по показателям MAPE, время обучения модели:
index = ['MultinomialNB',
         'MultinomialNB v GridSearchCV',
         'SGDClassifier',
         'SGDClassifier v GridSearchCV']


data = {'accuracy':[nb,
                    nb_gr,
                    sgd,
                    sgd_gr]}

data = pd.DataFrame(data=data, index=index)
data.sort_values(by = 'accuracy', ascending=False)

Unnamed: 0,accuracy
SGDClassifier,0.95122
MultinomialNB v GridSearchCV,0.908743
SGDClassifier v GridSearchCV,0.892077
MultinomialNB,0.804878


В ходе работы было выполнено:
1. Загружены данные и проведена предобработка.
2. Выполнено сравнение 2-х моделей MultinomialNB, SGDClassifier  с использованием различных наборов гиперпараметров.
3. Выбрана лучшая модель по результатам метрики accuracy.

Лучшей моделью можно считать SGDClassifier c accuracy = 0.951220. Для заказчика однозначно рекомендую использовать эту модель для определения уровня английского языка субтитрам. 