#  Определение уровня английского в фильмах #


## Цель работы:

Разработать модель, которая будет предсказывать уровень английского в фильмах по субтитрам.

### Ход работы:

1. Создать датасет, из нескольких источников информации:
  - excel файл с разметкой фильмов,
  - файлы .srt с субтитрами,
  - pdf файлы со словарями слов, разбитых по уровням.
2. Провести EDA.
3. Построить модель, предсказывающую уровень фильма.
4. Создать приложение.

In [1]:
import pandas as pd
import lightgbm as lgb
import os
import pysrt
import re
import nltk
nltk.download('stopwords') 
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from PyPDF2 import PdfReader
from pickle import dump
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Света\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Света\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_excel('movies_labels.xlsx') # откроем разметку фильмов

In [3]:
df

Unnamed: 0,id,Movie,Level
0,0,10_Cloverfield_lane(2016),B1
1,1,10_things_I_hate_about_you(1999),B1
2,2,A_knights_tale(2001),B2
3,3,A_star_is_born(2018),B2
4,4,Aladdin(1992),A2/A2+
...,...,...,...
236,236,Matilda(2022),C1
237,237,Bullet train,B1
238,238,Thor love and thunder,B2
239,239,Lightyear,B2


Добавим к датафрейму столбец с ссылками указывающими путь к субтитрам, для этого:

In [4]:
sub_path_map = {} 

for i in os.walk('Subtitles_all'):   # с помощью библиотеки os получим список всех файлов, которые лежат в директории
    for j in i[-1]:                       # [-1] - означает последний элемент списка.
        sub_path_map[j] = f'{i[0]}/{j}'       # В данном случае j это название файла, а i[0] путь к файлу.
        
print(sub_path_map)        

{'The Walking Dead-S01E01-Days Gone Bye.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E01-Days Gone Bye.English.srt', 'The Walking Dead-S01E02-Guts.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E02-Guts.English.srt', 'The Walking Dead-S01E03-Tell It To The Frogs.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E03-Tell It To The Frogs.English.srt', 'The Walking Dead-S01E04-Vatos.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E04-Vatos.English.srt', 'The Walking Dead-S01E05-Wildfire.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E05-Wildfire.English.srt', 'The Walking Dead-S01E06-TS-19.English.srt': 'Subtitles_all\\A2/The Walking Dead-S01E06-TS-19.English.srt', 'AmericanBeauty1999.BRRip.srt': 'Subtitles_all\\B1/AmericanBeauty1999.BRRip.srt', "Angela's.Christmas.2018.WEBRip.Netflix.srt": "Subtitles_all\\B1/Angela's.Christmas.2018.WEBRip.Netflix.srt", 'Indiana Jones And The Last Crusade DVDRip Xvid -IZON-.srt': 'Subtitles_all\\B1/Indiana Jones And The Last Cr

In [5]:
def get_str_path(row):
    return sub_path_map.get(f'{row}.srt')

df['subs'] = df.Movie.apply(get_str_path)      # Добавляем наш путь к файлу в таблицу датафрейма

In [6]:
df

Unnamed: 0,id,Movie,Level,subs
0,0,10_Cloverfield_lane(2016),B1,Subtitles_all\Subtitles/10_Cloverfield_lane(20...
1,1,10_things_I_hate_about_you(1999),B1,Subtitles_all\Subtitles/10_things_I_hate_about...
2,2,A_knights_tale(2001),B2,Subtitles_all\Subtitles/A_knights_tale(2001).srt
3,3,A_star_is_born(2018),B2,Subtitles_all\Subtitles/A_star_is_born(2018).srt
4,4,Aladdin(1992),A2/A2+,Subtitles_all\Subtitles/Aladdin(1992).srt
...,...,...,...,...
236,236,Matilda(2022),C1,Subtitles_all\Subtitles/Matilda(2022).srt
237,237,Bullet train,B1,Subtitles_all\Subtitles/Bullet train.srt
238,238,Thor love and thunder,B2,Subtitles_all\Subtitles/Thor love and thunder.srt
239,239,Lightyear,B2,Subtitles_all\Subtitles/Lightyear.srt


Теперь работаем с этим столбцом.

In [7]:
paths = df['subs'].tolist() # переведем данные столбца с субтитрами в формат списка
paths

['Subtitles_all\\Subtitles/10_Cloverfield_lane(2016).srt',
 'Subtitles_all\\Subtitles/10_things_I_hate_about_you(1999).srt',
 'Subtitles_all\\Subtitles/A_knights_tale(2001).srt',
 'Subtitles_all\\Subtitles/A_star_is_born(2018).srt',
 'Subtitles_all\\Subtitles/Aladdin(1992).srt',
 'Subtitles_all\\Subtitles/All_dogs_go_to_heaven(1989).srt',
 'Subtitles_all\\Subtitles/An_American_tail(1986).srt',
 'Subtitles_all\\Subtitles/Babe(1995).srt',
 'Subtitles_all\\Subtitles/Back_to_the_future(1985).srt',
 'Subtitles_all\\Subtitles/Banking_On_Bitcoin(2016).srt',
 'Subtitles_all\\Subtitles/Batman_begins(2005).srt',
 'Subtitles_all\\Subtitles/Beauty_and_the_beast(2017).srt',
 'Subtitles_all\\Subtitles/Before_I_go_to_sleep(2014).srt',
 'Subtitles_all\\Subtitles/Before_sunrise(1995).srt',
 'Subtitles_all\\Subtitles/Before_sunset(2004).srt',
 'Subtitles_all\\Subtitles/Braveheart(1995).srt',
 'Subtitles_all\\Subtitles/Bridget_Jones_diary(2001).srt',
 'Subtitles_all\\Subtitles/Bridget_Joness_Baby.srt',
 

Подготовим функции преобразования файлов субтитров, а затем соберем их в одну общую функцию.

In [8]:
def join_strs(strs): # функция объединяющая строки субтитров в одну строку
    result = ''
    for s in strs:
         result += ' ' + s  
    return result[1:]

In [9]:
def prepare_text(text):                         # функция для очистки текста, использующая регулярные выражения
    del_n = re.compile('\n')                    # перенос каретки
    del_tags = re.compile('<[^>]*>')            # html-теги
    del_brackets = re.compile('\([^)]*\)')      # содержимое круглых скобок
    clean_text = re.compile('[^а-яa-z\s]')      # все небуквенные символы кроме пробелов
    del_spaces = re.compile('\s{2,}')
    text = del_n.sub(' ', str(text).lower())    # приводим к нижнему регистру 
    text = del_tags.sub('', text)               # очистка от тегов
    text = del_brackets.sub('', text)           # от содержимого круглых скобок
    res_text = clean_text.sub('', text)         # от всех небуквенных символов, кроме пробелов
    return del_spaces.sub(' ',res_text)

In [10]:
stop_words = stopwords.words('english')         # функция, удаляющая стоп-слова
def del_stopwords(text):
    clean_tokens = tuple(
        map( lambda x: x if x not in stop_words else '', word_tokenize(text) )
    )
    res_text = ' '.join(clean_tokens)
    return res_text

In [11]:
m = Mystem()                                 
def lemmatize(text):                           # функция лемматизации
    lemmatized_text = ''.join(m.lemmatize(text))
    return lemmatized_text.split('|')

In [12]:
def transformation(subs):               # функция преобразования текста субтитров
    
    subs = pysrt.open(subs)
    subs_films = []                     # создаем список из строк субтитров
    for x in range(len(subs)):
        subs_films.append(subs[x].text)
    
    sub_in_string = join_strs(subs_films) # соединяем их в одну строку 
    
    clean_sub_string = prepare_text(sub_in_string) # очистим строку субтиров функцией с регулярными выражениями
    
    clean_sub_string = del_stopwords(clean_sub_string) # удалим функцией стоп-слова
    
    clean_sub_string = lemmatize(clean_sub_string) # лемматизируем слова в строке субтитров
       
    return clean_sub_string[0]

Функция готова, теперь применим ее к каждому элементу списка с путями к субтитрам:

In [13]:
%%time
subs = []
for path in paths:
    subs.append(transformation(path))

CPU times: total: 34.1 s
Wall time: 6min 43s


In [14]:
df['subs'] = pd.Series(subs) # перепишем столбец subs

Готово, теперь у нас в столбце с субтитрами строки из слов, очищенные от лишних знаков, стоп-слов и приведенные к начальной форме.

In [15]:
df

Unnamed: 0,id,Movie,Level,subs
0,0,10_Cloverfield_lane(2016),B1,fixed synced bozxphd enjoy flick ben phone ...
1,1,10_things_I_hate_about_you(1999),B1,hey ill right cameron go nine schools y...
2,2,A_knights_tale(2001),B2,resync xenzainef retail help hes due list...
3,3,A_star_is_born(2018),B2,synced corrected mrcjnthn get black eyes o...
4,4,Aladdin(1992),A2/A2+,oh come land faraway place caravan came...
...,...,...,...,...
236,236,Matilda(2022),C1,chiming music playing mummy says im miracle ...
237,237,Bullet train,B1,boomslang stolen zoo last night extremely...
238,238,Thor love and thunder,B2,oh great mighty rapu pray water sustenan...
239,239,Lightyear,B2,buzz buzz lightyear mission log stardate senso...


In [16]:
for x in df['subs']:        # создадим из слов списки
    films_sub = x.split()

In [17]:
type(films_sub)

list

Теперь используем окфордские словари, для того, чтобы сгенерировать новые признаки. Для этого мы обединим все слова по категориям, удалим повторяющиеся, в пользу более сложного уровня и посчитаем доли слов каждой из групп, в каждом фильме. Откроем словари и создадим списки слов по категориям.

In [18]:
reader = PdfReader('American_Oxford_3000_by_CEFR_level.pdf')

text = []
for x in range(len(reader.pages)):
    page = reader.pages[x]    
    page_text = page.extract_text().split('\n')
    for y in page_text:
        text.append(y)

a1 = []
a2 = []
b1 = []
b2 = []
a1_trigger = 0
a2_trigger = 0
b1_trigger = 0
b2_trigger = 0

for y in range(len(text)):
    x = text[y] 
    if a1_trigger == 1:
        a1.append(x)
    if x == 'A1':
        a1_trigger = 1
        a2_trigger = 0
        b1_trigger = 0
        b2_trigger = 0
    if a2_trigger == 1:
        a2.append(x)
    if x == 'A2':
        a1_trigger = 0
        a2_trigger = 1
        b1_trigger = 0
        b2_trigger = 0
    if b1_trigger == 1:
        b1.append(x)
    if x == 'B1':
        a1_trigger = 0
        a2_trigger = 0
        b1_trigger = 1
        b2_trigger = 0
    if b2_trigger == 1:
        b2.append(x)
    if x == 'B2':
        a1_trigger = 0
        a2_trigger = 0
        b1_trigger = 0
        b2_trigger = 1

am3_words_a1 = []
am3_words_a2 = []
am3_words_b1 = []
am3_words_b2 = []
for x in a1:
    am3_words_a1.append(x.split()[0])
for x in a2:
    am3_words_a2.append(x.split()[0])
for x in b1:
    am3_words_b1.append(x.split()[0])
for x in b2:
    am3_words_b2.append(x.split()[0])

Поскольку распознавание текста в pdf документе прошло не совсем качественно, я просмотрела все слова и подчистила вручную.

In [19]:
am3_words_a1 = ['about', 'above', 'across', 'action', 'activity', 'actor', 'actress', 'add', 'address', 'adult', 'advice', 'afraid', 'after', 'afternoon', 'again', 'age', 'ago', 'agree', 'air', 'airport', 'all', 'also', 'always', 'amazing', 'and', 'angry', 'animal', 'another', 'answer', 'any', 'anyone', 'anything', 'apartment', 'apple', 'April', 'area', 'arm', 'around', 'arrive', 'art', 'article', 'artist', 'as', 'ask', 'at', 'August', 'aunt', 'away', 'awesome', 'baby', 'back', 'bad', 'bag', 'ball', 'banana', 'band', 'bank', 'bar', 'baseball', 'basketball', 'bath', 'bathroom', 'be', 'beach', 'beautiful', 'because', 'become', 'bed', 'bedroom', 'beer', 'before', 'begin', 'beginning', 'behind', 'believe', 'below', 'best', 'better', 'between', 'bicycle', 'big', 'bike', 'bill', 'bird', 'birthday', 'black', 'blog', 'blond', 'blue', 'boat', 'body', 'book', 'boot', 'bored', 'boring', 'born', 'both', 'bottle', 'box', 'boy', 'boyfriend', 'bread', 'break', 'breakfast', 'bring', 'brother', 'brown', 'build', 'building', 'bus', 'business', 'busy', 'but', 'butter', 'buy', 'by', 'bye', 'cafe', 'cake', 'call', 'camera', 'can', 'cannot', 'capital', 'car', 'card', 'career', 'carrot', 'carry', 'cat', 'CD', 'cent', 'center', 'chair', 'change', 'chart', 'cheap', 'check', 'cheese', 'chicken', 'child', 'chocolate', 'choose', 'city', 'class', 'classroom', 'clean', 'climb', 'clock', 'close', 'clothes', 'club', 'coat', 'coffee', 'cold', 'college', 'color', 'come', 'common', 'company', 'compare', 'complete', 'computer', 'concert', 'conversation', 'cook', 'cooking', 'cool', 'correct', 'cost', 'could', 'country', 'course', 'cousin', 'cow', 'cream', 'create', 'culture', 'cup', 'customer', 'cut', 'dad', 'dance', 'dancer', 'dancing', 'dangerous', 'dark', 'date', 'daughter', 'day', 'dear', 'December', 'decide', 'delicious', 'describe', 'description', 'design', 'desk', 'detail', 'dialogue', 'dictionary', 'die', 'diet', 'difference', 'different', 'difficult', 'dinner', 'dirty', 'discuss', 'dish', 'do', 'doctor', 'dog', 'dollar', 'door', 'down', 'downstairs', 'draw', 'dress', 'drink', 'drive', 'driver', 'during', 'DVD', 'each', 'ear', 'early', 'east', 'easy', 'eat', 'egg', 'eight', 'eighteen', 'eighty', 'elephant', 'eleven', 'else', 'email', 'end', 'enjoy', 'enough', 'euro', 'even', 'evening', 'event', 'ever', 'every', 'everybody', 'everyone', 'everything', 'exam', 'example', 'excited', 'exciting', 'exercise', 'expensive', 'explain', 'extra', 'eye', 'face', 'fact', 'fall', 'false', 'family', 'famous', 'fantastic', 'far', 'farm', 'farmer', 'fast', 'fat', 'father', 'favorite', 'February', 'feel', 'feeling', 'festival', 'few', 'fifteen', 'fifth', 'fifty', 'fill', 'final', 'find', 'fine', 'finish', 'fire', 'first', 'fish', 'five', 'flight', 'floor', 'flower', 'fly', 'follow', 'food', 'foot', 'football', 'for', 'forget', 'form', 'forty', 'four', 'fourteen', 'fourth', 'free', 'Friday', 'friend', 'friendly', 'from', 'front', 'fruit', 'full', 'fun', 'funny', 'future', 'game', 'garden', 'geography', 'get', 'girl', 'girlfriend', 'give', 'glass', 'go', 'good', 'goodbye', 'grandfather', 'grandmother', 'grandparent', 'gray', 'great', 'green', 'group', 'grow', 'guess', 'guitar', 'gym', 'hair', 'half', 'hand', 'happen', 'happy', 'hard', 'hat', 'hate', 'have', 'have', 'he', 'head', 'health', 'healthy', 'hear', 'hello', 'help', 'her', 'here', 'hey', 'hi', 'high', 'him', 'his', 'history', 'hobby', 'home', 'homework', 'hope', 'horse', 'hospital', 'hot', 'hotel', 'hour', 'house', 'how', 'however', 'hundred', 'hungry', 'husband', 'ice', 'ice', 'idea', 'if', 'imagine', 'important', 'improve', 'in', 'include', 'information', 'interest', 'interested', 'interesting', 'internet', 'interview', 'into', 'introduce', 'island', 'it', 'its', 'jacket', 'January', 'jeans', 'job', 'join', 'juice', 'July', 'June', 'just', 'keep', 'key', 'kind', 'kitchen', 'know', 'land', 'language', 'large', 'last', 'late', 'later', 'laugh', 'learn', 'leave', 'left', 'leg', 'lesson', 'let', 'letter', 'library', 'lie', 'life', 'light', 'like', 'like', 'line', 'lion', 'list', 'listen', 'little', 'live', 'local', 'long', 'look', 'lose', 'lot', 'love', 'lunch', 'machine', 'magazine', 'main', 'make', 'mall', 'man', 'many', 'map', 'March', 'market', 'married', 'match', 'May', 'maybe', 'me', 'meal', 'mean', 'meaning', 'meat', 'meet', 'meeting', 'member', 'menu', 'message', 'meter', 'midnight', 'mile', 'milk', 'million', 'minute', 'miss', 'mistake', 'model', 'modern', 'mom', 'moment', 'Monday', 'money', 'month', 'more', 'morning', 'most', 'mother', 'mountain', 'mouse', 'mouth', 'move', 'movie', 'much', 'museum', 'music', 'must', 'my', 'name', 'natural', 'near', 'need', 'negative', 'neighbor', 'neighborhood', 'never', 'new', 'news', 'newspaper', 'next', 'next', 'nice', 'night', 'nine', 'nineteen', 'ninety', 'no', 'no', 'nobody', 'north', 'nose', 'not', 'note', 'nothing', 'November', 'now', 'number', 'nurse', 'object', 'ocean', 'o’clock', 'October', 'of', 'off', 'office', 'often', 'oh', 'OK', 'old', 'on', 'once', 'one', 'onion', 'online', 'only', 'open', 'opinion', 'opposite', 'or', 'orange', 'order', 'other', 'our', 'out', 'outside', 'over', 'own', 'page', 'paint', 'painting', 'pair', 'pants', 'paper', 'paragraph', 'parent', 'park', 'part', 'partner', 'party', 'passport', 'past', 'pay', 'pen', 'pencil', 'people', 'pepper', 'perfect', 'period', 'person', 'personal', 'phone', 'photo', 'photograph', 'phrase', 'piano', 'picture', 'piece', 'pig', 'pink', 'place', 'plan', 'plane', 'plant', 'play', 'player', 'please', 'point', 'police', 'policeman', 'pool', 'poor', 'popular', 'positive', 'possible', 'post', 'potato', 'pound', 'practice', 'prefer', 'prepare', 'present', 'pretty', 'price', 'probably', 'problem', 'product', 'program', 'project', 'purple', 'put', 'quarter', 'question', 'quick', 'quickly', 'quiet', 'quite', 'radio', 'rain', 'read', 'reader', 'reading', 'ready', 'real', 'really', 'reason', 'red', 'relax', 'remember', 'repeat', 'report', 'restaurant', 'result', 'return', 'rice', 'rich', 'ride', 'right', 'river', 'road', 'room', 'routine', 'rule', 'run', 'sad', 'salad', 'salt', 'same', 'sandwich', 'Saturday', 'say', 'school', 'science', 'scientist', 'second', 'second', 'section', 'see', 'sell', 'send', 'sentence', 'September', 'seven', 'seventeen', 'seventy', 'share', 'she', 'sheep', 'shirt', 'shoe', 'shop', 'shopping', 'short', 'should', 'show', 'shower', 'sick', 'similar', 'sing', 'singer', 'sister', 'sit', 'situation', 'six', 'sixteen', 'sixty', 'skill', 'skirt', 'sleep', 'slow', 'small', 'smart', 'snake', 'snow', 'so', 'some', 'somebody', 'someone', 'something', 'sometimes', 'son', 'song', 'soon', 'sorry', 'sound', 'soup', 'south', 'space', 'speak', 'special', 'spell', 'spelling', 'spend', 'sport', 'spring', 'stand', 'star', 'start', 'statement', 'station', 'stay', 'still', 'stop', 'store', 'story', 'street', 'strong', 'student', 'study', 'style', 'subject', 'success', 'sugar', 'summer', 'sun', 'Sunday', 'supermarket', 'sure', 'sweater', 'swim', 'swimming', 'table', 'take', 'talk', 'tall', 'taxi', 'tea', 'teach', 'teacher', 'team', 'teenager', 'telephone', 'television', 'tell', 'ten', 'tennis', 'terrible', 'test', 'text', 'than', 'thank', 'thanks', 'that', 'the', 'theater', 'their', 'them', 'then', 'there', 'they', 'thing', 'think', 'third', 'thirsty', 'thirteen', 'thirty', 'this', 'thousand', 'three', 'through', 'Thursday', 'ticket', 'time', 'tired', 'title', 'to', 'today', 'together', 'toilet', 'tomato', 'tomorrow', 'tonight', 'too', 'tooth', 'topic', 'tourist', 'town', 'traffic', 'train', 'travel', 'tree', 'trip', 'truck', 'true', 'try', 'T-shirt', 'Tuesday', 'turn', 'TV', 'twelve', 'twenty', 'twice', 'two', 'type', 'umbrella', 'uncle', 'under', 'understand', 'university', 'until', 'up', 'upstairs', 'us', 'use', 'useful', 'usually', 'vacation', 'vegetable', 'very', 'video', 'visit', 'visitor', 'wait', 'waiter', 'wake', 'walk', 'wall', 'want', 'warm', 'wash', 'watch', 'water', 'way', 'we', 'wear', 'weather', 'website', 'Wednesday', 'week', 'weekend', 'welcome', 'well', 'west', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 'wife', 'will', 'win', 'window', 'wine', 'winter', 'with', 'without', 'woman', 'wonderful', 'word', 'work', 'worker', 'world', 'would', 'write', 'writer', 'writing', 'wrong', 'yard', 'yeah', 'year', 'yellow', 'yes', 'yesterday', 'you', 'young', 'your', 'yourself']

In [20]:
am3_words_a2 = ['ability', 'able', 'accept', 'accident', 'according', 'achieve', 'act', 'active', 'actually', 'adult', 'advantage', 'adventure', 'advertise', 'advertisement', 'advertising', 'affect', 'after', 'against', 'ah', 'airline', 'alive', 'all', 'all', 'allow', 'almost', 'alone', 'along', 'already', 'alternative', 'although', 'among', 'amount', 'analyze', 'ancient', 'ankle', 'any', 'anybody', 'anymore', 'anyway', 'anywhere', 'app', 'appear', 'appearance', 'apply', 'architect', 'architecture', 'argue', 'argument', 'army', 'arrange', 'arrangement', 'as', 'asleep', 'assistant', 'athlete', 'attack', 'attend', 'attention', 'attractive', 'audience', 'author', 'available', 'average', 'avoid', 'award', 'awful', 'back', 'background', 'badly', 'based', 'bean', 'bear', 'beat', 'beef', 'before', 'behave', 'behavior', 'belong', 'belt', 'benefit', 'best', 'better', 'between', 'billion', 'biology', 'birth', 'bit', 'blank', 'block', 'blood', 'blow', 'board', 'boil', 'bone', 'book', 'borrow', 'boss', 'bottom', 'bowl', 'brain', 'bridge', 'bright', 'brilliant', 'broken', 'brush', 'burn', 'businessman', 'button', 'camp', 'camping', 'campus', 'can', 'candy', 'care', 'careful', 'carefully', 'carpet', 'cartoon', 'case', 'cash', 'catch', 'cause', 'celebrate', 'celebrity', 'cell', 'century', 'certain', 'certainly', 'chance', 'character', 'charity', 'chat', 'check', 'chef', 'chemistry', 'chip', 'choice', 'church', 'cigarette', 'circle', 'classical', 'clear', 'clearly', 'clerk', 'climate', 'close', 'closed', 'closet', 'clothing', 'cloud', 'coach', 'coast', 'code', 'colleague', 'collect', 'column', 'comedy', 'comfortable', 'comment', 'communicate', 'community', 'compete', 'competition', 'complain', 'completely', 'condition', 'conference', 'connect', 'connected', 'consider', 'contain', 'context', 'continent', 'continue', 'control', 'cook', 'cookie', 'copy', 'corner', 'correctly', 'count', 'couple', 'cover', 'crazy', 'creative', 'credit', 'crime', 'criminal', 'cross', 'crowd', 'crowded', 'cry', 'curly', 'cycle', 'daily', 'danger', 'dark', 'data', 'dead', 'deal', 'death', 'decision', 'deep', 'definitely', 'degree', 'dentist', 'department', 'depend', 'desert', 'designer', 'dessert', 'destroy', 'detective', 'develop', 'device', 'diary', 'differently', 'digital', 'direct', 'direction', 'director', 'disagree', 'disappear', 'disaster', 'discover', 'discovery', 'discussion', 'disease', 'distance', 'divorced', 'document', 'double', 'download', 'downstairs', 'downtown', 'drama', 'drawing', 'dream', 'drive', 'driving', 'drop', 'drug', 'dry', 'earn', 'earth', 'easily', 'education', 'effect', 'either', 'electric', 'electrical', 'electricity', 'electronic', 'elevator', 'employ', 'employee', 'employer', 'empty', 'ending', 'energy', 'engine', 'engineer', 'enormous', 'enter', 'environment', 'equipment', 'error', 'especially', 'essay', 'everyday', 'everywhere', 'evidence', 'exact', 'exactly', 'excellent', 'except', 'exist', 'expect', 'experience', 'experiment', 'expert', 'explanation', 'express', 'expression', 'extreme', 'extremely', 'factor', 'factory', 'fail', 'fair', 'fan', 'farm', 'farming', 'fashion', 'fat', 'fear', 'feature', 'feed', 'female', 'fever', 'fiction', 'field', 'fight', 'figure', 'film', 'final', 'finally', 'finger', 'finish', 'fire', 'first', 'fish', 'fishing', 'fit', 'fix', 'flat', 'flu', 'fly', 'flying', 'focus', 'following', 'foreign', 'forest', 'fork', 'formal', 'fortunately', 'forward', 'free', 'fresh', 'frog', 'fun', 'furniture', 'further', 'future', 'gallery', 'gap', 'garbage', 'gas', 'gate', 'general', 'gift', 'goal', 'god', 'gold', 'golf', 'good', 'government', 'grass', 'greet', 'grocery', 'ground', 'guest', 'guide', 'gun', 'guy', 'habit', 'half', 'hall', 'happily', 'have', 'headache', 'heart', 'heat', 'heavy', 'height', 'helpful', 'hero', 'hers', 'herself', 'hide', 'high', 'hill', 'himself', 'his', 'hit', 'hockey', 'hold', 'hole', 'holiday', 'home', 'hope', 'huge', 'human', 'hurt', 'ideal', 'identify', 'ill', 'illness', 'image', 'immediately', 'impossible', 'included', 'including', 'increase', 'incredible', 'independent', 'individual', 'industry', 'informal', 'injury', 'insect', 'inside', 'instead', 'instruction', 'instructor', 'instrument', 'intelligent', 'international', 'introduction', 'invent', 'invention', 'invitation', 'invite', 'involve', 'item', 'itself', 'jam', 'jazz', 'jewelry', 'joke', 'journalist', 'jump', 'kid', 'kill', 'kilometer', 'king', 'knee', 'knife', 'knock', 'knowledge', 'lab', 'lady', 'lake', 'lamp', 'land', 'laptop', 'last', 'later', 'laughter', 'law', 'lawyer', 'lazy', 'lead', 'eader', 'learning', 'least', 'lecture', 'lemon', 'lend', 'less', 'level', 'lifestyle', 'lift', 'light', 'light', 'likely', 'link', 'listener', 'little', 'lock', 'look', 'lost', 'loud', 'loudly', 'low', 'luck', 'lucky', 'mail', 'major', 'male', 'manage', 'manager', 'manner', 'mark', 'marry', 'material', 'math', 'mathematics', 'matter', 'may', 'mayor', 'media', 'medical', 'medicine', 'memory', 'mention', 'metal', 'method', 'middle', 'might', 'mind', 'mine', 'mirror', 'missing', 'monkey', 'moon', 'mostly', 'motorcycle', 'movement', 'musical', 'musician', 'myself', 'narrow', 'national', 'nature', 'nearly', 'necessary', 'neck', 'need', 'neither', 'nervous', 'network', 'noise', 'noisy', 'none', 'normal', 'normally', 'notice', 'novel', 'nowhere', 'number', 'nut', 'offer', 'officer', 'oil', 'onto', 'opportunity', 'option', 'ordinary', 'organization', 'organize', 'original', 'ourselves', 'outside', 'oven', 'overseas', 'own', 'owner', 'pack', 'pain', 'painter', 'palace', 'parking', 'particular', 'pass', 'passenger', 'past', 'patient', 'pattern', 'pay', 'peace', 'penny', 'per', 'percent', 'perform', 'perhaps', 'permission', 'personality', 'pet', 'photograph', 'physical', 'physics', 'pick', 'pilot', 'planet', 'plant', 'plastic', 'plate', 'platform', 'please', 'pleased', 'pocket', 'polite', 'pollution', 'pop', 'population', 'position', 'possession', 'possibility', 'poster', 'power', 'predict', 'present', 'president', 'prevent', 'print', 'printer', 'prison', 'prize', 'process', 'produce', 'professional', 'professor', 'profile', 'progress', 'promise', 'pronounce', 'protect', 'provide', 'public', 'publish', 'pull', 'purpose', 'push', 'quality', 'quantity', 'queen', 'question', 'quietly', 'race', 'railroad', 'raise', 'rate', 'rather', 'reach', 'react', 'realize', 'receive', 'recent', 'recently', 'reception', 'recipe', 'recognize', 'recommend', 'record', 'recording', 'recycle', 'reduce', 'refer', 'refrigerator', 'refuse', 'region', 'regular', 'relationship', 'remove', 'repair', 'replace', 'reply', 'report', 'reporter', 'request', 'research', 'researcher', 'respond', 'response', 'rest', 'rest', 'review', 'ride', 'ring', 'rise', 'rock', 'rock', 'role', 'roof', 'round', 'route', 'rude', 'run', 'runner', 'running', 'sadly', 'safe', 'sail', 'sailing', 'salary', 'sale', 'sauce', 'save', 'scared', 'scary', 'scene', 'schedule', 'score', 'screen', 'sea', 'search', 'season', 'seat', 'second', 'secret', 'secretary', 'seem', 'sense', 'separate', 'series', 'serious', 'serve', 'service', 'several', 'shake', 'shape', 'sheet', 'ship', 'shoulder', 'shout', 'shut', 'side', 'sign', 'silver', 'simple', 'since', 'singing', 'single', 'sir', 'site', 'size', 'ski', 'skiing', 'skin', 'sky', 'sleep', 'slowly', 'smartphone', 'smell', 'smile', 'smoke', 'smoking', 'sneaker', 'soap', 'soccer', 'social', 'society', 'sock', 'soft', 'soldier', 'solution', 'solve', 'somewhere', 'sort', 'source', 'speaker', 'specific', 'speech', 'speed', 'spider', 'spoon', 'square', 'stage', 'stair', 'stamp', 'star', 'start', 'state', 'stay', 'steal', 'step', 'stomach', 'stone', 'storm', 'stove', 'straight', 'strange', 'strategy', 'stress', 'structure', 'stupid', 'subway', 'succeed', 'successful', 'such', 'suddenly', 'suggest', 'suggestion', 'suit', 'support', 'suppose', 'sure', 'surprise', 'surprised', 'surprising', 'survey', 'sweet', 'symbol', 'system', 'tablet', 'talk', 'target', 'task', 'taste', 'teaching', 'technology', 'teenage', 'temperature', 'term', 'text', 'themselves', 'thick', 'thief', 'thin', 'thinking', 'third', 'thought', 'throw', 'tie', 'tip', 'tool', 'top', 'touch', 'tour', 'tourism', 'toward', 'towel', 'tower', 'toy', 'track', 'tradition', 'traditional', 'train', 'training', 'transportation', 'trash', 'traveler', 'trouble', 'twin', 'typical', 'underground', 'understanding', 'unfortunately', 'unhappy', 'uniform', 'unit', 'united', 'unusual', 'upstairs', 'use', 'used', 'user', 'usual', 'valley', 'variety', 'vehicle', 'view', 'village', 'virus', 'voice', 'wait', 'war', 'wash', 'washing', 'wave', 'weak', 'web', 'wedding', 'weight', 'welcome', 'wet', 'wheel', 'while', 'whole', 'whose', 'wide', 'wild', 'wind', 'winner', 'wish', 'wood', 'wooden', 'working', 'worried', 'worry', 'worse', 'worst', 'wow', 'yet', 'yours', 'zero']

In [21]:
am3_words_b1 = ['absolutely', 'academic', 'access', 'account', 'achievement', 'act', 'ad', 'addition', 'administration', 'admire', 'admit', 'advanced', 'advise', 'afford', 'age', 'agent', 'agreement', 'ahead', 'aim', 'alarm', 'album', 'alcohol', 'alcoholic', 'alternative', 'amazed', 'ambition', 'analysis', 'announce', 'announcement', 'annoy', 'annoyed', 'annoying', 'apart', 'apologize', 'application', 'appointment', 'appreciate', 'approximately', 'arrest', 'arrival', 'assignment', 'assist', 'atmosphere', 'attach', 'attitude', 'attract', 'attraction', 'authority', 'automatic', 'automatically', 'average', 'award', 'aware', 'backward', 'bake', 'balance', 'ban', 'bank', 'base', 'basic', 'basis', 'battery', 'battle', 'beauty', 'bee', 'belief', 'bell', 'bend', 'benefit', 'better', 'bite', 'block', 'board', 'bomb', 'border', 'bother', 'branch', 'brand', 'brave', 'breath', 'breathe', 'breathing', 'bride', 'bubble', 'bury', 'by', 'cable', 'calm', 'campaign', 'candidate', 'cap', 'captain', 'careless', 'category', 'ceiling', 'celebration', 'center', 'central', 'ceremony', 'chain', 'challenge', 'champion', 'channel', 'chapter', 'charge', 'cheap', 'cheat', 'cheerful', 'chemical', 'chest', 'childhood', 'claim', 'clause', 'clear', 'clever', 'click', 'client', 'climb', 'close', 'cloth', 'clue', 'coach', 'coal', 'coin', 'collection', 'colored', 'combine', 'comment', 'commercial', 'commit', 'communication', 'comparison', 'competitive', 'competitor', 'complaint', 'complex', 'concentrate', 'conclude', 'conclusion', 'confident', 'confirm', 'confuse', 'confused', 'connection', 'consequence', 'consist', 'consume', 'consumer', 'contact', 'container', 'content1', 'continuous', 'contrast', 'convenient', 'convince', 'cool', 'corn', 'costume', 'cotton', 'count', 'countryside', 'court', 'cover', 'covered', 'cream', 'criminal', 'cruel', 'cultural', 'cupboard', 'currency', 'current', 'currently', 'curtain', 'custom', 'cut', 'daily', 'damage', 'deal', 'decade', 'deep', 'define', 'definite', 'definition', 'deliver', 'departure', 'despite', 'destination', 'determine', 'determined', 'development', 'diagram', 'diamond', 'difficulty', 'direct', 'directly', 'dirt', 'disadvantage', 'disappointed', 'disappointing', 'discount', 'dislike', 'district', 'divide', 'documentary', 'donate', 'double', 'doubt', 'dressed', 'drop', 'drum', 'drunk', 'due', 'dust', 'duty', 'earthquake', 'eastern', 'economic', 'economy', 'edge', 'editor', 'educate', 'educated', 'educational', 'effective', 'effectively', 'effort', 'election', 'element', 'embarrassed', 'embarrassing', 'emergency', 'emotion', 'employment', 'empty', 'encourage', 'enemy', 'engaged', 'engineering', 'entertain', 'entertainment', 'entrance', 'entry', 'environmental', 'episode', 'equal', 'equally', 'escape', 'essential', 'eventually', 'examine', 'except', 'exchange', 'excitement', 'exhibition', 'exit', 'expand', 'expected', 'experience', 'experienced', 'experiment', 'explode', 'explore', 'explosion', 'export', 'extra', 'face', 'fairly', 'familiar', 'fancy', 'far', 'fascinating', 'fashionable', 'fasten', 'favor', 'fear', 'feature', 'federal', 'fence', 'fighting', 'file', 'financial', 'fitness', 'fixed', 'flag', 'flood', 'flour', 'flow', 'fold', 'folk', 'following', 'force', 'forever', 'frame', 'freeze', 'frequently', 'friendship', 'frighten', 'frightened', 'frightening', 'frozen', 'fry', 'fuel', 'function', 'fur', 'further', 'garage', 'gather', 'generally', 'generation', 'generous', 'gentle', 'gentleman', 'ghost', 'giant', 'glad', 'global', 'glove', 'go', 'grade', 'graduate', 'grain', 'grateful', 'growth', 'guard', 'guilty', 'hand', 'hang', 'happiness', 'hardly', 'hate', 'head', 'headline', 'heating', 'heavily', 'helicopter', 'highlight', 'highly', 'highway', 'hire', 'historic', 'historical', 'honest', 'horrible', 'horror', 'host', 'hunt', 'hurricane', 'hurry', 'identity', 'ignore', 'illegal', 'imaginary', 'immediate', 'immigrant', 'impact', 'import', 'importance', 'impression', 'impressive', 'improvement', 'incredibly', 'indeed', 'indicate', 'indirect', 'indoor', 'indoors', 'influence', 'ingredient', 'injure', 'injured', 'innocent', 'intelligence', 'intend', 'intention', 'invest', 'investigate', 'involved', 'iron', 'issue', 'IT', 'journal', 'journey', 'judge', 'key', 'keyboard', 'kick', 'killing', 'kind', 'kiss', 'knock', 'label', 'laboratory', 'lack', 'latest', 'lay', 'layer', 'lead', 'leading', 'leaf', 'leather', 'legal', 'leisure', 'length', 'level', 'lie', 'like', 'limit', 'lip', 'liquid', 'literature', 'live', 'living', 'local', 'locate', 'located', 'location', 'lonely', 'loss', 'luxury', 'mad', 'magic', 'mainly', 'management', 'market', 'marketing', 'marriage', 'meanwhile', 'measure', 'medium', 'mental', 'mention', 'mess', 'mild', 'mine', 'mix', 'mixture', 'mood', 'move', 'mud', 'murder', 'muscle', 'musical', 'mystery', 'nail', 'narrative', 'nation', 'native', 'naturally', 'neat', 'necessarily', 'need', 'needle', 'neither', 'net', 'next', 'nor', 'normal', 'northern', 'note', 'now', 'nuclear', 'obvious', 'obviously', 'occasion', 'occur', 'odd', 'official', 'old-fashioned', 'once', 'operation', 'organized', 'organizer', 'original', 'originally', 'ought', 'ours', 'outdoor', 'outdoors', 'overseas', 'pack', 'package', 'painful', 'pale', 'pan', 'participate', 'particularly', 'pass', 'passion', 'path', 'payment', 'peaceful', 'percentage', 'perfectly', 'performance', 'personally', 'persuade', 'photographer', 'photography', 'pin', 'pipe', 'place', 'planning', 'pleasant', 'pleasure', 'plenty', 'plot', 'plus', 'poem', 'poet', 'poetry', 'point', 'poison', 'poisonous', 'policy', 'political', 'politician', 'politics', 'port', 'portrait', 'possibly', 'pot', 'pour', 'poverty', 'powder', 'powerful', 'practical', 'pray', 'prayer', 'prediction', 'prepared', 'presentation', 'press', 'pressure', 'pretend', 'previous', 'previously', 'priest', 'primary', 'prince', 'princess', 'principal', 'printing', 'prisoner', 'private', 'producer', 'production', 'profession', 'profit', 'program', 'promote', 'proper', 'properly', 'property', 'protest', 'proud', 'prove', 'pull', 'punish', 'punishment', 'push', 'qualification', 'qualified', 'qualify', 'quit', 'quotation', 'quote', 'race', 'racing', 'raise', 'range', 'rare', 'rarely', 'reaction', 'reality', 'receipt', 'recommendation', 'reference', 'reflect', 'regularly', 'reject', 'relate', 'related', 'relation', 'relative', 'relaxed', 'relaxing', 'release', 'reliable', 'religion', 'religious', 'remain', 'remind', 'remote', 'rent', 'repair', 'repeat', 'repeated', 'represent', 'request', 'require', 'reservation', 'resource', 'respect', 'responsibility', 'responsible', 'result', 'retire', 'retired', 'revise', 'ring', 'risk', 'robot', 'roll', 'romantic', 'rope', 'rough', 'row1', 'royal', 'rule', 'safety', 'sail', 'sailor', 'sample', 'sand', 'scan', 'scientific', 'script', 'sculpture', 'secondary', 'security', 'seed', 'sensible', 'separate', 'seriously', 'servant', 'set', 'set', 'setting', 'sex', 'sexual', 'shake', 'share', 'sharp', 'shelf', 'shell', 'shift', 'shine', 'shiny', 'shoot', 'shy', 'sight', 'signal', 'silent', 'silly', 'similarity', 'similarly', 'simply', 'since', 'sink', 'slave', 'slice', 'slightly', 'slow', 'smooth', 'software', 'soil', 'solid', 'sort', 'southern', 'specifically', 'spending', 'spicy', 'spirit', 'spoken', 'spot', 'spread', 'spring', 'stadium', 'staff', 'standard', 'state', 'statistic', 'statue', 'stick', 'stick', 'still', 'store', 'stranger', 'strength', 'string', 'strongly', 'studio', 'stuff', 'substance', 'successfully', 'sudden', 'suffer', 'suit', 'suitable', 'summarize', 'summary', 'supply', 'supporter', 'surely', 'surface', 'survive', 'swim', 'switch', 'symptom', 'tail', 'talent', 'talented', 'tape', 'tax', 'technical', 'technique', 'tend', 'tent', 'that', 'theirs', 'theme', 'theory', 'therefore', 'this', 'though', 'throat', 'throughout', 'tight', 'till', 'tiny', 'tip', 'tire', 'toe', 'ton', 'tongue', 'total', 'totally', 'touch', 'tour', 'trade', 'trainer', 'translate', 'translation', 'transport', 'treat', 'treatment', 'trend', 'trick', 'truth', 'tube', 'type', 'typically', 'ugly', 'unable', 'uncomfortable', 'underwear', 'unemployed', 'unemployment', 'unfair', 'union', 'unless', 'unlike', 'unlikely', 'unnecessary', 'unpleasant', 'update', 'upon', 'upset', 'used', 'used', 'valuable', 'value', 'various', 'version', 'victim', 'view', 'viewer', 'violent', 'volunteer', 'vote', 'warm', 'warn', 'warning', 'waste', 'water', 'wave', 'weapon', 'weigh', 'western', 'whatever', 'whenever', 'whether', 'while', 'whole', 'will', 'win', 'wing', 'within', 'wonder', 'wool', 'worldwide', 'worry', 'worse', 'worst', 'worth', 'written', 'wrong', 'young', 'youth']

In [22]:
am3_words_b2 = ['abandon', 'abroad', 'absolute', 'acceptable', 'accompany', 'account', 'accurate', 'accuse', 'acknowledge', 'acquire', 'actual', 'adapt', 'additional', 'address', 'adopt', 'advance', 'affair', 'afterward', 'agency', 'agenda', 'aggressive', 'aid', 'aircraft', 'alarm', 'alter', 'amount', 'anger', 'angle', 'anniversary', 'annual', 'anxious', 'apparent', 'apparently', 'appeal', 'approach', 'appropriate', 'approval', 'approve', 'arise', 'armed', 'arms', 'artificial', 'artistic', 'ashamed', 'aside', 'aspect', 'assess', 'assessment', 'associate', 'associated', 'association', 'assume', 'attempt', 'attorney', 'back', 'bacteria', 'bar', 'barrier', 'basically', 'battle', 'bear', 'beat', 'beg', 'being', 'bent', 'bet', 'beyond', 'bill', 'bitter', 'blame', 'blind', 'bond', 'border', 'breast', 'brief', 'broad', 'broadcast', 'budget', 'bullet', 'bunch', 'burn', 'bush', 'but', 'calculate', 'cancel', 'cancer', 'capable', 'capacity', 'capture', 'cast', 'catch', 'chain', 'chair', 'chairman', 'challenge', 'characteristic', 'chart', 'chief', 'circumstance', 'cite', 'citizen', 'civil', 'classic', 'close', 'closely', 'collapse', 'combination', 'comfort', 'command', 'commission', 'commitment', 'committee', 'commonly', 'complex', 'complicated', 'component', 'concentration', 'concept', 'concern', 'concerned', 'conduct', 'confidence', 'conflict', 'confusing', 'congress', 'conscious', 'conservative', 'consideration', 'consistent', 'constant', 'constantly', 'construct', 'construction', 'contemporary', 'contest', 'contract', 'contribute', 'contribution', 'convert', 'convinced', 'core', 'corporate', 'council', 'county', 'courage', 'crash', 'creation', 'creature', 'credit', 'crew', 'crisis', 'criterion', 'critic', 'critical', 'criticism', 'criticize', 'crop', 'crucial', 'cry', 'cure', 'current', 'curve', 'curved', 'date', 'debate', 'debt', 'decent', 'declare', 'decline', 'decorate', 'decoration', 'decrease', 'deeply', 'defeat', 'defend', 'defense', 'delay', 'deliberate', 'deliberately', 'delivery', 'demand', 'demonstrate', 'deny', 'depressed', 'depressing', 'depth', 'desert', 'deserve', 'desire', 'desperate', 'detail', 'detailed', 'detect', 'dig', 'discipline', 'discount', 'dishonest', 'disk', 'dismiss', 'display', 'distribute', 'distribution', 'divide', 'division', 'document', 'domestic', 'dominate', 'downward', 'dozen', 'draft', 'drag', 'dramatic', 'edit', 'edition', 'efficient', 'elderly', 'elect', 'elsewhere', 'emerge', 'emotional', 'emphasis', 'emphasize', 'enable', 'encounter', 'engage', 'enhance', 'ensure', 'enthusiasm', 'enthusiastic', 'entire', 'entirely', 'equal', 'establish', 'estate', 'estimate', 'ethical', 'evaluate', 'even', 'evil', 'examination', 'excuse', 'executive', 'exhibit', 'existence', 'exit', 'expectation', 'expense', 'exploration', 'expose', 'extend', 'extent', 'external', 'extraordinary', 'extreme', 'facility', 'failure', 'faith', 'fault', 'favor', 'feather', 'fee', 'feed', 'feedback', 'feel', 'fellow', 'figure', 'file', 'finance', 'finding', 'firm', 'fix', 'flame', 'flash', 'flexible', 'float', 'fold', 'folding', 'following', 'forgive', 'former', 'fortune', 'forward', 'found', 'free', 'freedom', 'frequency', 'fuel', 'fully', 'function', 'fund', 'fundamental', 'funding', 'furthermore', 'gain', 'gang', 'generate', 'genre', 'goods', 'govern', 'governor', 'grab', 'grade', 'gradually', 'grand', 'grant', 'guarantee', 'handle', 'harm', 'harmful', 'hearing', 'heaven', 'heel', 'hell', 'hesitate', 'high', 'hire', 'hold', 'hollow', 'holy', 'honor', 'host', 'house', 'household', 'housing', 'humor', 'humorous', 'hunt', 'hunting', 'hurt', 'ideal', 'illustrate', 'illustration', 'imagination', 'impatient', 'imply', 'impose', 'impress', 'impressed', 'inch', 'incident', 'income', 'increasingly', 'industrial', 'infection', 'inform', 'initial', 'initially', 'initiative', 'inner', 'inquiry', 'insight', 'insist', 'inspire', 'install', 'instance', 'institute', 'institution', 'insurance', 'intended', 'intense', 'internal', 'interpret', 'interrupt', 'investigation', 'investment', 'issue', 'joy', 'judgment', 'junior', 'justice', 'justify', 'labor', 'landscape', 'largely', 'latest', 'launch', 'leadership', 'league', 'lean', 'leave', 'level', 'license', 'limited', 'line', 'lively', 'load', 'loan', 'logical', 'long-term', 'loose', 'lord', 'low', 'lower', 'lung', 'maintain', 'major', 'majority', 'make', 'map', 'mass', 'massive', 'master', 'matching', 'material', 'maximum', 'means', 'measurement', 'medium', 'melt', 'military', 'mineral', 'minimum', 'minister', 'minor', 'minority', 'mission', 'mistake', 'mixed', 'model', 'modify', 'monitor', 'moral', 'motor', 'mount', 'multiple', 'multiply', 'mysterious', 'narrow', 'national', 'negative', 'nerve', 'nevertheless', 'nightmare', 'notion', 'numerous', 'obey', 'object', 'objective', 'obligation', 'observation', 'observe', 'obtain', 'occasionally', 'offend', 'offense', 'offensive', 'official', 'opening', 'operate', 'opponent', 'oppose', 'opposed', 'opposition', 'organ', 'origin', 'otherwise', 'outcome', 'outer', 'outline', 'overall', 'owe', 'pace', 'package', 'panel', 'participant', 'partly', 'passage', 'patient', 'permanent', 'permit', 'perspective', 'phase', 'phenomenon', 'philosophy', 'pick', 'picture', 'pile', 'pitch', 'plain', 'plot', 'plus', 'pointed', 'popularity', 'pose', 'position', 'positive', 'possess', 'potential', 'power', 'praise', 'pregnant', 'preparation', 'presence', 'preserve', 'price', 'prime', 'principal', 'principle', 'print', 'priority', 'privacy', 'procedure', 'process', 'produce', 'professional', 'progress', 'project', 'proof', 'proposal', 'propose', 'prospect', 'protection', 'psychologist', 'psychology', 'publication', 'purchase', 'pure', 'pursue', 'range', 'rank', 'rapid', 'rapidly', 'rate', 'raw', 'reach', 'realistic', 'reasonable', 'recall', 'recover', 'reduction', 'regard', 'regional', 'register', 'regret', 'regulation', 'relatively', 'relevant', 'relief', 'rely', 'remark', 'representative', 'reputation', 'requirement', 'rescue', 'reserve', 'resident', 'resist', 'resolve', 'resort', 'retain', 'reveal', 'revolution', 'reward', 'rhythm', 'rid', 'rise', 'root', 'round', 'routine', 'rub', 'rubber', 'rural', 'rush', 'sample', 'satellite', 'satisfied', 'satisfy', 'saving', 'scale', 'schedule', 'scream', 'screen', 'seat', 'sector', 'secure', 'seek', 'select', 'selection', 'self', 'senate', 'senator', 'senior', 'sense', 'sensitive', 'sentence', 'sequence', 'session', 'settle', 'severe', 'shade', 'shadow', 'shall', 'shallow', 'shame', 'shape', 'shelter', 'shift', 'ship', 'shock', 'shocked', 'shooting', 'shot', 'significant', 'significantly', 'silence', 'silk', 'sincere', 'slide', 'slight', 'slip', 'slope', 'solar', 'somewhat', 'soul', 'specialist', 'species', 'speed', 'spiritual', 'split', 'sponsor', 'spot', 'spread', 'stable', 'stage', 'stand', 'stare', 'status', 'steady', 'steel', 'steep', 'step', 'sticky', 'stiff', 'stock', 'stream', 'stretch', 'strict', 'strike', 'structure', 'struggle', 'stuff', 'subject', 'submit', 'sum', 'surgery', 'surround', 'surrounding', 'survey', 'suspect', 'swear', 'sweep', 'switch', 'sympathy', 'tale', 'tank', 'target', 'tear', 'temporary', 'term', 'therapy', 'threat', 'threaten', 'thus', 'time', 'title', 'tone', 'tough', 'track', 'transfer', 'transform', 'transition', 'trial', 'trip', 'tropical', 'trouble', 'truly', 'trust', 'try', 'tune', 'tunnel', 'ultimately', 'unconscious', 'unexpected', 'unique', 'universe', 'unknown', 'upper', 'upward', 'urban', 'urge', 'value', 'van', 'vary', 'vast', 'venue', 'very', 'via', 'victory', 'violence', 'virtual', 'vision', 'visual', 'vital', 'vitamin', 'volume', 'wage', 'way', 'weakness', 'wealth', 'wealthy', 'whereas', 'wherever', 'whisper', 'whom', 'widely', 'wildlife', 'willing', 'wind', 'wire', 'wise', 'witness', 'worse', 'worst', 'worth', 'wound', 'wrap', 'wrong', 'yet', 'zone']

Аналогичные шаги проделаем для остальных трех словарей:

In [23]:
reader = PdfReader('American_Oxford_5000_by_CEFR_level.pdf')
  
text = []
for x in range(len(reader.pages)):
    page = reader.pages[x]    
    page_text = page.extract_text().split('\n')
    for y in page_text:
        text.append(y)

b2 = []
c1 = []
b2_trigger = 0
c1_trigger = 0

while(len(text) > 0):
    x = text.pop(0)
    if b2_trigger == 1:
        b2.append(x)
    if x == 'B2':
        b2_trigger = 1
        c1_trigger = 0
        
    if c1_trigger == 1:
        c1.append(x)
    if x == 'C1':
        b2_trigger = 0
        c1_trigger = 1
        
am5_words_b2 = []
am5_words_c1 = []
for x in b2:
    am5_words_b2.append(x.split()[0])
for x in c1:
    am5_words_c1.append(x.split()[0])

In [24]:
am5_words_b2 = ['absorb', 'abstract', 'accent', 'accidentally', 'accommodate', 'accommodation', 'accomplish', 'accountant', 'accuracy', 'accurately', 'acid', 'acre', 'activate', 'addiction', 'additionally', 'adequate', 'adequately', 'adjust', 'affordable', 'aged', 'agriculture', 'AIDS', 'alien', 'alongside', 'altogether', 'ambitious', 'ambulance', 'amusing', 'analyst', 'ancestor', 'animation', 'annually', 'anticipate', 'anxiety', 'apology', 'applicant', 'appropriately', 'arrow', 'artwork', 'asset', 'assign', 'assistance', 'assumption', 'assure', 'astonishing', 'athletic', 'attachment', 'audio', 'awareness', 'awkward', 'badge', 'balanced', 'ballet', 'balloon', 'barely', 'bargain', 'basement', 'basket', 'bat', 'beneficial', 'beside', 'besides', 'bias', 'bid', 'biological', 'blanket', 'blow', 'bold', 'bombing', 'boost', 'bound', 'brick', 'briefly', 'broadcaster', 'broadly', 'buck', 'bug', 'cabin', 'canal', 'candle', 'carbon', 'castle', 'casual', 'cave', 'certainty', 'certificate', 'challenging', 'championship', 'charming', 'chase', 'cheek', 'cheer', 'chop', 'circuit', 'civilization', 'clarify', 'classify', 'cliff', 'clinic', 'clip', 'coincidence', 'collector', 'colony', 'colorful', 'comic', 'commander', 'comparative', 'completion', 'compose', 'composer', 'compound', 'comprehensive', 'comprise', 'compulsory', 'concrete', 'confess', 'confusion', 'consequently', 'conservation', 'considerable', 'considerably', 'consistently', 'conspiracy', 'consult', 'consultant', 'consumption', 'controversial', 'controversy', 'convenience', 'convention', 'conventional', 'convey', 'convincing', 'cop', 'cope', 'corporation', 'corridor', 'counter', 'coverage', 'cowboy', 'crack', 'craft', 'creativity', 'critically', 'cruise', 'cue', 'curious', 'curriculum', 'cute', 'dairy', 'dare', 'darkness', 'database', 'deadline', 'deadly', 'dealer', 'deck', 'defender', 'delete', 'delighted', 'democracy', 'democratic', 'demonstration', 'depart', 'dependent', 'deposit', 'depression', 'derive', 'desperately', 'destruction', 'determination', 'devote', 'differ', 'dime', 'disability', 'disabled', 'disagreement', 'disappoint', 'disappointment', 'discourage', 'disorder', 'distant', 'distinct', 'distinguish', 'distract', 'disturb', 'dive', 'diverse', 'diversity', 'divorce', 'dominant', 'donation', 'dot', 'dramatically', 'drought', 'dull', 'dump', 'duration', 'dynamic', 'eager', 'economics', 'economist', 'editorial', 'efficiently', 'elbow', 'electronics', 'elegant', 'elementary', 'eliminate', 'embrace', 'emission', 'emotionally', 'empire', 'enjoyable', 'entertaining', 'entrepreneur', 'envelope', 'equip', 'equivalent', 'era', 'erupt', 'essentially', 'ethic', 'ethnic', 'evaluation', 'evident', 'evolution', 'evolve', 'exceed', 'exception', 'excessive', 'exclude', 'exotic', 'expansion', 'expedition', 'expertise', 'exploit', 'exposure', 'extension', 'extensive', 'extensively', 'extract', 'fabric', 'fabulous', 'faculty', 'failed', 'fake', 'fame', 'fantasy', 'fare', 'firefighter', 'firework', 'firm', 'firmly', 'flavor', 'fond', 'fool', 'forbid', 'forecast', 'format', 'formation', 'formerly', 'fortunate', 'forum', 'fossil', 'foundation', 'founder', 'fraction', 'fragment', 'framework', 'fraud', 'freely', 'frequent', 'fulfill', 'full-time', 'fundamentally', 'furious', 'gallon', 'gaming', 'gay', 'gender', 'gene', 'genetic', 'genius', 'genuine', 'genuinely', 'gesture', 'globalization', 'globe', 'golden', 'goodness', 'gorgeous', 'graphic', 'graphics', 'greatly', 'greenhouse', 'guideline', 'habitat', 'harbor', 'headquarters', 'heal', 'healthcare', 'helmet', 'hence', 'herb', 'hidden', 'hilarious', 'hip', 'historian', 'homeless', 'honesty', 'honey', 'hook', 'hopefully', 'hunger', 'hypothesis', 'icon', 'ID', 'identical', 'illusion', 'immigration', 'immune', 'implement', 'implication', 'incentive', 'incorporate', 'incorrect', 'independence', 'index', 'indication', 'inevitable', 'inevitably', 'infer', 'inflation', 'info', 'infrastructure', 'inhabitant', 'inherit', 'ink', 'innovation', 'innovative', 'input', 'insert', 'inspector', 'installation', 'instant', 'instantly', 'integrate', 'intellectual', 'interact', 'interaction', 'interpretation', 'interval', 'invade', 'invasion', 'investor', 'isolate', 'isolated', 'jail', 'jet', 'joint', 'journalism', 'jury', 'kindergarten', 'kit', 'ladder', 'landing', 'lane', 'lately', 'legend', 'lens', 'lifetime', 'lighting', 'likewise', 'limitation', 'literally', 'literary', 'litter', 'logo', 'lottery', 'loyal', 'lyric', 'makeup', 'making', 'manufacture', 'manufacturing', 'marathon', 'margin', 'marker', 'martial', 'mate', 'mechanic', 'mechanical', 'mechanism', 'medal', 'medication', 'membership', 'memorable', 'metaphor', 'miner', 'miserable', 'mode', 'modest', 'monster', 'monthly', 'monument', 'moreover', 'mortgage', 'mosque', 'mosquito', 'motion', 'motivate', 'motivation', 'moving', 'myth', 'naked', 'nasty', 'navigation', 'nearby', 'necessity', 'negotiate', 'negotiation', 'neutral', 'newly', 'nickel', 'norm', 'notebook', 'novelist', 'nowadays', 'nursing', 'nutrition', 'obesity', 'observer', 'obstacle', 'occupation', 'occupy', 'offender', 'ongoing', 'openly', 'opera', 'operator', 'optimistic', 'orchestra', 'organic', 'outfit', 'output', 'outstanding', 'overcome', 'overnight', 'ownership', 'oxygen', 'packet', 'palm', 'panic', 'parade', 'parallel', 'participation', 'partnership', 'part-time', 'passionate', 'password', 'patch', 'patience', 'pause', 'peer', 'penalty', 'perceive', 'perception', 'permanently', 'pharmacy', 'physician', 'pill', 'pity', 'placement', 'portion', 'potentially', 'precede', 'precious', 'precise', 'precisely', 'predictable', 'preference', 'presidential', 'pride', 'primarily', 'prior', 'probability', 'probable', 'proceed', 'programming', 'progressive', 'prohibit', 'promising', 'promotion', 'prompt', 'proportion', 'protein', 'protester', 'psychological', 'publicity', 'publishing', 'punk', 'purely', 'pursuit', 'puzzle', 'questionnaire', 'racial', 'racism', 'racist', 'radiation', 'rail', 'random', 'rat', 'rating', 'reasonably', 'rebuild', 'receiver', 'recession', 'reckon', 'recognition', 'recovery', 'recruit', 'referee', 'refugee', 'registration', 'regulate', 'reinforce', 'relieve', 'relieved', 'remarkable', 'remarkably', 'rental', 'reporting', 'resign', 'resolution', 'restore', 'restrict', 'restriction', 'résumé', 'retail', 'retirement', 'revenue', 'revision', 'ridiculous', 'risky', 'rival', 'rob', 'robbery', 'rocket', 'romance', 'rose', 'roughly', 'ruin', 'satisfaction', 'scandal', 'scare', 'scenario', 'scholar', 'scholarship', 'scratch', 'screening', 'seeker', 'seminar', 'settler', 'severely', 'sexy', 'shaped', 'shocking', 'shore', 'shortage', 'shortly', 'short-term', 'sibling', 'sidewalk', 'signature', 'significance', 'skilled', 'skull', 'slogan', 'so-called', 'somehow', 'sometime', 'sophisticated', 'spare', 'specialize', 'specify', 'spectacular', 'spectator', 'speculate', 'speculation', 'spice', 'spill', 'spite', 'spoil', 'spokesman', 'spokesperson', 'spokeswoman', 'sponsorship', 'stall', 'stance', 'starve', 'steadily', 'steam', 'stimulate', 'strengthen', 'strictly', 'stroke', 'stunning', 'subsequent', 'subsequently', 'suburb', 'suffering', 'sufficient', 'sufficiently', 'super', 'surgeon', 'survival', 'survivor', 'suspend', 'sustainable', 'swallow', 'sympathetic', 'tackle', 'tag', 'tap', 'technological', 'teen', 'temple', 'temporarily', 'tendency', 'tension', 'terminal', 'terms', 'terribly', 'terrify', 'territory', 'terror', 'terrorism', 'terrorist', 'testing', 'textbook', 'theft', 'therapist', 'thesis', 'thorough', 'thoroughly', 'thumb', 'timing', 'tissue', 'tournament', 'trace', 'trading', 'tragedy', 'tragic', 'trait', 'transmit', 'trap', 'treasure', 'tribe', 'trigger', 'trillion', 'troop', 'tsunami', 'ultimate', 'unacceptable', 'uncertainty', 'undergo', 'undertake', 'unfold', 'unfortunate', 'unite', 'unity', 'universal', 'urgent', 'usage', 'useless', 'valid', 'variation', 'vertical', 'viewpoint', 'visa', 'visible', 'voluntary', 'voting', 'wander', 'warming', 'weekly', 'weird', 'welfare', 'wheat', 'whoever', 'widespread', 'wisdom', 'withdraw', 'wolf', 'workforce', 'workplace', 'workshop', 'worm', 'wrist']

In [25]:
am5_words_c1 = ['abolish', 'abortion', 'absence', 'absent', 'absurd', 'abuse', 'academy', 'accelerate', 'acceptance', 'accessible', 'accomplishment', 'accordingly', 'accountability', 'accountable', 'accumulate', 'accumulation', 'accusation', 'accused', 'acid', 'acquisition', 'activation', 'activist', 'acute', 'adaptation', 'adhere', 'adjacent', 'adjustment', 'administer', 'administrative', 'administrator', 'admission', 'adolescent', 'adoption', 'adverse', 'advocate', 'aesthetic', 'affection', 'aftermath', 'aggression', 'agricultural', 'aide', 'alert', 'alien', 'align', 'alignment', 'alike', 'allegation', 'allege', 'allegedly', 'alliance', 'allocate', 'allocation', 'allowance', 'ally', 'aluminum', 'amateur', 'ambassador', 'amend', 'amendment', 'amid', 'analogy', 'anchor', 'angel', 'anonymous', 'apparatus', 'apparel', 'appealing', 'appetite', 'applaud', 'applicable', 'appoint', 'appreciation', 'arbitrary', 'architectural', 'archive', 'arena', 'arm', 'array', 'articulate', 'ash', 'aspiration', 'aspire', 'assassination', 'assault', 'assemble', 'assembly', 'assert', 'assertion', 'assurance', 'asylum', 'atrocity', 'attain', 'attendance', 'attribute', 'auction', 'audit', 'authentic', 'authorize', 'auto', 'autonomy', 'autumn', 'availability', 'await', 'backdrop', 'backing', 'backup', 'bail', 'ballot', 'bankruptcy', 'banner', 'bare', 'barrel', 'bass', 'bat', 'battlefield', 'bay', 'beam', 'beast', 'behalf', 'behavioral', 'beloved', 'bench', 'benchmark', 'beneath', 'beneficiary', 'betray', 'beverage', 'bind', 'biography', 'bishop', 'bizarre', 'blade', 'blast', 'bleed', 'blend', 'bless', 'blessing', 'boast', 'bonus', 'booking', 'boom', 'bounce', 'boundary', 'bow', 'breach', 'breakdown', 'breakthrough', 'breed', 'broadband', 'browser', 'brutal', 'buddy', 'buffer', 'bulk', 'burden', 'bureaucracy', 'burial', 'burst', 'cabinet', 'calculation', 'canvas', 'capability', 'capitalism', 'capitalist', 'cargo', 'carriage', 'carve', 'casino', 'casualty', 'catalog', 'cater', 'cattle', 'caution', 'cautious', 'cease', 'cemetery', 'chamber', 'chaos', 'characterize', 'charm', 'charter', 'choir', 'chronic', 'chunk', 'circulate', 'circulation', 'citizenship', 'civic', 'civilian', 'clarity', 'clash', 'classification', 'cling', 'clinical', 'closure', 'cluster', 'coalition', 'coastal', 'cocktail', 'cognitive', 'coincide', 'collaborate', 'collaboration', 'collective', 'collision', 'colonial', 'columnist', 'combat', 'commence', 'commentary', 'commentator', 'commerce', 'commissioner', 'commodity', 'communist', 'companion', 'comparable', 'compassion', 'compel', 'compelling', 'compensate', 'compensation', 'competence', 'competent', 'compile', 'complement', 'complexity', 'compliance', 'complication', 'comply', 'composition', 'compromise', 'compute', 'conceal', 'concede', 'conceive', 'conception', 'concession', 'condemn', 'confer', 'confession', 'configuration', 'confine', 'confirmation', 'confront', 'confrontation', 'congratulate', 'congregation', 'congressional', 'conquer', 'conscience', 'consciousness', 'consecutive', 'consensus', 'consent', 'conserve', 'consistency', 'consolidate', 'constitute', 'constitution', 'constitutional', 'constraint', 'consultation', 'contemplate', 'contempt', 'contend', 'contender', 'content', 'contention', 'continually', 'contractor', 'contradiction', 'contrary', 'contributor', 'conversion', 'convict', 'conviction', 'cooperate', 'cooperative', 'coordinate', 'coordination', 'coordinator', 'copper', 'copyright', 'correction', 'correlate', 'correlation', 'correspond', 'correspondence', 'correspondent', 'corresponding', 'corrupt', 'corruption', 'costly', 'councilor', 'counseling', 'counselor', 'counter', 'counterpart', 'countless', 'coup', 'courtesy', 'craft', 'crawl', 'creator', 'credibility', 'credible', 'creep', 'critique', 'crown', 'crude', 'crush', 'crystal', 'cult', 'cultivate', 'curiosity', 'custody', 'cutting', 'cynical', 'dam', 'damaging', 'dawn', 'debris', 'debut', 'decision-making', 'decisive', 'declaration', 'dedicated', 'dedication', 'deed', 'deem', 'default', 'defect', 'defensive', 'deficiency', 'deficit', 'defy', 'delegate', 'delegation', 'delicate', 'demon', 'denial', 'denounce', 'dense', 'density', 'dependence', 'depict', 'deploy', 'deployment', 'deprive', 'deputy', 'descend', 'descent', 'designate', 'desirable', 'desktop', 'destructive', 'detain', 'detection', 'detention', 'deteriorate', 'devastate', 'devil', 'devise', 'diagnose', 'diagnosis', 'dictate', 'dictator', 'differentiate', 'dignity', 'dilemma', 'dimension', 'diminish', 'dip', 'diplomat', 'diplomatic', 'directory', 'disastrous', 'discard', 'discharge', 'disclose', 'disclosure', 'discourse', 'discretion', 'discrimination', 'dismissal', 'displace', 'disposal', 'dispose', 'dispute', 'disrupt', 'disruption', 'dissolve', 'distinction', 'distinctive', 'distort', 'distress', 'disturbing', 'divert', 'divine', 'doctrine', 'documentation', 'domain', 'dominance', 'donor', 'dose', 'drain', 'drift', 'driving', 'drown', 'dual', 'dub', 'dumb', 'duo', 'dynamic', 'earnings', 'ease', 'echo', 'ecological', 'educator', 'effectiveness', 'efficiency', 'ego', 'elaborate', 'electoral', 'elevate', 'eligible', 'elite', 'embark', 'embarrassment', 'embassy', 'embed', 'embody', 'emergence', 'empirical', 'empower', 'enact', 'encompass', 'encouragement', 'encouraging', 'endeavor', 'endless', 'endorse', 'endorsement', 'endure', 'enforce', 'enforcement', 'engagement', 'engaging', 'enrich', 'enroll', 'ensue', 'enterprise', 'enthusiast', 'entitle', 'entity', 'epidemic', 'equality', 'equation', 'erect', 'escalate', 'essence', 'establishment', 'eternal', 'evacuate', 'evoke', 'evolutionary', 'exaggerate', 'excellence', 'exceptional', 'excess', 'exclusion', 'exclusive', 'exclusively', 'execute', 'execution', 'exert', 'exile', 'expenditure', 'experimental', 'expire', 'explicit', 'explicitly', 'exploitation', 'explosive', 'extract', 'extremist', 'facilitate', 'faction', 'fade', 'fairness', 'fatal', 'fate', 'favorable', 'feat', 'felony', 'feminist', 'fiber', 'fierce', 'filmmaker', 'filter', 'fine', 'firearm', 'fiscal', 'fit', 'flaw', 'flawed', 'flee', 'fleet', 'flesh', 'flexibility', 'flourish', 'fluid', 'footage', 'foreigner', 'forge', 'formula', 'formulate', 'forth', 'forthcoming', 'foster', 'fragile', 'franchise', 'frankly', 'freshman', 'frustrated', 'frustrating', 'frustration', 'functional', 'fundraising', 'funeral', 'gambling', 'gathering', 'gaze', 'gear', 'generic', 'genocide', 'gig', 'glance', 'glimpse', 'glorious', 'glory', 'governance', 'grace', 'grasp', 'grave', 'grave', 'gravity', 'grid', 'grief', 'grin', 'grind', 'grip', 'gross', 'guerrilla', 'guidance', 'guilt', 'gut', 'hail', 'halfway', 'halt', 'handful', 'handling', 'handy', 'harassment', 'hardware', 'harmony', 'harsh', 'harvest', 'hatred', 'haunt', 'hazard', 'heighten', 'heritage', 'hierarchy', 'high-profile', 'hint', 'homeland', 'hook', 'hopeful', 'horizon', 'horn', 'hostage', 'hostile', 'hostility', 'humanitarian', 'humanity', 'humble', 'hydrogen', 'identification', 'ideological', 'ideology', 'idiot', 'ignorance', 'imagery', 'immense', 'imminent', 'implementation', 'imprison', 'inability', 'inadequate', 'inappropriate', 'incarcerate', 'incarceration', 'incidence', 'inclined', 'inclusion', 'incur', 'indicator', 'indictment', 'indigenous', 'induce', 'indulge', 'inequality', 'infamous', 'infant', 'infect', 'inflict', 'influential', 'inherent', 'inhibit', 'initiate', 'inject', 'injection', 'injustice', 'inmate', 'inquire', 'insertion', 'insider', 'inspect', 'inspection', 'inspiration', 'instinct', 'institutional', 'instruct', 'instrumental', 'insufficient', 'insult', 'intact', 'intake', 'integral', 'integrated', 'integration', 'integrity', 'intellectual', 'intensify', 'intensity', 'intensive', 'intent', 'interactive', 'interface', 'interfere', 'interference', 'interim', 'interior', 'intermediate', 'intersection', 'intervene', 'intervention', 'intimate', 'intriguing', 'inventory', 'investigator', 'invisible', 'invoke', 'involvement', 'ironic', 'ironically', 'irony', 'irrelevant', 'isolation', 'judicial', 'jurisdiction', 'just', 'justification', 'keen', 'kidnap', 'kidney', 'kingdom', 'landlord', 'landmark', 'lap', 'large-scale', 'laser', 'latter', 'lawmaker', 'lawn', 'lawsuit', 'layout', 'leak', 'leap', 'legacy', 'legendary', 'legislation', 'legislative', 'legislature', 'legitimate', 'lengthy', 'lesbian', 'lesser', 'lethal', 'liable', 'liberal', 'liberation', 'liberty', 'lifelong', 'likelihood', 'limb', 'linear', 'lineup', 'linger', 'listing', 'liter', 'literacy', 'liver', 'lobby', 'log', 'logic', 'long-standing', 'longtime', 'loom', 'loop', 'loyalty', 'machinery', 'magical', 'magnetic', 'magnificent', 'magnitude', 'mainland', 'mainstream', 'maintenance', 'mandate', 'mandatory', 'manifest', 'manipulate', 'manipulation', 'manuscript', 'march', 'marginal', 'marine', 'marketplace', 'mask', 'massacre', 'mathematical', 'mature', 'maximize', 'meaningful', 'meantime', 'medieval', 'meditation', 'melody', 'memo', 'memoir', 'memorial', 'mentor', 'merchant', 'mercy', 'mere', 'merely', 'merge', 'merger', 'merit', 'methodology', 'midst', 'migration', 'militant', 'militia', 'mill', 'minimal', 'minimize', 'mining', 'ministry', 'minute', 'miracle', 'misery', 'misleading', 'missile', 'mob', 'mobile', 'mobility', 'mobilize', 'moderate', 'modification', 'module', 'momentum', 'monk', 'monopoly', 'morality', 'motive', 'municipal', 'mutual', 'namely', 'nationwide', 'naval', 'neglect', 'neighboring', 'nest', 'net', 'newsletter', 'niche', 'noble', 'nod', 'nominate', 'nomination', 'nominee', 'nonetheless', 'nonprofit', 'nonsense', 'noon', 'notable', 'notably', 'notify', 'notorious', 'novel', 'nursery', 'objection', 'oblige', 'obsess', 'obsession', 'occasional', 'occurrence', 'odds', 'offering', 'offspring', 'operational', 'opt', 'optical', 'optimism', 'oral', 'organizational', 'orientation', 'originate', 'outbreak', 'outing', 'outlet', 'outlook', 'outrage', 'outsider', 'overlook', 'overly', 'oversee', 'overturn', 'overwhelm', 'overwhelming', 'pad', 'parameter', 'parental', 'parliament', 'partial', 'partially', 'passing', 'passive', 'pastor', 'patent', 'pathway', 'patrol', 'patron', 'peak', 'peasant', 'peculiar', 'pension', 'persist', 'persistent', 'personnel', 'petition', 'philosopher', 'philosophical', 'pioneer', 'pipeline', 'pirate', 'pit', 'plea', 'plead', 'pledge', 'plug', 'plunge', 'pole', 'poll', 'pond', 'pop', 'portfolio', 'portray', 'postpone', 'postwar', 'practitioner', 'preach', 'precedent', 'precision', 'predator', 'predecessor', 'predominantly', 'pregnancy', 'prejudice', 'preliminary', 'premier', 'premise', 'premium', 'prescribe', 'prescription', 'presently', 'preservation', 'preside', 'presidency', 'prestigious', 'presumably', 'presume', 'prevail', 'prevalence', 'prevention', 'prey', 'privatization', 'privilege', 'probe', 'problematic', 'proceeding', 'proceeds', 'processing', 'processor', 'proclaim', 'productive', 'productivity', 'profitable', 'profound', 'projection', 'prominent', 'pronounced', 'propaganda', 'proposition', 'prosecute', 'prosecution', 'prosecutor', 'prospective', 'prosperity', 'protective', 'protocol', 'province', 'provincial', 'provision', 'provoke', 'psychiatric', 'pulse', 'pump', 'punch', 'query', 'quest', 'quota', 'radar', 'radical', 'rage', 'raid', 'rally', 'ranking', 'rape', 'ratio', 'rational', 'ray', 'readily', 'realization', 'realm', 'rear', 'reasoning', 'reassure', 'rebel', 'rebellion', 'recipient', 'reconstruction', 'recount', 'recruitment', 'referendum', 'reflection', 'reform', 'refuge', 'refusal', 'regain', 'regardless', 'regime', 'regulator', 'regulatory', 'rehabilitation', 'reign', 'rejection', 'relevance', 'reliability', 'reluctant', 'remainder', 'remains', 'remedy', 'reminder', 'removal', 'render', 'renew', 'renowned', 'replacement', 'reportedly', 'representation', 'reproduce', 'reproduction', 'republic', 'resemble', 'reside', 'residence', 'residential', 'residue', 'resignation', 'resistance', 'respective', 'respectively', 'restoration', 'restraint', 'resume', 'retreat', 'retrieve', 'revelation', 'revenge', 'reverse', 'revival', 'revive', 'revolutionary', 'rhetoric', 'rifle', 'riot', 'rip', 'ritual', 'robust', 'rock', 'rod', 'rookie', 'roster', 'rotate', 'rotation', 'ruling', 'rumor', 'sacred', 'sacrifice', 'saint', 'sake', 'sanction', 'say', 'scattered', 'scope', 'screw', 'scrutiny', 'seal', 'secondly', 'secular', 'seemingly', 'segment', 'seize', 'seldom', 'selective', 'sensation', 'sensitivity', 'sentiment', 'separation', 'serial', 'settlement', 'setup', 'sexuality', 'shareholder', 'shatter', 'shed', 'sheer', 'shipping', 'shoot', 'shrink', 'shrug', 'sigh', 'simulate', 'simulation', 'simultaneously', 'sin', 'situated', 'skeptical', 'sketch', 'skip', 'slam', 'slap', 'slash', 'slavery', 'slot', 'smash', 'snap', 'soak', 'soar', 'socialist', 'sole', 'solely', 'solidarity', 'solo', 'sophomore', 'sound', 'sovereignty', 'spam', 'span', 'spare', 'spark', 'specialized', 'specification', 'specimen', 'spectacle', 'spectrum', 'spell', 'sphere', 'spin', 'spine', 'spotlight', 'spouse', 'spy', 'squad', 'squeeze', 'stab', 'stability', 'stabilize', 'stake', 'standing', 'stark', 'statistical', 'steer', 'stem', 'stereotype', 'stimulus', 'stir', 'storage', 'straightforward', 'strain', 'strand', 'strategic', 'striking', 'strip', 'strip', 'strive', 'structural', 'stumble', 'stun', 'submission', 'subscriber', 'subscription', 'subsidy', 'substantial', 'substantially', 'substitute', 'substitution', 'subtle', 'suburban', 'succession', 'successive', 'successor', 'suck', 'sue', 'suicide', 'suite', 'summit', 'superb', 'superintendent', 'superior', 'supervise', 'supervision', 'supervisor', 'supplement', 'supportive', 'supposedly', 'suppress', 'supreme', 'surge', 'surgical', 'surplus', 'surrender', 'surveillance', 'suspension', 'suspicion', 'suspicious', 'sustain', 'swing', 'sword', 'symbolic', 'syndrome', 'synthesis', 'systematic', 'tackle', 'tactic', 'tactical', 'taxpayer', 'tempt', 'tenant', 'tender', 'tenure', 'terminal', 'terminate', 'terrain', 'terrific', 'testify', 'testimony', 'texture', 'thankfully', 'theatrical', 'theology', 'theoretical', 'thereafter', 'thereby', 'thoughtful', 'thought-provoking', 'thread', 'threshold', 'thrilled', 'thrive', 'tide', 'tighten', 'timber', 'timely', 'tobacco', 'tolerance', 'tolerate', 'toll', 'top', 'torture', 'toss', 'total', 'toxic', 'trace', 'trademark', 'trail', 'trailer', 'transaction', 'transcript', 'transformation', 'transit', 'transmission', 'transparency', 'transparent', 'trauma', 'treaty', 'tremendous', 'tribal', 'tribute', 'trigger', 'trio', 'triumph', 'trophy', 'troubled', 'trustee', 'tuition', 'tumor', 'turnout', 'turnover', 'twist', 'unconstitutional', 'undergraduate', 'underlying', 'undermine', 'undoubtedly', 'unify', 'unprecedented', 'unveil', 'upcoming', 'upgrade', 'uphold', 'utility', 'utilize', 'utterly', 'vacuum', 'vague', 'validity', 'vanish', 'variable', 'varied', 'vein', 'venture', 'verbal', 'verdict', 'verify', 'verse', 'versus', 'vessel', 'veteran', 'viable', 'vibrant', 'vice', 'vicious', 'violate', 'violation', 'virtue', 'vocal', 'vow', 'vulnerability', 'vulnerable', 'ward', 'warehouse', 'warfare', 'warrant', 'warrior', 'weaken', 'weave', 'weed', 'well', 'well-being', 'whatsoever', 'whereby', 'whip', 'wholly', 'widen', 'widow', 'width', 'willingness', 'wipe', 'wit', 'withdrawal', 'workout', 'worship', 'worthwhile', 'worthy', 'yell', 'yield']

In [26]:
reader = PdfReader('The_Oxford_3000_by_CEFR_level.pdf')
  
text = []
for x in range(len(reader.pages)):
    page = reader.pages[x]    
    page_text = page.extract_text().split('\n')
    for x in page_text:
        text.append(x)
a1 = []
a2 = []
b1 = []
b2 = []
a1_trigger = 0
a2_trigger = 0
b1_trigger = 0
b2_trigger = 0

while(len(text) > 0):
    x = text.pop(0)
    if a1_trigger == 1:
        a1.append(x)
    if x == 'A1':
        a1_trigger = 1
        a2_trigger = 0
        b1_trigger = 0
        b2_trigger = 0
    if a2_trigger == 1:
        a2.append(x)
    if x == 'A2':
        a1_trigger = 0
        a2_trigger = 1
        b1_trigger = 0
        b2_trigger = 0
    if b1_trigger == 1:
        b1.append(x)
    if x == 'B1':
        a1_trigger = 0
        a2_trigger = 0
        b1_trigger = 1
        b2_trigger = 0
    if b2_trigger == 1:
        b2.append(x)
    if x == 'B2':
        a1_trigger = 0
        a2_trigger = 0
        b1_trigger = 0
        b2_trigger = 1

ox3_words_a1 = []
ox3_words_a2 = []
ox3_words_b1 = []
ox3_words_b2 = []
for x in a1:
    ox3_words_a1.append(x.split()[0])
for x in a2:
    ox3_words_a2.append(x.split()[0])
for x in b1:
    ox3_words_b1.append(x.split()[0])
for x in b2:
    ox3_words_b2.append(x.split()[0])

In [27]:
ox3_words_a1 = ['about', 'above', 'across', 'action', 'activity', 'actor', 'actress', 'add', 'address', 'adult', 'advice', 'afraid', 'after', 'afternoon', 'again', 'age', 'ago', 'agree', 'air', 'airport', 'all', 'also', 'always', 'amazing', 'and', 'angry', 'animal', 'another', 'answer', 'any', 'anyone', 'anything', 'apartment', 'apple', 'April', 'area', 'arm', 'around', 'arrive', 'art', 'article', 'artist', 'as', 'ask', 'at', 'August', 'aunt', 'autumn', 'away', 'baby', 'back', 'bad', 'bag', 'ball', 'banana', 'band', 'bank', 'bath', 'bathroom', 'be', 'beach', 'beautiful', 'because', 'become', 'bed', 'bedroom', 'beer', 'before', 'begin', 'beginning', 'behind', 'believe', 'below', 'best', 'better', 'between', 'bicycle', 'big', 'bike', 'bill', 'bird', 'birthday', 'black', 'blog', 'blonde', 'blue', 'boat', 'body', 'book', 'boot', 'bored', 'boring', 'born', 'both', 'bottle', 'box', 'boy', 'boyfriend', 'bread', 'break', 'breakfast', 'bring', 'brother', 'brown', 'build', 'building', 'bus', 'business', 'busy', 'but', 'butter', 'buy', 'by', 'bye', 'cafe', 'cake', 'call', 'camera', 'can', 'cannot', 'capital', 'car', 'card', 'career', 'carrot', 'carry', 'cat', 'CD', 'cent', 'centre', 'century', 'chair', 'change', 'chart', 'cheap', 'check', 'cheese', 'chicken', 'child', 'chocolate', 'choose', 'cinema', 'city', 'class', 'classroom', 'clean', 'climb', 'clock', 'close', 'clothes', 'club', 'coat', 'coffee', 'cold', 'college', 'colour', 'come', 'common', 'company', 'compare', 'complete', 'computer', 'concert', 'conversation', 'cook', 'cooking', 'cool', 'correct', 'cost', 'could', 'country', 'course', 'cousin', 'cow', 'cream', 'create', 'culture', 'cup', 'customer', 'cut', 'dad', 'dance', 'dancer', 'dancing', 'dangerous', 'dark', 'date', 'daughter', 'day', 'dear', 'December', 'decide', 'delicious', 'describe', 'description', 'design', 'desk', 'detail', 'dialogue', 'dictionary', 'die', 'diet', 'difference', 'different', 'difficult', 'dinner', 'dirty', 'discuss', 'dish', 'do', 'doctor', 'dog', 'dollar', 'door', 'down', 'downstairs', 'draw', 'dress', 'drink', 'drive', 'driver', 'during', 'DVD', 'each', 'ear', 'early', 'east', 'easy', 'eat', 'egg', 'eight', 'eighteen', 'eighty', 'elephant', 'eleven', 'else', 'email', 'end', 'enjoy', 'enough', 'euro', 'even', 'evening', 'event', 'ever', 'every', 'everybody', 'everyone', 'everything', 'exam', 'example', 'excited', 'exciting', 'exercise', 'expensive', 'explain', 'extra', 'eye', 'face', 'fact', 'fall', 'false', 'family', 'famous', 'fantastic', 'far', 'farm', 'farmer', 'fast', 'fat', 'father', 'favourite', 'February', 'feel', 'feeling', 'festival', 'few', 'fifteen', 'fifth', 'fifty', 'fill', 'film', 'final', 'find', 'fine', 'finish', 'fire', 'first', 'fish', 'five', 'flat', 'flight', 'floor', 'flower', 'fly', 'follow', 'food', 'foot', 'football', 'for', 'forget', 'form', 'forty', 'four', 'fourteen', 'fourth', 'free', 'Friday', 'friend', 'friendly', 'from', 'front', 'fruit', 'full', 'fun', 'funny', 'future', 'game', 'garden', 'geography', 'get', 'girl', 'girlfriend', 'give', 'glass', 'go', 'good', 'goodbye', 'grandfather', 'grandmother', 'grandparent', 'great', 'green', 'grey', 'group', 'grow', 'guess', 'guitar', 'gym', 'hair', 'half', 'hand', 'happen', 'happy', 'hard', 'hat', 'hate', 'have', 'have', 'he', 'head', 'health', 'healthy', 'hear', 'hello', 'help', 'her', 'here', 'hey', 'hi', 'high', 'him', 'his', 'history', 'hobby', 'holiday', 'home', 'homework', 'hope', 'horse', 'hospital', 'hot', 'hotel', 'hour', 'house', 'how', 'however', 'hundred', 'hungry', 'husband', 'I', 'ice', 'ice', 'idea', 'if', 'imagine', 'important', 'improve', 'in', 'include', 'information', 'interest', 'interested', 'interesting', 'internet', 'interview', 'into', 'introduce', 'island', 'it', 'its', 'jacket', 'January', 'jeans', 'job', 'join', 'journey', 'juice', 'July', 'June', 'just', 'keep', 'key', 'kilometre', 'kind', 'kitchen', 'know', 'land', 'language', 'large', 'last', 'late', 'later', 'laugh', 'learn', 'leave', 'left', 'leg', 'lesson', 'let', 'letter', 'library', 'lie', 'life', 'light', 'like', 'like', 'line', 'lion', 'list', 'listen', 'little', 'live', 'local', 'long', 'look', 'lose', 'lot', 'love', 'lunch', 'machine', 'magazine', 'main', 'make', 'man', 'many', 'map', 'March', 'market', 'married', 'match', 'May', 'maybe', 'me', 'meal', 'mean', 'meaning', 'meat', 'meet', 'meeting', 'member', 'menu', 'message', 'metre', 'midnight', 'mile', 'milk', 'million', 'minute', 'miss', 'mistake', 'model', 'modern', 'moment', 'Monday', 'money', 'month', 'more', 'morning', 'most', 'mother', 'mountain', 'mouse', 'mouth', 'move', 'movie', 'much', 'mum', 'museum', 'music', 'must', 'my', 'name', 'natural', 'near', 'need', 'negative', 'neighbour', 'never', 'new', 'news', 'newspaper', 'next', 'next', 'nice', 'night', 'nine', 'nineteen', 'ninety', 'no', 'no one', 'nobody', 'north', 'nose', 'not', 'note', 'nothing', 'November', 'now', 'number', 'nurse', 'object', 'o’clock', 'October', 'of', 'off', 'office', 'often', 'oh', 'OK', 'old', 'on', 'once', 'one', 'onion', 'online', 'only', 'open', 'opinion', 'opposite', 'or', 'orange', 'order', 'other', 'our', 'out', 'outside', 'over', 'own', 'page', 'paint', 'painting', 'pair', 'paper', 'paragraph', 'parent', 'park', 'part', 'partner', 'party', 'passport', 'past', 'pay', 'pen', 'pencil', 'people', 'pepper', 'perfect', 'period', 'person', 'personal', 'phone', 'photo', 'photograph', 'phrase', 'piano', 'picture', 'piece', 'pig', 'pink', 'place', 'plan', 'plane', 'plant', 'play', 'player', 'please', 'point', 'police', 'policeman', 'pool', 'poor', 'popular', 'positive', 'possible', 'post', 'potato', 'pound', 'practice', 'practise', 'prefer', 'prepare', 'present', 'pretty', 'price', 'probably', 'problem', 'product', 'programme', 'project', 'purple', 'put', 'quarter', 'question', 'quick', 'quickly', 'quiet', 'quite', 'radio', 'rain', 'read', 'reader', 'reading', 'ready', 'real', 'really', 'reason', 'red', 'relax', 'remember', 'repeat', 'report', 'restaurant', 'result', 'return', 'rice', 'rich', 'ride', 'right', 'river', 'road', 'room', 'routine', 'rule', 'run', 'sad', 'salad', 'salt', 'same', 'sandwich', 'Saturday', 'say', 'school', 'science', 'scientist', 'sea', 'second', 'section', 'see', 'sell', 'send', 'sentence', 'September', 'seven', 'seventeen', 'seventy', 'share', 'she', 'sheep', 'shirt', 'shoe', 'shop', 'shopping', 'short', 'should', 'show', 'shower', 'sick', 'similar', 'sing', 'singer', 'sister', 'sit', 'situation', 'six', 'sixteen', 'sixty', 'skill', 'skirt', 'sleep', 'slow', 'small', 'snake', 'snow', 'so', 'some', 'somebody', 'someone', 'something', 'sometimes', 'son', 'song', 'soon', 'sorry', 'sound', 'soup', 'south', 'space', 'speak', 'special', 'spell', 'spelling', 'spend', 'sport', 'spring', 'stand', 'star', 'start', 'statement', 'station', 'stay', 'still', 'stop', 'story', 'street', 'strong', 'student', 'study', 'style', 'subject', 'success', 'sugar', 'summer', 'sun', 'Sunday', 'supermarket', 'sure', 'sweater', 'swim', 'swimming', 'table', 'take', 'talk', 'tall', 'taxi', 'tea', 'teach', 'teacher', 'team', 'teenager', 'telephone', 'television', 'tell', 'ten', 'tennis', 'terrible', 'test', 'text', 'than', 'thank', 'thanks', 'that', 'the', 'theatre', 'their', 'them', 'then', 'there', 'they', 'thing', 'think', 'third', 'thirsty', 'thirteen', 'thirty', 'this', 'thousand', 'three', 'through', 'Thursday', 'ticket', 'time', 'tired', 'title', 'to', 'today', 'together', 'toilet', 'tomato', 'tomorrow', 'tonight', 'too', 'tooth', 'topic', 'tourist', 'town', 'traffic', 'train', 'travel', 'tree', 'trip', 'trousers', 'true', 'try', 'T-shirt', 'Tuesday', 'turn', 'TV', 'twelve', 'twenty', 'twice', 'two', 'type', 'umbrella', 'uncle', 'under', 'understand', 'university', 'until', 'up', 'upstairs', 'us', 'use', 'useful', 'usually', 'vacation', 'vegetable', 'very', 'video', 'village', 'visit', 'visitor', 'wait', 'waiter', 'wake', 'walk', 'wall', 'want', 'warm', 'wash', 'watch', 'water', 'way', 'we', 'wear', 'weather', 'website', 'Wednesday', 'week', 'weekend', 'welcome', 'well', 'west', 'what', 'when', 'where', 'which', 'white', 'who', 'why', 'wife', 'will', 'win', 'window', 'wine', 'winter', 'with', 'without', 'woman', 'wonderful', 'word', 'work', 'worker', 'world', 'would', 'write', 'writer', 'writing', 'wrong', 'yeah', 'year', 'yellow', 'yes', 'yesterday', 'you', 'young', 'your', 'yourself']

In [28]:
ox3_words_a2 = ['ability', 'able', 'abroad', 'accept', 'accident', 'according', 'achieve', 'act', 'active', 'actually', 'adult', 'advantage', 'adventure', 'advertise', 'advertisement', 'advertising', 'affect', 'after', 'against', 'ah', 'airline', 'alive', 'all', 'all', 'allow', 'almost', 'alone', 'along', 'already', 'alternative', 'although', 'among', 'amount', 'ancient', 'ankle', 'any', 'anybody', 'any', 'anyway', 'anywhere', 'app', 'appear', 'appearance', 'apply', 'architect', 'architecture', 'argue', 'argument', 'army', 'arrange', 'arrangement', 'as', 'asleep', 'assistant', 'athlete', 'attack', 'attend', 'attention', 'attractive', 'audience', 'author', 'available', 'average', 'avoid', 'award', 'awful', 'back', 'background', 'badly', 'bar', 'baseball', 'based', 'basketball', 'bean', 'bear', 'beat', 'beef', 'before', 'behave', 'behaviour', 'belong', 'belt', 'benefit', 'best', 'better', 'between', 'billion', 'bin', 'biology', 'birth', 'biscuit', 'bit', 'blank', 'blood', 'blow', 'board', 'boil', 'bone', 'book', 'borrow', 'boss', 'bottom', 'bowl', 'brain', 'bridge', 'bright', 'brilliant', 'broken', 'brush', 'burn', 'businessman', 'button', 'camp', 'camping', 'can', 'care', 'careful', 'carefully', 'carpet', 'cartoon', 'case', 'cash', 'castle', 'catch', 'cause', 'celebrate', 'celebrity', 'certain', 'certainly', 'chance', 'character', 'charity', 'chat', 'check', 'chef', 'chemistry', 'chip', 'choice', 'church', 'cigarette', 'circle', 'classical', 'clear', 'clearly', 'clever', 'climate', 'close', 'closed', 'clothing', 'cloud', 'coach', 'coast', 'code', 'colleague', 'collect', 'column', 'comedy', 'comfortable', 'comment', 'communicate', 'community', 'compete', 'competition', 'complain', 'completely', 'condition', 'conference', 'connect', 'connected', 'consider', 'contain', 'context', 'continent', 'continue', 'control', 'cook', 'cooker', 'copy', 'corner', 'correctly', 'count', 'couple', 'cover', 'crazy', 'creative', 'credit', 'crime', 'criminal', 'cross', 'crowd', 'crowded', 'cry', 'cupboard', 'curly', 'cycle', 'daily', 'danger', 'dark', 'data', 'dead', 'deal', 'dear', 'death', 'decision', 'deep', 'definitely', 'degree', 'dentist', 'department', 'depend', 'desert', 'designer', 'destroy', 'detective', 'develop', 'device', 'diary', 'differently', 'digital', 'direct', 'direction', 'director', 'disagree', 'disappear', 'disaster', 'discover', 'discovery', 'discussion', 'disease', 'distance', 'divorced', 'document', 'double', 'download', 'downstairs', 'drama', 'drawing', 'dream', 'drive', 'driving', 'drop', 'drug', 'dry', 'earn', 'earth', 'easily', 'education', 'effect', 'either', 'electric', 'electrical', 'electricity', 'electronic', 'employ', 'employee', 'employer', 'empty', 'ending', 'energy', 'engine', 'engineer', 'enormous', 'enter', 'environment', 'equipment', 'error', 'especially', 'essay', 'everyday', 'everywhere', 'evidence', 'exact', 'exactly', 'excellent', 'except', 'exist', 'expect', 'experience', 'experiment', 'expert', 'explanation', 'express', 'expression', 'extreme', 'extremely', 'factor', 'factory', 'fail', 'fair', 'fall', 'fan', 'farm', 'farming', 'fashion', 'fat', 'fear', 'feature', 'feed', 'female', 'fiction', 'field', 'fight', 'figure', 'film', 'final', 'finally', 'finger', 'finish', 'first', 'firstly', 'fish', 'fishing', 'fit', 'fix', 'flat', 'flu', 'fly', 'flying', 'focus', 'following', 'foreign', 'forest', 'fork', 'formal', 'fortunately', 'forward', 'free', 'fresh', 'fridge', 'frog', 'fun', 'furniture', 'further', 'future', 'gallery', 'gap', 'gas', 'gate', 'general', 'gift', 'goal', 'god', 'gold', 'golf', 'good', 'government', 'grass', 'greet', 'ground', 'guest', 'guide', 'gun', 'guy', 'habit', 'half', 'hall', 'happily', 'have', 'headache', 'heart', 'heat', 'heavy', 'height', 'helpful', 'hero', 'hers', 'herself', 'hide', 'high', 'hill', 'himself', 'his', 'hit', 'hockey', 'hold', 'hole', 'home', 'hope', 'huge', 'human', 'hurt', 'ideal', 'identify', 'ill', 'illness', 'image', 'immediately', 'impossible', 'included', 'including', 'increase', 'incredible', 'independent', 'individual', 'industry', 'informal', 'injury', 'insect', 'inside', 'instead', 'instruction', 'instructor', 'instrument', 'intelligent', 'international', 'introduction', 'invent', 'invention', 'invitation', 'invite', 'involve', 'item', 'itself', 'jam', 'jazz', 'jewellery', 'joke', 'journalist', 'jump', 'kid', 'kill', 'king', 'knee', 'knife', 'knock', 'knowledge', 'lab', 'lady', 'lake', 'lamp', 'land', 'laptop', 'last', 'last', 'later', 'laughter', 'law', 'lawyer', 'lazy', 'lead', 'leader', 'learning', 'least', 'lecture', 'lemon', 'lend', 'less', 'level', 'lifestyle', 'lift', 'light', 'light', 'likely', 'link', 'listener', 'little', 'lock', 'look', 'lorry', 'lost', 'loud', 'loudly', 'lovely', 'low', 'luck', 'lucky', 'mail', 'major', 'male', 'manage', 'manager', 'manner', 'mark', 'marry', 'material', 'mathematics', 'maths', 'matter', 'may', 'media', 'medical', 'medicine', 'memory', 'mention', 'metal', 'method', 'middle', 'might', 'mind', 'mine', 'mirror', 'missing', 'mobile', 'monkey', 'moon', 'mostly', 'motorcycle', 'movement', 'musical', 'musician', 'myself', 'narrow', 'national', 'nature', 'nearly', 'necessary', 'neck', 'need', 'neither', 'nervous', 'network', 'noise', 'noisy', 'none', 'normal', 'normally', 'notice', 'novel', 'nowhere', 'number', 'nut', 'ocean', 'offer', 'officer', 'oil', 'onto', 'opportunity', 'option', 'ordinary', 'organization', 'organize', 'original', 'ourselves', 'outside', 'oven', 'own', 'owner', 'pack', 'pain', 'painter', 'palace', 'pants', 'parking', 'particular', 'pass', 'passenger', 'past', 'patient', 'pattern', 'pay', 'peace', 'penny', 'per', 'per', 'perform', 'perhaps', 'permission', 'personality', 'pet', 'petrol', 'photograph', 'physical', 'physics', 'pick', 'pilot', 'planet', 'plant', 'plastic', 'plate', 'platform', 'please', 'pleased', 'pocket', 'polite', 'pollution', 'pop', 'population', 'position', 'possession', 'possibility', 'poster', 'power', 'predict', 'present', 'president', 'prevent', 'print', 'printer', 'prison', 'prize', 'process', 'produce', 'professional', 'professor', 'profile', 'program', 'progress', 'promise', 'pronounce', 'protect', 'provide', 'pub', 'public', 'publish', 'pull', 'purpose', 'push', 'quality', 'quantity', 'queen', 'question', 'quietly', 'race', 'railway', 'raise', 'rate', 'rather', 'reach', 'react', 'realize', 'receive', 'recent', 'recently', 'reception', 'recipe', 'recognize', 'recommend', 'record', 'recording', 'recycle', 'reduce', 'refer', 'refuse', 'region', 'regular', 'relationship', 'remove', 'repair', 'replace', 'reply', 'report', 'reporter', 'request', 'research', 'researcher', 'respond', 'response', 'rest', 'rest', 'review', 'ride', 'ring', 'rise', 'rock', 'rock', 'role', 'roof', 'round', 'route', 'rubbish', 'rude', 'run', 'runner', 'running', 'sadly', 'safe', 'sail', 'sailing', 'salary', 'sale', 'sauce', 'save', 'scared', 'scary', 'scene', 'schedule', 'score', 'screen', 'search', 'season', 'seat', 'second', 'secondly', 'secret', 'secretary', 'seem', 'sense', 'separate', 'series', 'serious', 'serve', 'service', 'several', 'shake', 'shall', 'shape', 'sheet', 'ship', 'shoulder', 'shout', 'shut', 'side', 'sign', 'silver', 'simple', 'since', 'singing', 'single', 'sir', 'site', 'size', 'ski', 'skiing', 'skin', 'sky', 'sleep', 'slowly', 'smartphone', 'smell', 'smile', 'smoke', 'smoking', 'soap', 'soccer', 'social', 'society', 'sock', 'soft', 'soldier', 'solution', 'solve', 'somewhere', 'sort', 'source', 'speaker', 'specific', 'speech', 'speed', 'spider', 'spoon', 'square', 'stage', 'stair', 'stamp', 'star', 'start', 'state', 'stay', 'steal', 'step', 'stomach', 'stone', 'store', 'storm', 'straight', 'strange', 'strategy', 'stress', 'structure', 'stupid', 'succeed', 'successful', 'such', 'suddenly', 'suggest', 'suggestion', 'suit', 'support', 'suppose', 'sure', 'surprise', 'surprised', 'surprising', 'survey', 'sweet', 'symbol', 'system', 'tablet', 'talk', 'target', 'task', 'taste', 'teaching', 'technology', 'teenage', 'temperature', 'term', 'text', 'themselves', 'thick', 'thief', 'thin', 'thinking', 'third', 'thought', 'throw', 'tidy', 'tie', 'tip', 'tool', 'top', 'touch', 'tour', 'tourism', 'towards', 'towel', 'tower', 'toy', 'track', 'tradition', 'traditional', 'train', 'trainer', 'training', 'transport', 'traveller', 'trouble', 'truck', 'twin', 'typical', 'underground', 'understanding', 'unfortunately', 'unhappy', 'uniform', 'unit', 'united', 'unusual', 'upstairs', 'use', 'used', 'user', 'usual', 'valley', 'van', 'variety', 'vehicle', 'view', 'virus', 'voice', 'wait', 'war', 'wash', 'washing', 'wave', 'weak', 'web', 'wedding', 'weight', 'welcome', 'wet', 'wheel', 'while', 'whole', 'whose', 'wide', 'wild', 'wind', 'winner', 'wish', 'wood', 'wooden', 'working', 'worried', 'worry', 'worse', 'worst', 'wow', 'yet', 'yours', 'zero']

In [29]:
ox3_words_b1 = ['absolutely', 'academic', 'access', 'accommodation', 'account', 'achievement', 'act', 'ad', 'addition', 'admire', 'admit', 'advanced', 'advise', 'afford', 'age', 'aged', 'agent', 'agreement', 'ahead', 'aim', 'alarm', 'album', 'alcohol', 'alcoholic', 'alternative', 'amazed', 'ambition', 'ambitious', 'analyse', 'analysis', 'announce', 'announcement', 'annoy', 'annoyed', 'annoying', 'apart', 'apologize', 'application', 'appointment', 'appreciate', 'approximately', 'arrest', 'arrival', 'assignment', 'assist', 'atmosphere', 'attach', 'attitude', 'attract', 'attraction', 'authority', 'average', 'award', 'aware', 'backwards', 'bake', 'balance', 'ban', 'bank', 'base', 'basic', 'basis', 'battery', 'battle', 'beauty', 'bee', 'belief', 'bell', 'bend', 'benefit', 'better', 'bite', 'block', 'board', 'bomb', 'border', 'bother', 'branch', 'brand', 'brave', 'breath', 'breathe', 'breathing', 'bride', 'bubble', 'bury', 'by', 'calm', 'campaign', 'campus', 'candidate', 'cap', 'captain', 'careless', 'category', 'ceiling', 'celebration', 'central', 'centre', 'ceremony', 'chain', 'challenge', 'champion', 'channel', 'chapter', 'charge', 'cheap', 'cheat', 'cheerful', 'chemical', 'chest', 'childhood', 'claim', 'clause', 'clear', 'click', 'client', 'climb', 'close', 'cloth', 'clue', 'coach', 'coal', 'coin', 'collection', 'coloured', 'combine', 'comment', 'commercial', 'commit', 'communication', 'comparison', 'competitor', 'competitive', 'complaint', 'complex', 'concentrate', 'conclude', 'conclusion', 'confident', 'confirm', 'confuse', 'confused', 'connection', 'consequence', 'consist', 'consume', 'consumer', 'contact', 'container', 'content', 'continuous', 'contrast', 'convenient', 'convince', 'cool', 'costume', 'cottage', 'cotton', 'count', 'countryside', 'court', 'cover', 'covered', 'cream', 'criminal', 'cruel', 'cultural', 'currency', 'current', 'currently', 'curtain', 'custom', 'cut', 'daily', 'damage', 'deal', 'decade', 'decorate', 'deep', 'define', 'definite', 'definition', 'deliver', 'departure', 'despite', 'destination', 'determine', 'determined', 'development', 'diagram', 'diamond', 'difficulty', 'direct', 'directly', 'dirt', 'disadvantage', 'disappointed', 'disappointing', 'discount', 'dislike', 'divide', 'documentary', 'donate', 'double', 'doubt', 'dressed', 'drop', 'drum', 'drunk', 'due', 'dust', 'duty', 'earthquake', 'eastern', 'economic', 'economy', 'edge', 'editor', 'educate', 'educated', 'educational', 'effective', 'effectively', 'effort', 'election', 'element', 'embarrassed', 'embarrassing', 'emergency', 'emotion', 'employment', 'empty', 'encourage', 'enemy', 'engaged', 'engineering', 'entertain', 'entertainment', 'entrance', 'entry', 'environmental', 'episode', 'equal', 'equally', 'escape', 'essential', 'eventually', 'examine', 'except', 'exchange', 'excitement', 'exhibition', 'expand', 'expected', 'expedition', 'experience', 'experienced', 'experiment', 'explode', 'explore', 'explosion', 'export', 'extra', 'face', 'fairly', 'familiar', 'fancy', 'far', 'fascinating', 'fashionable', 'fasten', 'favour', 'fear', 'feature', 'fence', 'fighting', 'file', 'financial', 'fire', 'fitness', 'fixed', 'flag', 'flood', 'flour', 'flow', 'fold', 'folk', 'following', 'force', 'forever', 'frame', 'freeze', 'frequently', 'friendship', 'frighten', 'frightened', 'frightening', 'frozen', 'fry', 'fuel', 'function', 'fur', 'further', 'garage', 'gather', 'generally', 'generation', 'generous', 'gentle', 'gentleman', 'ghost', 'giant', 'glad', 'global', 'glove', 'go', 'goods', 'grade', 'graduate', 'grain', 'grateful', 'growth', 'guard', 'guilty', 'hand', 'hang', 'happiness', 'hardly', 'hate', 'head', 'headline', 'heating', 'heavily', 'helicopter', 'highlight', 'highly', 'hire', 'historic', 'historical', 'honest', 'horrible', 'horror', 'host', 'hunt', 'hurricane', 'hurry', 'identity', 'ignore', 'illegal', 'imaginary', 'immediate', 'immigrant', 'impact', 'import', 'importance', 'impression', 'impressive', 'improvement', 'incredibly', 'indeed', 'indicate', 'indirect', 'indoor', 'indoors', 'influence', 'ingredient', 'injure', 'injured', 'innocent', 'intelligence', 'intend', 'intention', 'invest', 'investigate', 'involved', 'iron', 'issue', 'IT', 'journal', 'judge', 'keen', 'key', 'keyboard', 'kick', 'killing', 'kind', 'kiss', 'knock', 'label', 'laboratory', 'lack', 'latest', 'lay', 'layer', 'lead', 'leading', 'leaf', 'leather', 'legal', 'leisure', 'length', 'level', 'lie', 'like', 'limit', 'lip', 'liquid', 'literature', 'live', 'living', 'local', 'locate', 'located', 'location', 'lonely', 'loss', 'luxury', 'mad', 'magic', 'mainly', 'mall', 'management', 'market', 'marketing', 'marriage', 'meanwhile', 'measure', 'medium', 'mental', 'mention', 'mess', 'mild', 'mine', 'mix', 'mixture', 'mood', 'move', 'mud', 'murder', 'muscle', 'musical', 'mystery', 'nail', 'narrative', 'nation', 'native', 'naturally', 'necessarily', 'need', 'needle', 'neighbourhood', 'neither', 'net', 'next', 'nor', 'normal', 'northern', 'note', 'now', 'nuclear', 'obvious', 'obviously', 'occasion', 'occur', 'odd', 'official', 'old-fashioned', 'once', 'operation', 'organized', 'organizer', 'original', 'originally', 'ought', 'ours', 'outdoor', 'outdoors', 'pack', 'package', 'painful', 'pale', 'pan', 'participate', 'particularly', 'pass', 'passion', 'path', 'payment', 'peaceful', 'percentage', 'perfectly', 'performance', 'personally', 'persuade', 'photographer', 'photography', 'pin', 'pipe', 'place', 'planning', 'pleasant', 'pleasure', 'plenty', 'plot', 'plus', 'poem', 'poet', 'poetry', 'point', 'poison', 'poisonous', 'policy', 'political', 'politician', 'politics', 'port', 'portrait', 'possibly', 'pot', 'pour', 'poverty', 'powder', 'powerful', 'practical', 'pray', 'prayer', 'prediction', 'prepared', 'presentation', 'press', 'pressure', 'pretend', 'previous', 'previously', 'priest', 'primary', 'prince', 'princess', 'printing', 'prisoner', 'private', 'producer', 'production', 'profession', 'profit', 'program', 'promote', 'proper', 'properly', 'property', 'protest', 'proud', 'prove', 'pull', 'punish', 'punishment', 'push', 'qualification', 'qualified', 'qualify', 'queue', 'quit', 'quotation', 'quote', 'race', 'racing', 'range', 'rare', 'rarely', 'reaction', 'reality', 'receipt', 'recommendation', 'reference', 'reflect', 'regularly', 'reject', 'relate', 'related', 'relation', 'relative', 'relaxed', 'relaxing', 'release', 'reliable', 'religion', 'religious', 'remain', 'remind', 'remote', 'rent', 'repair', 'repeat', 'repeated', 'represent', 'request', 'require', 'reservation', 'resource', 'respect', 'responsibility', 'responsible', 'result', 'retire', 'retired', 'revise', 'ring', 'rise', 'risk', 'robot', 'roll', 'romantic', 'rope', 'rough', 'row', 'royal', 'rugby', 'rule', 'safety', 'sail', 'sailor', 'sample', 'sand', 'scan', 'scientific', 'script', 'sculpture', 'secondary', 'security', 'seed', 'sensible', 'separate', 'seriously', 'servant', 'set', 'set', 'setting', 'sex', 'sexual', 'shake', 'share', 'sharp', 'shelf', 'shell', 'shift', 'shine', 'shiny', 'shoot', 'shy', 'sight', 'signal', 'silent', 'silly', 'similarity', 'similarly', 'simply', 'since', 'sink', 'slice', 'slightly', 'slow', 'smart', 'smooth', 'software', 'soil', 'solid', 'sort', 'southern', 'specifically', 'spending', 'spicy', 'spirit', 'spoken', 'spot', 'spread', 'spring', 'stadium', 'staff', 'standard', 'state', 'statistic', 'statue', 'stick', 'stick', 'still', 'store', 'stranger', 'strength', 'string', 'strongly', 'studio', 'stuff', 'substance', 'successfully', 'sudden', 'suffer', 'suit', 'suitable', 'summarize', 'summary', 'supply', 'supporter', 'surely', 'surface', 'survive', 'swim', 'switch', 'symptom', 'tail', 'talent', 'talented', 'tape', 'tax', 'technical', 'technique', 'tend', 'tent', 'that', 'theirs', 'theme', 'theory', 'therefore', 'this', 'though', 'throat', 'throughout', 'tight', 'till', 'tin', 'tiny', 'tip', 'toe', 'tongue', 'total', 'totally', 'touch', 'tour', 'trade', 'translate', 'translation', 'transport', 'treat', 'treatment', 'trend', 'trick', 'truth', 'tube', 'type', 'typically', 'tyre', 'ugly', 'unable', 'uncomfortable', 'underwear', 'unemployed', 'unemployment', 'unfair', 'union', 'unless', 'unlike', 'unlikely', 'unnecessary', 'unpleasant', 'update', 'upon', 'upset', 'used', 'used', 'valuable', 'value', 'various', 'version', 'victim', 'view', 'viewer', 'violent', 'volunteer', 'vote', 'warm', 'warn', 'warning', 'waste', 'water', 'wave', 'weapon', 'weigh', 'western', 'whatever', 'whenever', 'whether', 'while', 'whole', 'will', 'win', 'wing', 'within', 'wonder', 'wool', 'worldwide', 'worry', 'worse', 'worst', 'worth', 'written', 'wrong', 'yard', 'young', 'youth']

In [30]:
ox3_words_b2 = ['abandon', 'absolute', 'academic', 'acceptable', 'accompany', 'account', 'accurate', 'accuse', 'acknowledge', 'acquire', 'actual', 'adapt', 'additional', 'address', 'administration', 'adopt', 'advance', 'affair', 'afterwards', 'agency', 'agenda', 'aggressive', 'aid', 'aircraft', 'alarm', 'alter', 'amount', 'anger', 'angle', 'anniversary', 'annual', 'anxious', 'apparent', 'apparently', 'appeal', 'approach', 'appropriate', 'approval', 'approve', 'arise', 'armed', 'arms', 'artificial', 'artistic', 'ashamed', 'aspect', 'assess', 'assessment', 'associate', 'associated', 'association', 'assume', 'attempt', 'back', 'bacteria', 'bar', 'barrier', 'basically', 'battle', 'bear', 'beat', 'beg', 'being', 'bent', 'bet', 'beyond', 'bill', 'bitter', 'blame', 'blind', 'bond', 'border', 'breast', 'brief', 'broad', 'broadcast', 'budget', 'bullet', 'bunch', 'burn', 'bush', 'but', 'cable', 'calculate', 'cancel', 'cancer', 'capable', 'capacity', 'capture', 'cast', 'catch', 'cell', 'chain', 'chair', 'chairman', 'challenge', 'characteristic', 'chart', 'chief', 'circumstance', 'cite', 'citizen', 'civil', 'classic', 'close', 'closely', 'collapse', 'combination', 'comfort', 'command', 'commission', 'commitment', 'committee', 'commonly', 'complex', 'complicated', 'component', 'concentration', 'concept', 'concern', 'concerned', 'conduct', 'confidence', 'conflict', 'confusing', 'conscious', 'conservative', 'consideration', 'consistent', 'constant', 'constantly', 'construct', 'construction', 'contemporary', 'contest', 'contract', 'contribute', 'contribution', 'convert', 'convinced', 'core', 'corporate', 'council', 'county', 'courage', 'crash', 'creation', 'creature', 'credit', 'crew', 'crisis', 'criterion', 'critic', 'critical', 'criticism', 'criticize', 'crop', 'crucial', 'cry', 'cure', 'current', 'curve', 'curved', 'date', 'debate', 'debt', 'decent', 'declare', 'decline', 'decoration', 'decrease', 'deeply', 'defeat', 'defence', 'defend', 'delay', 'deliberate', 'deliberately', 'delight', 'delighted', 'delivery', 'demand', 'demonstrate', 'deny', 'depressed', 'depressing', 'depth', 'desert', 'deserve', 'desire', 'desperate', 'detail', 'detailed', 'detect', 'dig', 'disc', 'discipline', 'discount', 'dishonest', 'dismiss', 'display', 'distribute', 'distribution', 'district', 'divide', 'division', 'document', 'domestic', 'dominate', 'downwards', 'dozen', 'draft', 'drag', 'dramatic', 'edit', 'edition', 'efficient', 'elderly', 'elect', 'elsewhere', 'emerge', 'emotional', 'emphasis', 'emphasize', 'enable', 'encounter', 'engage', 'enhance', 'enquiry', 'ensure', 'enthusiasm', 'enthusiastic', 'entire', 'entirely', 'equal', 'establish', 'estate', 'estimate', 'ethical', 'evaluate', 'even', 'evil', 'examination', 'excuse', 'executive', 'existence', 'expectation', 'expense', 'exploration', 'expose', 'extend', 'extent', 'external', 'extraordinary', 'extreme', 'facility', 'failure', 'faith', 'fault', 'favour', 'feather', 'fee', 'feed', 'feedback', 'feel', 'fellow', 'figure', 'file', 'finance', 'finding', 'firm', 'fix', 'flame', 'flash', 'flexible', 'float', 'fold', 'folding', 'following', 'forgive', 'former', 'fortune', 'forward', 'found', 'free', 'freedom', 'frequency', 'fuel', 'fully', 'function', 'fund', 'fundamental', 'funding', 'furthermore', 'gain', 'gang', 'generate', 'genre', 'govern', 'grab', 'grade', 'gradually', 'grand', 'grant', 'guarantee', 'handle', 'harm', 'harmful', 'hearing', 'heaven', 'heel', 'hell', 'hesitate', 'high', 'hire', 'hold', 'hollow', 'holy', 'honour', 'host', 'house', 'household', 'housing', 'humorous', 'humour', 'hunt', 'hunting', 'hurt', 'ideal', 'illustrate', 'illustration', 'imagination', 'impatient', 'imply', 'impose', 'impress', 'impressed', 'inch', 'incident', 'income', 'increasingly', 'industrial', 'infection', 'inform', 'initial', 'initially', 'initiative', 'inner', 'insight', 'insist', 'inspire', 'install', 'instance', 'institute', 'institution', 'insurance', 'intended', 'intense', 'internal', 'interpret', 'interrupt', 'investigation', 'investment', 'issue', 'joy', 'judgement', 'junior', 'justice', 'justify', 'labour', 'landscape', 'largely', 'latest', 'launch', 'leadership', 'league', 'lean', 'leave', 'level', 'licence', 'limited', 'line', 'lively', 'load', 'loan', 'logical', 'long-term', 'loose', 'lord', 'low', 'lower', 'lung', 'maintain', 'majority', 'make', 'map', 'mass', 'massive', 'master', 'matching', 'material', 'maximum', 'means', 'measurement', 'medium', 'melt', 'military', 'mineral', 'minimum', 'minister', 'minor', 'minority', 'mission', 'mistake', 'mixed', 'model', 'modify', 'monitor', 'moral', 'motor', 'mount', 'multiple', 'multiply', 'mysterious', 'narrow', 'national', 'neat', 'negative', 'nerve', 'nevertheless', 'nightmare', 'notion', 'numerous', 'obey', 'object', 'objective', 'obligation', 'observation', 'observe', 'obtain', 'occasionally', 'offence', 'offend', 'offensive', 'official', 'opening', 'operate', 'opponent', 'oppose', 'opposed', 'opposition', 'organ', 'origin', 'otherwise', 'outcome', 'outer', 'outline', 'overall', 'owe', 'pace', 'package', 'panel', 'parliament', 'participant', 'partly', 'passage', 'patient', 'pension', 'permanent', 'permit', 'perspective', 'phase', 'phenomenon', 'philosophy', 'pick', 'picture', 'pile', 'pitch', 'plain', 'plot', 'plus', 'pointed', 'popularity', 'pose', 'position', 'positive', 'possess', 'potential', 'power', 'praise', 'pregnant', 'preparation', 'presence', 'preserve', 'price', 'prime', 'principle', 'print', 'priority', 'privacy', 'procedure', 'process', 'produce', 'professional', 'progress', 'project', 'proof', 'proposal', 'propose', 'prospect', 'protection', 'psychologist', 'psychology', 'publication', 'pupil', 'purchase', 'pure', 'pursue', 'range', 'rank', 'rapid', 'rapidly', 'rate', 'raw', 'reach', 'realistic', 'reasonable', 'recall', 'recover', 'reduction', 'regard', 'regional', 'register', 'regret', 'regulation', 'relatively', 'relevant', 'relief', 'rely', 'remark', 'representative', 'reputation', 'requirement', 'rescue', 'reserve', 'resident', 'resist', 'resolve', 'resort', 'retain', 'reveal', 'revolution', 'reward', 'rhythm', 'rid', 'root', 'round', 'routine', 'rub', 'rubber', 'rural', 'rush', 'sample', 'satellite', 'satisfied', 'satisfy', 'saving', 'scale', 'schedule', 'scheme', 'scream', 'screen', 'seat', 'sector', 'secure', 'seek', 'select', 'selection', 'self', 'senior', 'sense', 'sensitive', 'sentence', 'sequence', 'session', 'settle', 'severe', 'shade', 'shadow', 'shallow', 'shame', 'shape', 'shelter', 'shift', 'ship', 'shock', 'shocked', 'shooting', 'shot', 'significant', 'significantly', 'silence', 'silk', 'sincere', 'slave', 'slide', 'slight', 'slip', 'slope', 'solar', 'somewhat', 'soul', 'specialist', 'species', 'speed', 'spiritual', 'split', 'sponsor', 'spot', 'spread', 'stable', 'stage', 'stand', 'stare', 'status', 'steady', 'steel', 'steep', 'step', 'sticky', 'stiff', 'stock', 'stream', 'stretch', 'strict', 'strike', 'structure', 'struggle', 'stuff', 'subject', 'submit', 'sum', 'surgery', 'surround', 'surrounding', 'survey', 'suspect', 'swear', 'sweep', 'switch', 'sympathy', 'tale', 'tank', 'target', 'tear', 'tear', 'temporary', 'term', 'therapy', 'threat', 'threaten', 'thus', 'time', 'title', 'tone', 'tough', 'track', 'transfer', 'transform', 'transition', 'trial', 'trip', 'tropical', 'trouble', 'truly', 'trust', 'try', 'tune', 'tunnel', 'ultimately', 'unconscious', 'unexpected', 'unique', 'universe', 'unknown', 'upper', 'upwards', 'urban', 'urge', 'value', 'vary', 'vast', 'venue', 'very', 'via', 'victory', 'violence', 'virtual', 'vision', 'visual', 'vital', 'vitamin', 'volume', 'wage', 'way', 'weakness', 'wealth', 'wealthy', 'whereas', 'wherever', 'whisper', 'whom', 'widely', 'wildlife', 'willing', 'wind', 'wire', 'wise', 'witness', 'worse', 'worst', 'worth', 'wound', 'wrap', 'wrong', 'yet', 'zone']

In [31]:
reader = PdfReader('The_Oxford_5000_by_CEFR_level.pdf')
  
text = []
for x in range(len(reader.pages)):
    page = reader.pages[x]    
    page_text = page.extract_text().split('\n')
    for x in page_text:
        text.append(x)
b2 = []
c1 = []
b2_trigger = 0
c1_trigger = 0

while(len(text) > 0):
    x = text.pop(0)
    if b2_trigger == 1:
        b2.append(x)
    if x == 'B2':
        b2_trigger = 1
        c1_trigger = 0
        
    if c1_trigger == 1:
        c1.append(x)
    if x == 'C1':
        b2_trigger = 0
        c1_trigger = 1
        
ox5_words_b2 = []
ox5_words_c1 = []
for x in b2:
    ox5_words_b2.append(x.split()[0])
for x in c1:
    ox5_words_c1.append(x.split()[0])

In [32]:
ox5_words_b2 = ['absorb', 'abstract', 'accent', 'accidentally', 'accommodate', 'accomplish', 'accountant', 'accuracy', 'accurately', 'acid', 'activate', 'addiction', 'additionally', 'adequate', 'adequately', 'adjust', 'affordable', 'agriculture', 'AIDS', 'alien', 'alongside', 'altogether', 'ambulance', 'amusing', 'analyst', 'ancestor', 'animation', 'annually', 'anticipate', 'anxiety', 'apology', 'applicant', 'appropriately', 'arrow', 'artwork', 'aside', 'asset', 'assign', 'assistance', 'assumption', 'assure', 'astonishing', 'attachment', 'auction', 'audio', 'automatic', 'automatically', 'awareness', 'awkward', 'badge', 'balanced', 'ballet', 'balloon', 'barely', 'bargain', 'basement', 'basket', 'bat', 'beside', 'besides', 'bias', 'bid', 'biological', 'blanket', 'blow', 'bold', 'bombing', 'booking', 'boost', 'bound', 'brick', 'briefly', 'broadcaster', 'broadly', 'bug', 'cabin', 'canal', 'candle', 'carbon', 'casual', 'cave', 'certainty', 'certificate', 'challenging', 'championship', 'charming', 'chase', 'cheek', 'cheer', 'choir', 'chop', 'circuit', 'civilization', 'clarify', 'classify', 'clerk', 'cliff', 'clinic', 'clip', 'coincidence', 'collector', 'colony', 'colourful', 'comic', 'commander', 'comparative', 'completion', 'compose', 'composer', 'compound', 'comprehensive', 'comprise', 'compulsory', 'concrete', 'confess', 'confusion', 'consequently', 'considerable', 'considerably', 'consistently', 'conspiracy', 'consult', 'consultant', 'consumption', 'controversial', 'controversy', 'convenience', 'convention', 'conventional', 'convey', 'convincing', 'cope', 'corporation', 'corridor', 'counter', 'coverage', 'crack', 'craft', 'creativity', 'critically', 'cruise', 'cue', 'curious', 'curriculum', 'cute', 'dairy', 'dare', 'darkness', 'database', 'deadline', 'deadly', 'dealer', 'deck', 'defender', 'delete', 'democracy', 'democratic', 'demonstration', 'depart', 'dependent', 'deposit', 'depression', 'derive', 'desperately', 'destruction', 'determination', 'devote', 'differ', 'disability', 'disabled', 'disagreement', 'disappoint', 'disappointment', 'discourage', 'disorder', 'distant', 'distinguish', 'distract', 'disturb', 'dive', 'diverse', 'diversity', 'divorce', 'dominant', 'donation', 'dot', 'downtown', 'dramatically', 'drought', 'dull', 'dump', 'duration', 'dynamic', 'economics', 'economist', 'editorial', 'efficiently', 'elbow', 'electronics', 'elegant', 'elementary', 'eliminate', 'embrace', 'emission', 'emotionally', 'empire', 'enjoyable', 'entertaining', 'entrepreneur', 'envelope', 'equip', 'equivalent', 'era', 'erupt', 'essentially', 'ethic', 'ethnic', 'evaluation', 'evident', 'evolution', 'evolve', 'exceed', 'exception', 'excessive', 'exclude', 'exhibit', 'exit', 'exotic', 'expansion', 'expertise', 'exploit', 'exposure', 'extension', 'extensive', 'extensively', 'extract', 'fabric', 'fabulous', 'failed', 'fake', 'fame', 'fantasy', 'fare', 'federal', 'fever', 'firefighter', 'firework', 'firm', 'firmly', 'flavour', 'fond', 'fool', 'forbid', 'forecast', 'format', 'formation', 'formerly', 'fortunate', 'forum', 'fossil', 'foundation', 'founder', 'fraction', 'fragment', 'framework', 'fraud', 'freely', 'frequent', 'fulfil', 'full-time', 'fundamentally', 'furious', 'gaming', 'gay', 'gender', 'gene', 'genetic', 'genius', 'genuine', 'genuinely', 'gesture', 'gig', 'globalization', 'globe', 'golden', 'goodness', 'gorgeous', 'governor', 'graphic', 'graphics', 'greatly', 'greenhouse', 'grocery', 'guideline', 'habitat', 'harbour', 'headquarters', 'heal', 'healthcare', 'helmet', 'hence', 'herb', 'hidden', 'highway', 'hilarious', 'historian', 'homeless', 'honesty', 'hook', 'hopefully', 'hunger', 'hypothesis', 'icon', 'ID', 'identical', 'illusion', 'immigration', 'immune', 'implement', 'implication', 'incentive', 'incorporate', 'incorrect', 'independence', 'index', 'indication', 'inevitable', 'inevitably', 'infer', 'inflation', 'info', 'infrastructure', 'inhabitant', 'inherit', 'ink', 'innovation', 'innovative', 'input', 'insert', 'inspector', 'installation', 'instant', 'instantly', 'integrate', 'intellectual', 'interact', 'interaction', 'interpretation', 'interval', 'invade', 'invasion', 'investor', 'isolate', 'isolated', 'jail', 'jet', 'joint', 'journalism', 'jury', 'kit', 'ladder', 'landing', 'lane', 'lately', 'leaflet', 'legend', 'lens', 'lifetime', 'lighting', 'likewise', 'limitation', 'literally', 'literary', 'litre', 'logo', 'lottery', 'loyal', 'lyric', 'magnificent', 'make-up', 'making', 'manufacture', 'manufacturing', 'marathon', 'margin', 'marker', 'martial', 'mate', 'mayor', 'mechanic', 'mechanical', 'mechanism', 'medal', 'medication', 'membership', 'memorable', 'metaphor', 'miner', 'miserable', 'mode', 'modest', 'monster', 'monthly', 'monument', 'moreover', 'mortgage', 'mosque', 'motion', 'motivate', 'motivation', 'moving', 'myth', 'naked', 'nasty', 'navigation', 'nearby', 'necessity', 'negotiate', 'negotiation', 'neutral', 'newly', 'norm', 'notebook', 'novelist', 'nowadays', 'nursing', 'nutrition', 'obesity', 'observer', 'obstacle', 'occupation', 'occupy', 'offender', 'ongoing', 'openly', 'opera', 'operator', 'optimistic', 'orchestra', 'organic', 'outfit', 'output', 'outstanding', 'overnight', 'overseas', 'ownership', 'oxygen', 'packet', 'palm', 'panic', 'parade', 'parallel', 'participation', 'partnership', 'part-time', 'passionate', 'password', 'patience', 'pause', 'peer', 'penalty', 'perceive', 'perception', 'permanently', 'pill', 'pity', 'placement', 'portion', 'potentially', 'precede', 'precious', 'precise', 'precisely', 'predictable', 'preference', 'pride', 'primarily', 'principal', 'prior', 'probability', 'probable', 'proceed', 'programming', 'progressive', 'prohibit', 'promising', 'promotion', 'prompt', 'proportion', 'protein', 'protester', 'psychological', 'publicity', 'publishing', 'punk', 'purely', 'pursuit', 'puzzle', 'questionnaire', 'racial', 'racism', 'racist', 'radiation', 'rail', 'random', 'rat', 'rating', 'reasonably', 'rebuild', 'receiver', 'recession', 'reckon', 'recognition', 'recovery', 'recruit', 'recruitment', 'referee', 'refugee', 'registration', 'regulate', 'reinforce', 'relieve', 'relieved', 'remarkable', 'remarkably', 'reporting', 'resign', 'resolution', 'restore', 'restrict', 'restriction', 'retail', 'retirement', 'revenue', 'revision', 'ridiculous', 'risky', 'rival', 'rob', 'robbery', 'rocket', 'romance', 'rose', 'roughly', 'ruin', 'satisfaction', 'scandal', 'scare', 'scenario', 'scholar', 'scholarship', 'scratch', 'screening', 'seeker', 'seminar', 'settler', 'severely', 'sexy', 'shaped', 'shocking', 'shore', 'shortage', 'shortly', 'short-term', 'sibling', 'signature', 'significance', 'skilled', 'skull', 'slogan', 'so-called', 'somehow', 'sometime', 'sophisticated', 'spare', 'specialize', 'specify', 'spectacular', 'spectator', 'speculate', 'speculation', 'spice', 'spite', 'spoil', 'spokesman', 'spokesperson', 'spokeswoman', 'sponsorship', 'sporting', 'stall', 'stance', 'starve', 'steadily', 'steam', 'stimulate', 'strengthen', 'strictly', 'stroke', 'stunning', 'subsequent', 'subsequently', 'suburb', 'suffering', 'sufficient', 'sufficiently', 'super', 'surgeon', 'survival', 'survivor', 'suspend', 'sustainable', 'swallow', 'sympathetic', 'tackle', 'tag', 'tap', 'technological', 'teens', 'temple', 'temporarily', 'tendency', 'tension', 'terminal', 'terms', 'terribly', 'terrify', 'territory', 'terror', 'terrorism', 'terrorist', 'testing', 'textbook', 'theft', 'therapist', 'thesis', 'thorough', 'thoroughly', 'thumb', 'timing', 'tissue', 'ton', 'tonne', 'tournament', 'trace', 'trading', 'tragedy', 'tragic', 'trait', 'transmit', 'transportation', 'trap', 'tribe', 'trigger', 'trillion', 'troop', 'tsunami', 'ultimate', 'unacceptable', 'uncertainty', 'undergo', 'undertake', 'unfold', 'unfortunate', 'unite', 'unity', 'universal', 'urgent', 'usage', 'useless', 'valid', 'variation', 'vertical', 'viewpoint', 'visa', 'visible', 'voluntary', 'voting', 'wander', 'warming', 'weekly', 'weird', 'welfare', 'wheat', 'whoever', 'widespread', 'wisdom', 'withdraw', 'workforce', 'workplace', 'workshop', 'worm', 'wrist']

In [33]:
ox5_words_c1 = ['abolish', 'abortion', 'absence', 'absent', 'absurd', 'abundance', 'abuse', 'academy', 'accelerate', 'acceptance', 'accessible', 'accomplishment', 'accordance', 'accordingly', 'accountability', 'accountable', 'accumulate', 'accumulation', 'accusation', 'accused', 'acid', 'acquisition', 'acre', 'activation', 'activist', 'acute', 'adaptation', 'adhere', 'adjacent', 'adjustment', 'administer', 'administrative', 'administrator', 'admission', 'adolescent', 'adoption', 'adverse', 'advocate', 'aesthetic', 'affection', 'aftermath', 'aggression', 'agricultural', 'aide', 'albeit', 'alert', 'alien', 'align', 'alignment', 'alike', 'allegation', 'allege', 'allegedly', 'alliance', 'allocate', 'allocation', 'allowance', 'ally', 'aluminium', 'amateur', 'ambassador', 'amend', 'amendment', 'amid', 'analogy', 'anchor', 'angel', 'anonymous', 'apparatus', 'appealing', 'appetite', 'applaud', 'applicable', 'appoint', 'appreciation', 'arbitrary', 'architectural', 'archive', 'arena', 'arguably', 'arm', 'array', 'articulate', 'ash', 'aspiration', 'aspire', 'assassination', 'assault', 'assemble', 'assembly', 'assert', 'assertion', 'assurance', 'asylum', 'atrocity', 'attain', 'attendance', 'attorney', 'attribute', 'audit', 'authentic', 'authorize', 'auto', 'autonomy', 'availability', 'await', 'backdrop', 'backing', 'backup', 'bail', 'ballot', 'banner', 'bare', 'barrel', 'bass', 'bat', 'battlefield', 'bay', 'beam', 'beast', 'behalf', 'beloved', 'bench', 'benchmark', 'beneath', 'beneficiary', 'betray', 'bind', 'biography', 'bishop', 'bizarre', 'blade', 'blast', 'bleed', 'blend', 'bless', 'blessing', 'boast', 'bonus', 'boom', 'bounce', 'boundary', 'bow', 'breach', 'breakdown', 'breakthrough', 'breed', 'broadband', 'browser', 'brutal', 'buck', 'buddy', 'buffer', 'bulk', 'burden', 'bureaucracy', 'burial', 'burst', 'cabinet', 'calculation', 'canvas', 'capability', 'capitalism', 'capitalist', 'cargo', 'carriage', 'carve', 'casino', 'casualty', 'catalogue', 'cater', 'cattle', 'caution', 'cautious', 'cease', 'cemetery', 'chamber', 'chaos', 'characterize', 'charm', 'charter', 'chronic', 'chunk', 'circulate', 'circulation', 'citizenship', 'civic', 'civilian', 'clarity', 'clash', 'classification', 'cling', 'clinical', 'closure', 'cluster', 'coalition', 'coastal', 'cocktail', 'cognitive', 'coincide', 'collaborate', 'collaboration', 'collective', 'collision', 'colonial', 'columnist', 'combat', 'commence', 'commentary', 'commentator', 'commerce', 'commissioner', 'commodity', 'communist', 'companion', 'comparable', 'compassion', 'compel', 'compelling', 'compensate', 'compensation', 'competence', 'competent', 'compile', 'complement', 'complexity', 'compliance', 'complication', 'comply', 'composition', 'compromise', 'compute', 'conceal', 'concede', 'conceive', 'conception', 'concession', 'condemn', 'confer', 'confession', 'configuration', 'confine', 'confirmation', 'confront', 'confrontation', 'congratulate', 'congregation', 'congressional', 'conquer', 'conscience', 'consciousness', 'consecutive', 'consensus', 'consent', 'conserve', 'consistency', 'consolidate', 'constituency', 'constitute', 'constitution', 'constitutional', 'constraint', 'consultation', 'contemplate', 'contempt', 'contend', 'contender', 'content', 'contention', 'continually', 'contractor', 'contradiction', 'contrary', 'contributor', 'conversion', 'convict', 'conviction', 'cooperate', 'cooperative', 'coordinate', 'coordination', 'coordinator', 'cop', 'copper', 'copyright', 'correction', 'correlate', 'correlation', 'correspond', 'correspondence', 'correspondent', 'corresponding', 'corrupt', 'corruption', 'costly', 'councillor', 'counselling', 'counsellor', 'counter', 'counterpart', 'countless', 'coup', 'courtesy', 'craft', 'crawl', 'creator', 'credibility', 'credible', 'creep', 'critique', 'crown', 'crude', 'crush', 'crystal', 'cult', 'cultivate', 'curiosity', 'custody', 'cutting', 'cynical', 'dam', 'damaging', 'dawn', 'debris', 'debut', 'decision-making', 'decisive', 'declaration', 'dedicated', 'dedication', 'deed', 'deem', 'default', 'defect', 'defensive', 'deficiency', 'deficit', 'defy', 'delegate', 'delegation', 'delicate', 'demon', 'denial', 'denounce', 'dense', 'density', 'dependence', 'depict', 'deploy', 'deployment', 'deposit', 'deprive', 'deputy', 'descend', 'descent', 'designate', 'desirable', 'desktop', 'destructive', 'detain', 'detection', 'detention', 'deteriorate', 'devastate', 'devil', 'devise', 'diagnose', 'diagnosis', 'dictate', 'dictator', 'differentiate', 'dignity', 'dilemma', 'dimension', 'diminish', 'dip', 'diplomat', 'diplomatic', 'directory', 'disastrous', 'discard', 'discharge', 'disclose', 'disclosure', 'discourse', 'discretion', 'discrimination', 'dismissal', 'displace', 'disposal', 'dispose', 'dispute', 'disrupt', 'disruption', 'dissolve', 'distinction', 'distinctive', 'distort', 'distress', 'disturbing', 'divert', 'divine', 'doctrine', 'documentation', 'domain', 'dominance', 'donor', 'dose', 'drain', 'drift', 'driving', 'drown', 'dual', 'dub', 'dumb', 'duo', 'dynamic', 'eager', 'earnings', 'ease', 'echo', 'ecological', 'educator', 'effectiveness', 'efficiency', 'ego', 'elaborate', 'electoral', 'elevate', 'eligible', 'elite', 'embark', 'embarrassment', 'embassy', 'embed', 'embody', 'emergence', 'empirical', 'empower', 'enact', 'encompass', 'encouragement', 'encouraging', 'endeavour', 'endless', 'endorse', 'endorsement', 'endure', 'enforce', 'enforcement', 'engagement', 'engaging', 'enquire', 'enrich', 'enrol', 'ensue', 'enterprise', 'enthusiast', 'entitle', 'entity', 'epidemic', 'equality', 'equation', 'erect', 'escalate', 'essence', 'establishment', 'eternal', 'evacuate', 'evoke', 'evolutionary', 'exaggerate', 'excellence', 'exceptional', 'excess', 'exclusion', 'exclusive', 'exclusively', 'execute', 'execution', 'exert', 'exile', 'exit', 'expenditure', 'experimental', 'expire', 'explicit', 'explicitly', 'exploitation', 'explosive', 'extract', 'extremist', 'facilitate', 'faction', 'faculty', 'fade', 'fairness', 'fatal', 'fate', 'favourable', 'feat', 'feminist', 'fibre', 'fierce', 'film-maker', 'filter', 'fine', 'firearm', 'fit', 'fixture', 'flaw', 'flawed', 'flee', 'fleet', 'flesh', 'flexibility', 'flourish', 'fluid', 'footage', 'foreigner', 'forge', 'formula', 'formulate', 'forth', 'forthcoming', 'foster', 'fragile', 'franchise', 'frankly', 'frustrated', 'frustrating', 'frustration', 'functional', 'fundraising', 'funeral', 'gallon', 'gambling', 'gathering', 'gaze', 'gear', 'generic', 'genocide', 'glance', 'glimpse', 'glorious', 'glory', 'governance', 'grace', 'grasp', 'grave', 'grave', 'gravity', 'grid', 'grief', 'grin', 'grind', 'grip', 'gross', 'guerrilla', 'guidance', 'guilt', 'gut', 'hail', 'halfway', 'halt', 'handful', 'handling', 'handy', 'harassment', 'hardware', 'harmony', 'harsh', 'harvest', 'hatred', 'haunt', 'hazard', 'heighten', 'heritage', 'hierarchy', 'high-profile', 'hint', 'homeland', 'hook', 'hopeful', 'horizon', 'horn', 'hostage', 'hostile', 'hostility', 'humanitarian', 'humanity', 'humble', 'hydrogen', 'identification', 'ideological', 'ideology', 'idiot', 'ignorance', 'imagery', 'immense', 'imminent', 'implementation', 'imprison', 'imprisonment', 'inability', 'inadequate', 'inappropriate', 'incidence', 'inclined', 'inclusion', 'incur', 'indicator', 'indictment', 'indigenous', 'induce', 'indulge', 'inequality', 'infamous', 'infant', 'infect', 'inflict', 'influential', 'inherent', 'inhibit', 'initiate', 'inject', 'injection', 'injustice', 'inmate', 'insertion', 'insider', 'inspect', 'inspection', 'inspiration', 'instinct', 'institutional', 'instruct', 'instrumental', 'insufficient', 'insult', 'intact', 'intake', 'integral', 'integrated', 'integration', 'integrity', 'intellectual', 'intensify', 'intensity', 'intensive', 'intent', 'interactive', 'interface', 'interfere', 'interference', 'interim', 'interior', 'intermediate', 'intervene', 'intervention', 'intimate', 'intriguing', 'investigator', 'invisible', 'invoke', 'involvement', 'ironic', 'ironically', 'irony', 'irrelevant', 'isolation', 'judicial', 'junction', 'jurisdiction', 'just', 'justification', 'kidnap', 'kidney', 'kingdom', 'lad', 'landlord', 'landmark', 'lap', 'large-scale', 'laser', 'latter', 'lawn', 'lawsuit', 'layout', 'leak', 'leap', 'legacy', 'legendary', 'legislation', 'legislative', 'legislature', 'legitimate', 'lengthy', 'lesbian', 'lesser', 'lethal', 'liable', 'liberal', 'liberation', 'liberty', 'license', 'lifelong', 'likelihood', 'limb', 'linear', 'line-up', 'linger', 'listing', 'literacy', 'liver', 'lobby', 'log', 'logic', 'long-standing', 'long-time', 'loom', 'loop', 'loyalty', 'machinery', 'magical', 'magistrate', 'magnetic', 'magnitude', 'mainland', 'mainstream', 'maintenance', 'mandate', 'mandatory', 'manifest', 'manipulate', 'manipulation', 'manuscript', 'march', 'marginal', 'marine', 'marketplace', 'mask', 'massacre', 'mathematical', 'mature', 'maximize', 'meaningful', 'meantime', 'medieval', 'meditation', 'melody', 'memo', 'memoir', 'memorial', 'mentor', 'merchant', 'mercy', 'mere', 'merely', 'merge', 'merger', 'merit', 'methodology', 'midst', 'migration', 'militant', 'militia', 'mill', 'minimal', 'minimize', 'mining', 'ministry', 'minute', 'miracle', 'misery', 'misleading', 'missile', 'mob', 'mobility', 'mobilize', 'moderate', 'modification', 'momentum', 'monk', 'monopoly', 'morality', 'motive', 'motorist', 'municipal', 'mutual', 'namely', 'nationwide', 'naval', 'neglect', 'neighbouring', 'nest', 'net', 'newsletter', 'niche', 'noble', 'nod', 'nominate', 'nomination', 'nominee', 'nonetheless', 'non-profit', 'nonsense', 'noon', 'notable', 'notably', 'notify', 'notorious', 'novel', 'nursery', 'objection', 'oblige', 'obsess', 'obsession', 'occasional', 'occurrence', 'odds', 'offering', 'offspring', 'operational', 'opt', 'optical', 'optimism', 'oral', 'organizational', 'orientation', 'originate', 'outbreak', 'outing', 'outlet', 'outlook', 'outrage', 'outsider', 'overlook', 'overly', 'oversee', 'overturn', 'overwhelm', 'overwhelming', 'pad', 'parameter', 'parental', 'parish', 'parliamentary', 'partial', 'partially', 'passing', 'passive', 'pastor', 'patch', 'patent', 'pathway', 'patrol', 'patron', 'peak', 'peasant', 'peculiar', 'persist', 'persistent', 'personnel', 'petition', 'philosopher', 'philosophical', 'physician', 'pioneer', 'pipeline', 'pirate', 'pit', 'plea', 'plead', 'pledge', 'plug', 'plunge', 'pole', 'poll', 'pond', 'pop', 'portfolio', 'portray', 'postpone', 'post-war', 'practitioner', 'preach', 'precedent', 'precision', 'predator', 'predecessor', 'predominantly', 'pregnancy', 'prejudice', 'preliminary', 'premier', 'premise', 'premium', 'prescribe', 'prescription', 'presently', 'preservation', 'preside', 'presidency', 'presidential', 'prestigious', 'presumably', 'presume', 'prevail', 'prevalence', 'prevention', 'prey', 'principal', 'privatization', 'privilege', 'probe', 'problematic', 'proceedings', 'proceeds', 'processing', 'processor', 'proclaim', 'productive', 'productivity', 'profitable', 'profound', 'projection', 'prominent', 'pronounced', 'propaganda', 'proposition', 'prosecute', 'prosecution', 'prosecutor','prospective', 'prosperity', 'protective', 'protocol', 'province', 'provincial', 'provision', 'provoke', 'psychiatric', 'pulse', 'pump', 'punch', 'query', 'quest', 'quota', 'radar', 'radical', 'rage', 'raid', 'rally', 'ranking', 'rape', 'ratio', 'rational', 'ray', 'readily', 'realization', 'realm', 'rear', 'reasoning', 'reassure', 'rebel', 'rebellion', 'recipient', 'reconstruction', 'recount', 'referendum', 'reflection', 'reform', 'refuge', 'refusal', 'regain', 'regardless', 'regime', 'regulator', 'regulatory', 'rehabilitation', 'reign', 'rejection', 'relevance', 'reliability', 'reluctant', 'remainder', 'remains', 'remedy', 'reminder', 'removal', 'render', 'renew', 'renowned', 'rental', 'replacement', 'reportedly', 'representation', 'reproduce', 'reproduction', 'republicn.', 'resemble', 'reside', 'residence', 'residential', 'residue', 'resignation', 'resistance', 'respective', 'respectively', 'restoration', 'restraint', 'resume', 'retreat', 'retrieve', 'revelation', 'reven', 'reverse', 'revival', 'revive', 'revolutionary', 'rhetoric', 'rifle', 'riot', 'rip', 'ritual', 'robust', 'rock', 'rod', 'rotate', 'rotation', 'ruling', 'rumour', 'sack', 'sacred', 'sacrifice', 'saint', 'sake', 'sanction', 'say', 'scattered', 'sceptical', 'scope', 'screw', 'scrutiny', 'seal', 'secular', 'seemingly', 'segment', 'seize', 'seldom', 'selective', 'senator', 'sensation', 'sensitivity', 'sentiment', 'separation', 'serial', 'settlement', 'set-up', 'sexuality', 'shareholder', 'shatter', 'shed', 'sheer', 'shipping', 'shoot', 'shrink', 'shrug', 'sigh', 'simulate', 'simulation', 'simultaneously', 'sin', 'situated', 'sketch', 'skip', 'slam', 'slap', 'slash', 'slavery', 'slot', 'smash', 'snap', 'soak', 'soar', 'socialist', 'sole', 'solely', 'solicitor', 'solidarity', 'solo', 'sound', 'sovereignty', 'spam', 'span', 'spare', 'spark', 'specialized', 'specification', 'specimen', 'spectacle', 'spectrum', 'spell', 'sphere', 'spin', 'spine', 'spotlight', 'spouse', 'spy', 'squad', 'squeeze', 'stab', 'stability', 'stabilize', 'stake', 'standing', 'stark', 'statistical', 'steer', 'stem', 'stereotype', 'stimulus', 'stir', 'storage', 'straightforward', 'strain', 'strand', 'strategic', 'striking', 'strip', 'strip', 'strive', 'structural', 'stumble', 'stun', 'submission', 'subscriber', 'subscription', 'subsidy', 'substantial', 'substantially', 'substitute', 'substitution', 'subtle', 'suburban', 'succession', 'successive', 'successor', 'suck', 'sue', 'suicide', 'suite', 'summit', 'superb', 'superior', 'supervise', 'supervision', 'supervisor', 'supplement', 'supportive', 'supposedly', 'suppress', 'supreme', 'surge', 'surgical', 'surplus', 'surrender', 'surveillance', 'suspension', 'suspicion', 'suspicious', 'sustain', 'swing', 'sword', 'sword', 'symbolic', 'syndrome', 'synthesis', 'systematic', 'tackle', 'tactic', 'tactical', 'taxpayer', 'tempt', 'tenant', 'tender', 'tenure', 'terminal', 'terminate', 'terrain', 'terrific', 'testify', 'testimony', 'texture', 'thankfully', 'theatrical', 'theology', 'theoretical', 'thereafter', 'thereby', 'thoughtful', 'thought-provoking', 'thread', 'threshold', 'thrilled', 'thrive', 'tide', 'tighten', 'timber', 'timely', 'tobacco', 'tolerance', 'tolerate', 'toll', 'top', 'torture', 'toss', 'total', 'toxic', 'trace', 'trademark', 'trail', 'trailer', 'transaction', 'transcript', 'transformation', 'transit', 'transmission', 'transparency', 'transparent', 'trauma', 'treaty', 'tremendous', 'tribal', 'tribunal', 'tribute', 'trigger', 'trio', 'triumph', 'trophy', 'trustee', 'tuition', 'turnout', 'turnover', 'twist', 'undergraduate', 'underlying', 'undermine', 'undoubtedly', 'unify', 'unprecedented', 'unveil', 'upcoming', 'upgrade', 'uphold', 'utility', 'utilize', 'utterly', 'vacuum', 'vague', 'validity', 'vanish', 'variable', 'varied', 'vein', 'verbal', 'verdict', 'verify', 'verse', 'versus', 'vessel', 'veteran', 'viable', 'vibrant', 'vice', 'vicious', 'villager', 'violate', 'violation', 'virtue', 'vocal', 'vow', 'vulnerability', 'vulnerable', 'ward', 'warehouse', 'warfare', 'warrant', 'warrior', 'weaken', 'weed', 'well', 'well-being', 'whatsoever', 'whereby', 'whilst', 'whip', 'wholly', 'widen', 'widow', 'width', 'willingness', 'wipe', 'wit', 'withdrawal', 'workout', 'worship', 'worthwhile', 'worthy', 'yell', 'yield', 'youngster']

Соберем слова из всех словарей по категориям и удалим повторяющиеся.

In [34]:
words_a1 = am3_words_a1 + ox3_words_a1    
words_a1 = list(set(words_a1))

In [35]:
words_a2 = am3_words_a2 + ox3_words_a2
words_a2 = list(set(words_a2))

In [36]:
words_b1 = am3_words_b1 + ox3_words_b1
words_b1 = list(set(words_b1))

In [37]:
words_b2 = am3_words_b2 + ox3_words_b2 + am5_words_b2 + ox5_words_b2
words_b2 = list(set(words_b2))

In [38]:
words_c1 = am5_words_c1 + ox5_words_c1
words_c1 = list(set(words_c1))

In [39]:
def word_count(row, words):             # функция, которая сравнивает слова в субтитрах со словами в категориях 
    counter = 0                         # и считает долю
    sub_words = row['subs'].split()
    for x in sub_words:
        for y in words:
            if x == y:
                counter +=1
    part = counter / len(sub_words)
    return part

In [40]:
%%time
df['A1'] = df.apply(word_count, axis=1, words=words_a1)    # добавляем столбцы с категорией в датасет
df['A2'] = df.apply(word_count, axis=1, words=words_a2)
df['B1'] = df.apply(word_count, axis=1, words=words_b1)
df['B2'] = df.apply(word_count, axis=1, words=words_b2)
df['C1'] = df.apply(word_count, axis=1, words=words_c1)

CPU times: total: 5min 12s
Wall time: 5min 13s


In [41]:
df

Unnamed: 0,id,Movie,Level,subs,A1,A2,B1,B2,C1
0,0,10_Cloverfield_lane(2016),B1,fixed synced bozxphd enjoy flick ben phone ...,0.403160,0.134522,0.083874,0.057536,0.021070
1,1,10_things_I_hate_about_you(1999),B1,hey ill right cameron go nine schools y...,0.452248,0.129578,0.088549,0.065369,0.034075
2,2,A_knights_tale(2001),B2,resync xenzainef retail help hes due list...,0.373500,0.155692,0.082911,0.083178,0.038390
3,3,A_star_is_born(2018),B2,synced corrected mrcjnthn get black eyes o...,0.485156,0.132990,0.068616,0.063011,0.025144
4,4,Aladdin(1992),A2/A2+,oh come land faraway place caravan came...,0.371068,0.157750,0.101952,0.080827,0.029162
...,...,...,...,...,...,...,...,...,...
236,236,Matilda(2022),C1,chiming music playing mummy says im miracle ...,0.387944,0.135497,0.074875,0.071956,0.037781
237,237,Bullet train,B1,boomslang stolen zoo last night extremely...,0.455738,0.172035,0.081967,0.074638,0.030665
238,238,Thor love and thunder,B2,oh great mighty rapu pray water sustenan...,0.409139,0.164536,0.083840,0.079229,0.031440
239,239,Lightyear,B2,buzz buzz lightyear mission log stardate senso...,0.372978,0.167456,0.089349,0.104931,0.040237


В наших размеченных данных, есть смежные категории, а в словарях только одинарные, приведем их к общему виду, смежные сделаем одинарными в пользу более сложного уровня.

In [42]:
df['Level'].unique()

array(['B1', 'B2', 'A2/A2+', 'C1', 'B1, B2', 'A2/A2+, B1', 'A2'],
      dtype=object)

In [43]:
df['Level'] = df['Level'].replace('A2/A2+', 'A2')

In [44]:
df['Level'] = df['Level'].replace('A2/A2+, B1', 'B1')

In [45]:
df['Level'] = df['Level'].replace('B1, B2', 'B2')

In [46]:
df['Level'].unique()

array(['B1', 'B2', 'A2', 'C1'], dtype=object)

In [47]:
df.describe()

Unnamed: 0,id,A1,A2,B1,B2,C1
count,241.0,241.0,241.0,241.0,241.0,241.0
mean,120.0,0.410353,0.144597,0.080703,0.074686,0.028679
std,69.714896,0.041192,0.014463,0.009794,0.013116,0.006407
min,0.0,0.248975,0.10851,0.058748,0.043619,0.010217
25%,60.0,0.387856,0.135427,0.074425,0.065639,0.024027
50%,120.0,0.409206,0.142505,0.080817,0.073397,0.028396
75%,180.0,0.4369,0.151773,0.086408,0.082273,0.032654
max,240.0,0.522329,0.202627,0.117504,0.117647,0.044548


Применим кодирование к словам в столбце с субтитрами.

In [48]:
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))

In [49]:
vec_sub = vectorizer.fit_transform(df['subs']).toarray()
vec_sub

array([[0, 3, 0, ..., 0, 1, 0],
       [0, 1, 1, ..., 0, 1, 0],
       [0, 1, 2, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 2, 0]], dtype=int64)

In [50]:
col_vec_sub = vectorizer.get_feature_names_out()
col_vec_sub

array(['aah', 'able', 'absolutely', ..., 'york', 'young', 'zane'],
      dtype=object)

In [51]:
df_vec = pd.DataFrame(vec_sub, columns = col_vec_sub)
df_vec

Unnamed: 0,aah,able,absolutely,accept,access,accident,account,across,act,acting,...,wrote,ya,yall,yelling,yep,yesterday,yo,york,young,zane
0,0,3,0,1,0,3,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,1,1,0,0,0,0,1,1,2,...,0,2,0,0,0,1,0,0,1,0
2,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0,3,0,0,0,0,0,2,0,0,...,7,1,1,0,2,0,5,0,0,0
4,3,1,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,5,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0,3,1,0,0,1,0,1,1,0,...,0,1,0,14,1,1,0,0,2,0
237,0,1,1,1,0,5,0,0,1,0,...,0,0,0,0,2,0,0,0,6,0
238,0,0,1,2,0,0,0,0,1,2,...,3,1,0,0,1,0,0,0,0,0
239,0,1,1,0,0,0,0,1,0,0,...,0,2,0,0,4,0,0,0,0,0


Присоединим к изначальному датасету закодированную часть.

In [52]:
df2 = df.join(df_vec, lsuffix='_left', rsuffix='_right')
df2

Unnamed: 0,id,Movie,Level,subs,A1,A2,B1,B2,C1,aah,...,wrote,ya,yall,yelling,yep,yesterday,yo,york,young,zane
0,0,10_Cloverfield_lane(2016),B1,fixed synced bozxphd enjoy flick ben phone ...,0.403160,0.134522,0.083874,0.057536,0.021070,0,...,0,0,0,0,1,0,0,0,1,0
1,1,10_things_I_hate_about_you(1999),B1,hey ill right cameron go nine schools y...,0.452248,0.129578,0.088549,0.065369,0.034075,0,...,0,2,0,0,0,1,0,0,1,0
2,2,A_knights_tale(2001),B2,resync xenzainef retail help hes due list...,0.373500,0.155692,0.082911,0.083178,0.038390,0,...,0,0,0,0,0,0,0,1,0,0
3,3,A_star_is_born(2018),B2,synced corrected mrcjnthn get black eyes o...,0.485156,0.132990,0.068616,0.063011,0.025144,0,...,7,1,1,0,2,0,5,0,0,0
4,4,Aladdin(1992),A2,oh come land faraway place caravan came...,0.371068,0.157750,0.101952,0.080827,0.029162,3,...,0,1,0,0,1,0,5,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,236,Matilda(2022),C1,chiming music playing mummy says im miracle ...,0.387944,0.135497,0.074875,0.071956,0.037781,0,...,0,1,0,14,1,1,0,0,2,0
237,237,Bullet train,B1,boomslang stolen zoo last night extremely...,0.455738,0.172035,0.081967,0.074638,0.030665,0,...,0,0,0,0,2,0,0,0,6,0
238,238,Thor love and thunder,B2,oh great mighty rapu pray water sustenan...,0.409139,0.164536,0.083840,0.079229,0.031440,0,...,3,1,0,0,1,0,0,0,0,0
239,239,Lightyear,B2,buzz buzz lightyear mission log stardate senso...,0.372978,0.167456,0.089349,0.104931,0.040237,0,...,0,2,0,0,4,0,0,0,0,0


Теперь можно приступать к разделению на features и target и обучению модели.

In [53]:
target = df2['Level']
features = df2.drop(['Level', 'id', 'Movie', 'subs'], axis=1)

In [54]:
features

Unnamed: 0,A1,A2,B1,B2,C1,aah,able,absolutely,accept,access,...,wrote,ya,yall,yelling,yep,yesterday,yo,york,young,zane
0,0.403160,0.134522,0.083874,0.057536,0.021070,0,3,0,1,0,...,0,0,0,0,1,0,0,0,1,0
1,0.452248,0.129578,0.088549,0.065369,0.034075,0,1,1,0,0,...,0,2,0,0,0,1,0,0,1,0
2,0.373500,0.155692,0.082911,0.083178,0.038390,0,1,2,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.485156,0.132990,0.068616,0.063011,0.025144,0,3,0,0,0,...,7,1,1,0,2,0,5,0,0,0
4,0.371068,0.157750,0.101952,0.080827,0.029162,3,1,1,0,0,...,0,1,0,0,1,0,5,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,0.387944,0.135497,0.074875,0.071956,0.037781,0,3,1,0,0,...,0,1,0,14,1,1,0,0,2,0
237,0.455738,0.172035,0.081967,0.074638,0.030665,0,1,1,1,0,...,0,0,0,0,2,0,0,0,6,0
238,0.409139,0.164536,0.083840,0.079229,0.031440,0,0,1,2,0,...,3,1,0,0,1,0,0,0,0,0
239,0.372978,0.167456,0.089349,0.104931,0.040237,0,1,1,0,0,...,0,2,0,0,4,0,0,0,0,0


In [55]:
features_train, features_valid, target_train, target_valid = train_test_split(features, target, test_size=0.2,
                                                                           random_state=12345)

In [56]:
print(target_train.count())
print(target_valid.count())

192
49


In [57]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=12345)
classifier.fit(features_train, target_train) 
valid_pred = classifier.predict(features_valid)

print(confusion_matrix(target_valid,valid_pred))

print(classification_report(target_valid,valid_pred))

print(f"F1 Score: {f1_score(target_valid, valid_pred, average='weighted')}")
print(f"accuracy_score: {accuracy_score(target_valid, valid_pred)}")

[[ 0  6  3  0]
 [ 0  7  3  0]
 [ 0  2 17  0]
 [ 0  0  8  3]]
              precision    recall  f1-score   support

          A2       0.00      0.00      0.00         9
          B1       0.47      0.70      0.56        10
          B2       0.55      0.89      0.68        19
          C1       1.00      0.27      0.43        11

    accuracy                           0.55        49
   macro avg       0.50      0.47      0.42        49
weighted avg       0.53      0.55      0.47        49

F1 Score: 0.47416909620991254
accuracy_score: 0.5510204081632653


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
logreg = LogisticRegression(n_jobs=3,C=1e5, solver='saga', 
                                           multi_class='multinomial',
                                           max_iter=1000,
                                           random_state=12345)


logreg.fit(features_train, target_train)
valid_pred = logreg.predict(features_valid)

print(confusion_matrix(target_valid,valid_pred))

print(classification_report(target_valid,valid_pred))

print(f"F1 Score: {f1_score(target_valid, valid_pred, average='weighted')}")
print(f"accuracy_score: {accuracy_score(target_valid, valid_pred)}")

[[ 3  3  3  0]
 [ 0  8  2  0]
 [ 1  8 10  0]
 [ 0  1  1  9]]
              precision    recall  f1-score   support

          A2       0.75      0.33      0.46         9
          B1       0.40      0.80      0.53        10
          B2       0.62      0.53      0.57        19
          C1       1.00      0.82      0.90        11

    accuracy                           0.61        49
   macro avg       0.69      0.62      0.62        49
weighted avg       0.69      0.61      0.62        49

F1 Score: 0.6172310682514763
accuracy_score: 0.6122448979591837




Вторая модель показала метрики лучше.

In [59]:
with open('model.pcl','wb') as fid: # сохраняем модель в файл, чтобы загрузить ее через streamlit.
    dump(logreg, fid)

In [60]:
with open('vectorizer.pcl','wb') as fid: # сохраняем модель в файл, чтобы загрузить ее через streamlit.
    dump(vectorizer, fid)