In [1]:
import numpy as np
import pandas as pd
import re
import csv

In [2]:
# Read the txt version of the textbook dictionary as UTF-8 text. The result is
# one long string in which every entry (Russian item, page number(s), English
# definition) sits on its own line; nothing is lowercased or split yet.
# The context manager closes the file handle as soon as the text has been read.
with open('./textbook_vocab_data/novice_to_intermediate.txt', encoding='utf8') as nti_file:
    nti = nti_file.read()
nti # very messy ugh

'Авария 13 – traffic accident\nавтобус 12 – bus\nавтомобиль/машина 12 – car\nавтомойка 13 – car wash\nаксессуары 13, 15 – accessories\nалкогольный напиток 16 – alcoholic drink\nаллергия 16 – allergy\nальпинизм 10 – mountain climbing\nангина 16 – strep throat\nанкета 14 – application form\nапельсин 5 – orange\nарбуз 5 – watermelon\nаренда 11 – rent n.\nаспирантура 1 – graduate school\nафиша 10 – poster\nБанан 5 – banana\nбанкомат 13 – ATM\nбарабаны 10 – drums\nбаскетбол 10 – basketball\nбег 10 – running\nбежать ~ бегать 12 – to run\nбез + gen. 3 – without\nбезветренный 15 – windless, calm\nбелок, pl. белки 5 – protein\nбензин 13 – gasoline\nберег, prep.: на берегу 1 – сoast, shore\nбесплатный 14 – free\nбеспокоить impf. кого? 16 – to bother, disturb\nбеспокоиться impf. о ком? о чём? (за кого? за что? colloq.) 16 – to worry, be worried\nбижутерия 13 – jewelry\nблин, pl. -ы 6 – pancake\nблокнот 7 – notebook\nблондин/блондинка 2 – blond (man)/blonde (woman)\nблузка 15 – blouse\nблюдо 5 – d

In [3]:
# One list item per line of the file, which separates most of the vocab items.
nti_txt = nti.split('\n')
# Stress marking was encoded with capital letters, so every entry is lowercased.
nti_txt_low = list(map(str.lower, nti_txt))
# Keep the first 30 entries as a small play set to experiment on.
ntiplay = nti_txt_low[:30]
ntiplay

['авария 13 – traffic accident',
 'автобус 12 – bus',
 'автомобиль/машина 12 – car',
 'автомойка 13 – car wash',
 'аксессуары 13, 15 – accessories',
 'алкогольный напиток 16 – alcoholic drink',
 'аллергия 16 – allergy',
 'альпинизм 10 – mountain climbing',
 'ангина 16 – strep throat',
 'анкета 14 – application form',
 'апельсин 5 – orange',
 'арбуз 5 – watermelon',
 'аренда 11 – rent n.',
 'аспирантура 1 – graduate school',
 'афиша 10 – poster',
 'банан 5 – banana',
 'банкомат 13 – atm',
 'барабаны 10 – drums',
 'баскетбол 10 – basketball',
 'бег 10 – running',
 'бежать ~ бегать 12 – to run',
 'без + gen. 3 – without',
 'безветренный 15 – windless, calm',
 'белок, pl. белки 5 – protein',
 'бензин 13 – gasoline',
 'берег, prep.: на берегу 1 – сoast, shore',
 'бесплатный 14 – free',
 'беспокоить impf. кого? 16 – to bother, disturb',
 'беспокоиться impf. о ком? о чём? (за кого? за что? colloq.) 16 – to worry, be worried',
 'бижутерия 13 – jewelry']

It was ultimately easier to use a less coding-intensive solution to end up with a txt file that I can read into this environment. I'm going to try to keep the Russian vocabulary item and its English definition, so I'll try to get rid of some of the formatting weirdness with regular expressions. I will have to be careful with the extended definitions that continue onto multiple lines.

In [4]:
# First, we need to get rid of the page numbers (and any '|' characters).
# Every match is replaced by a single space, so an entry that lists two page
# numbers ("13, 15") is left with a doubled space in front of the dash.
# The pattern is a raw string: in a plain string literal '\s', '\d' and '\|'
# are invalid escape sequences, which newer Python versions warn about.
ntiplay = [re.sub(r'(\s\d+,|\s\d+\s|\|)', ' ', x) for x in ntiplay]
ntiplay

['авария – traffic accident',
 'автобус – bus',
 'автомобиль/машина – car',
 'автомойка – car wash',
 'аксессуары  – accessories',
 'алкогольный напиток – alcoholic drink',
 'аллергия – allergy',
 'альпинизм – mountain climbing',
 'ангина – strep throat',
 'анкета – application form',
 'апельсин – orange',
 'арбуз – watermelon',
 'аренда – rent n.',
 'аспирантура – graduate school',
 'афиша – poster',
 'банан – banana',
 'банкомат – atm',
 'барабаны – drums',
 'баскетбол – basketball',
 'бег – running',
 'бежать ~ бегать – to run',
 'без + gen. – without',
 'безветренный – windless, calm',
 'белок, pl. белки – protein',
 'бензин – gasoline',
 'берег, prep.: на берегу – сoast, shore',
 'бесплатный – free',
 'беспокоить impf. кого? – to bother, disturb',
 'беспокоиться impf. о ком? о чём? (за кого? за что? colloq.) – to worry, be worried',
 'бижутерия – jewelry']

In [5]:
# Put the play set into a single-column dataframe so the entries can be
# taken apart with the pandas string methods.
df_ntiplay = pd.DataFrame(data=ntiplay, columns=['Entry'])
df_ntiplay

Unnamed: 0,Entry
0,авария – traffic accident
1,автобус – bus
2,автомобиль/машина – car
3,автомойка – car wash
4,аксессуары – accessories
5,алкогольный напиток – alcoholic drink
6,аллергия – allergy
7,альпинизм – mountain climbing
8,ангина – strep throat
9,анкета – application form


In [6]:
# Split every entry on the first en dash only (n=1), so any later dash stays
# inside the definition, and show the two halves next to the original entry.
# `n` is passed by keyword: recent pandas releases accept only `pat`
# positionally in Series.str.split.
# The spaces that surrounded the dash stay attached to both halves.
df_ntiplay.join(
    df_ntiplay['Entry']
    .str.split('–', n=1, expand=True)
    .rename(columns={0: 'Russian', 1: 'English'})
)

Unnamed: 0,Entry,Russian,English
0,авария – traffic accident,авария,traffic accident
1,автобус – bus,автобус,bus
2,автомобиль/машина – car,автомобиль/машина,car
3,автомойка – car wash,автомойка,car wash
4,аксессуары – accessories,аксессуары,accessories
5,алкогольный напиток – alcoholic drink,алкогольный напиток,alcoholic drink
6,аллергия – allergy,аллергия,allergy
7,альпинизм – mountain climbing,альпинизм,mountain climbing
8,ангина – strep throat,ангина,strep throat
9,анкета – application form,анкета,application form


In [7]:
# Apply the play-set recipe to the full novice-to-intermediate list:
# replace page numbers (and any '|') with a space, then split each entry on
# the first en dash into a Russian half and an English half.
# The pattern is a raw string so '\s', '\d' and '\|' are not treated as
# (invalid) string escapes.
nti_sub = [re.sub(r'(\s\d+,|\s\d+\s|\|)', ' ', x) for x in nti_txt_low]
df_nti = pd.DataFrame(nti_sub, columns=['Entry'])
# `n` is passed by keyword: recent pandas releases accept only `pat`
# positionally in Series.str.split.
df_nti = df_nti.join(
    df_nti['Entry']
    .str.split('–', n=1, expand=True)
    .rename(columns={0: 'Russian', 1: 'English'})
)
# Keep only the two halves; a line without an en dash has no English half.
df_nti = df_nti[['Russian', 'English']]
df_nti

Unnamed: 0,Russian,English
0,авария,traffic accident
1,автобус,bus
2,автомобиль/машина,car
3,автомойка,car wash
4,аксессуары,accessories
...,...,...
912,яблоко,apple
913,явля́ться impf. кем? чем?,to be (in official or formal contexts)
914,я́года,berry
915,яи́чница,fried eggs


In [8]:
# Tag every row of the novice-to-intermediate list with its level label.
# .assign builds a new frame instead of writing the column into the existing
# object, so the frame displayed by the previous cell is not altered.
df_nti = df_nti.assign(Level='Int')
df_nti

Unnamed: 0,Russian,English,Level
0,авария,traffic accident,Int
1,автобус,bus,Int
2,автомобиль/машина,car,Int
3,автомойка,car wash,Int
4,аксессуары,accessories,Int
...,...,...,...
912,яблоко,apple,Int
913,явля́ться impf. кем? чем?,to be (in official or formal contexts),Int
914,я́года,berry,Int
915,яи́чница,fried eggs,Int


In [9]:
# Read the intermediate-to-advanced dictionary as UTF-8 text into one long
# string. The context manager closes the file handle once the text is read.
with open('./textbook_vocab_data/intermediate_to_advanced.txt', encoding='utf8') as ita_file:
    ita = ita_file.read()
ita # very messy ugh

"АбсолЮтно 3 - absolutely \nаварИйное состоЯние 5 - unsafe condition \nакварель f 3 - watercolor \nаккуратный 2 - neat, tidy, well organized; orderly \nактуально 1 - timely \nакция (акции) 9 - share (stocks) \nальтернатИва 6 - alternative \nамфитеатр 3 - (raised) rear part in orchestra section (theater) \nанализИровать/проанализИровать (что?) 1 - to analyze \nангина 7 - strep throat \nантракт 3 - intermission \nаполитИчность 6 - indifference towards politics \nаттестат 1 - high school diploma \nБаловать/избаловать (кого?) 4 - to indulge, to pamper \nбедность 4 - poverty \nбездельничать 2 - to loaf, not do anything \nбездомный 5 - homeless \nбезопасность 4 - security \nбезработица 2 - unemployment, joblessness \nбезработный (п.) 2 - unemployed (person) \nбезразлИчный 6 - indifferent \nбезусловно 1 - unconditionally; for sure \nбелкИ, sing. белlоlк 7 - proteins \nбельэтаж 3 - dress circle; first tier \nбензИн 9 - gas (gasoline) \nберечь impf (кого?) 4 - to take good care of \nбеспокоить 

In [10]:
# One list item per line of the file, which separates most of the vocab items.
ita_txt = ita.split('\n')
# Stress marking was encoded with capital letters, so every entry is lowercased.
ita_txt_low = list(map(str.lower, ita_txt))
# Keep the first 30 entries as a small play set to look at.
itaplay = ita_txt_low[:30]
itaplay

['абсолютно 3 - absolutely ',
 'аварийное состояние 5 - unsafe condition ',
 'акварель f 3 - watercolor ',
 'аккуратный 2 - neat, tidy, well organized; orderly ',
 'актуально 1 - timely ',
 'акция (акции) 9 - share (stocks) ',
 'альтернатива 6 - alternative ',
 'амфитеатр 3 - (raised) rear part in orchestra section (theater) ',
 'анализировать/проанализировать (что?) 1 - to analyze ',
 'ангина 7 - strep throat ',
 'антракт 3 - intermission ',
 'аполитичность 6 - indifference towards politics ',
 'аттестат 1 - high school diploma ',
 'баловать/избаловать (кого?) 4 - to indulge, to pamper ',
 'бедность 4 - poverty ',
 'бездельничать 2 - to loaf, not do anything ',
 'бездомный 5 - homeless ',
 'безопасность 4 - security ',
 'безработица 2 - unemployment, joblessness ',
 'безработный (п.) 2 - unemployed (person) ',
 'безразличный 6 - indifferent ',
 'безусловно 1 - unconditionally; for sure ',
 'белки, sing. белlоlк 7 - proteins ',
 'бельэтаж 3 - dress circle; first tier ',
 'бензин 9 - ga

In [11]:
# Clean the full intermediate-to-advanced list: replace page numbers (and any
# '|') with a space, then split each entry on the first hyphen into a Russian
# half and an English half (this file uses '-' rather than an en dash).
# The pattern is a raw string so '\s', '\d' and '\|' are not treated as
# (invalid) string escapes.
ita_sub = [re.sub(r'(\s\d+,|\s\d+\s|\|)', ' ', x) for x in ita_txt_low]
df_ita = pd.DataFrame(ita_sub, columns=['Entry'])
# `n` is passed by keyword: recent pandas releases accept only `pat`
# positionally in Series.str.split.
# NOTE(review): the split happens at the first '-' anywhere in the entry, so a
# headword that itself contains a hyphen would be cut inside the word —
# TODO check the data for such entries.
df_ita = df_ita.join(
    df_ita['Entry']
    .str.split('-', n=1, expand=True)
    .rename(columns={0: 'Russian', 1: 'English'})
)
# Keep only the two halves; a line without a hyphen has no English half.
df_ita = df_ita[['Russian', 'English']]
df_ita

Unnamed: 0,Russian,English
0,абсолютно,absolutely
1,аварийное состояние,unsafe condition
2,акварель f,watercolor
3,аккуратный,"neat, tidy, well organized; orderly"
4,актуально,timely
...,...,...
1200,ядерное оружие,nuclear weapons
1201,язва,ulcer
1202,яма,pothole
1203,яркий,bright; яркие огни - bright lights


In [12]:
# Tag every row of the intermediate-to-advanced list with its level label.
# .assign builds a new frame instead of writing the column into the existing
# object, so the frame displayed by the previous cell is not altered.
df_ita = df_ita.assign(Level='Adv')
df_ita

Unnamed: 0,Russian,English,Level
0,абсолютно,absolutely,Adv
1,аварийное состояние,unsafe condition,Adv
2,акварель f,watercolor,Adv
3,аккуратный,"neat, tidy, well organized; orderly",Adv
4,актуально,timely,Adv
...,...,...,...
1200,ядерное оружие,nuclear weapons,Adv
1201,язва,ulcer,Adv
1202,яма,pothole,Adv
1203,яркий,bright; яркие огни - bright lights,Adv


In [13]:
# Stack the two levels into one vocabulary table ordered by the Russian entry.
# Row labels are carried over unchanged from the two source frames.
df_vocab = pd.concat([df_nti, df_ita]).sort_values('Russian')
df_vocab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2122 entries, 685 to 1204
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Russian  2122 non-null   object
 1   English  2075 non-null   object
 2   Level    2122 non-null   object
dtypes: object(3)
memory usage: 66.3+ KB


In [14]:
# Discard every row that has a missing value in any column.
df_vocab = df_vocab.dropna(axis=0, how='any')
df_vocab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2075 entries, 476 to 1204
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Russian  2075 non-null   object
 1   English  2075 non-null   object
 2   Level    2075 non-null   object
dtypes: object(3)
memory usage: 64.8+ KB


In [15]:
# Spot-check 30 random rows. The fixed random_state makes the same rows come
# back on every run, so the sample shown here is reproducible.
df_vocab.sample(n=30, random_state=42)

Unnamed: 0,Russian,English,Level
303,звонlоlк,"bell, school bell",Adv
330,инвестиция,investment,Adv
7,альпинизм,mountain climbing,Int
574,"прекра́сный, -ая, -ое, -ые",great adj.,Int
871,разделение труда,divisioп oflabor,Adv
918,рецензия на (что?),review,Adv
353,иудейский,"]ewish,]udaic",Adv
243,естественнонаучный,пatural scieпces (adj.j,Adv
493,независимость,independence,Adv
1035,средства,теапs,Adv


There are so many other considerations that have to be made for pulling some of these items out. I have to think more about what needs to be removed, because there is a LOT of grammatical information coming in tandem with some of these items. For example, the preposition "без" is given together with the case it governs (genitive). The verbs of motion "бежать ~ бегать" have present and past conjugations and imperatives along with the definition. Splitting on the newline character to create a list was a good first step, but I might have to go low-tech and try to figure out which additional info needs to be removed... frustrating.

In [16]:
# I'm running into more trouble here: loading the total Russian word list with
# the default read_csv settings currently fails with
# "ParserError: ... Expected 1 fields in line 6, saw 2", i.e. the parser
# expected a single field per row but found two on line 6 of the file.
# NOTE(review): presumably the file uses a different delimiter, starts with
# preamble lines, or has unquoted commas inside entries — inspect the raw file
# and set sep= / skiprows= / header= accordingly. TODO confirm.
russ_words = pd.read_csv("./textbook_vocab_data/russian-word-list-total.csv")

ParserError: Error tokenizing data. C error: Expected 1 fields in line 6, saw 2
