In [None]:
# standard library
import csv
import os
import re
import urllib
import urllib.parse
import warnings
warnings.filterwarnings("ignore")  # kept ahead of the third-party imports, as in the original order
# third-party
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import pymorphy2
import requests
import spacy
from tqdm import tqdm
%matplotlib inline
#nltk.download('stopwords')
#nltk.download('punkt')

In [None]:
# Fields in the dump are delimited by the literal three-character sequence "', '".
separator = r'\', \''
# Column names: appeal subject, appeal body, official, position, submission date.
frame_columns_names = ["Тема обращения","Суть обращения",
                        "Чиновник","Должность",
                        "Дата подачи заявления"]
# A multi-character separator is interpreted as a regular expression, which only
# the Python parsing engine supports; request it explicitly instead of relying on
# a silent fallback (the accompanying warning is suppressed in this notebook).
# NOTE(review): with skiprows=1 and no explicit header/names, the first remaining
# line is consumed as the header and then overwritten below — confirm that losing
# that line is intended.
df = pd.read_csv('Mosru_dump_all_pages.csv',
                    encoding = 'utf-8',
                    sep = separator,
                    engine = 'python',
                    names = None,
                    skiprows = 1)
df.columns = frame_columns_names

In [None]:
# Russian stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('russian')
# Shared pymorphy2 analyzer instance; preprocessingText uses it to get normal forms.
morph = pymorphy2.MorphAnalyzer()

In [None]:
def clean_string(string):
    """Strip scraping artifacts from a raw cell value.

    Each pattern below is removed in order: apostrophes, literal "\\n"
    sequences and real newlines, backslash/slash leftovers, parenthesized
    text (greedy: from the first "(" to the last ")"), "-<digits>" suffixes
    and a few address abbreviations.

    Parameters
    ----------
    string : str
        Raw cell value. Non-string values (e.g. NaN or None for a missing
        cell) are returned unchanged instead of raising TypeError in re.sub.

    Returns
    -------
    str
        The cleaned string (or the original object when it is not a str).
    """
    if not isinstance(string, str):
        return string
    dirt_patterns = (r'\'', r'\\n', r'\n', r'Б\\\\/Н', r'\\\\/\d+', r'\\\\/',
                     r'\\{1,}', r'\(.*\)', r'c\.$', r'ул\.$', r'-\d+',
                     r'пер\.$', r'д\.')
    # Order matters: the "$"-anchored patterns see the string as left by the
    # earlier substitutions.
    for pattern in dirt_patterns:
        string = re.sub(pattern, '', string)
    return string

In [None]:
def search_theme(string):
    """Return the «…»-quoted part of a string, or "" when there is none.

    The match is greedy, so it runs from the first « to the last » found on
    the same line.
    """
    quoted = re.search(r'«.*»', string)
    if quoted is None:
        return ""
    return quoted.group(0)

In [None]:
def search_address(string):
    """Return everything that follows "по адресу" up to the end of the string.

    At most one whitespace character after the phrase is skipped. An empty
    string is returned when the phrase is absent.
    """
    located = re.search(r'по адресу\s?(.*$)', string)
    if located is None:
        return ""
    return located.group(1)

In [None]:
def preprocessingText(line):
    """Turn raw text into a space-separated string of normalized tokens.

    Steps: lower-case, drop digit runs, replace every character that is
    neither a word character nor whitespace with a space, tokenize with NLTK,
    take the normal form of the first parse from the module-level `morph`
    analyzer, and discard stop words and tokens of length one.
    """
    lowered = line.lower()
    no_digits = re.sub(r'\d+', '', lowered, flags=re.UNICODE)
    no_punct = re.sub(r'[^\w\s]', ' ', no_digits, flags=re.UNICODE)
    kept = []
    for token in nltk.word_tokenize(no_punct):
        # single-character tokens are dropped before normalization
        if len(token) <= 1:
            continue
        normal_form = morph.parse(token)[0].normal_form
        # ...and again after it, together with stop words
        if normal_form in stop_words or len(normal_form) <= 1:
            continue
        kept.append(normal_form)
    return ' '.join(kept)

In [None]:
def prepare_date(date_string):
    """Convert a date like "1 Января 2020 в 12:30" to "1.01.2020 12:30:00".

    The value is split on single spaces; items 0, 1, 2 and 4 are read as day,
    month name (genitive, capitalized), year and time. The month name is
    mapped to its two-digit number and ":00" is appended to the time.

    Parameters
    ----------
    date_string : object
        Raw date value. It is converted with str() before stripping, so
        non-string cells (numbers, NaN, None) no longer raise AttributeError.

    Returns
    -------
    str
        The reformatted date, or the whitespace-normalized input when it does
        not have the expected layout (too few parts or an unknown month name).
    """
    months = {'Января':'01','Февраля':'02','Марта':'03','Апреля':'04',
                'Мая':'05','Июня':'06','Июля':'07','Августа':'08',
                'Сентября':'09','Октября':'10','Ноября':'11','Декабря':'12'}
    # str() must wrap the value itself, not the result of .strip()
    date_string = re.sub(r'\s{2,}', ' ', str(date_string).strip())
    parts = date_string.split(' ')
    try:
        day, month_name, year, time = parts[0], parts[1], parts[2], parts[4]
        return "{0}.{1}.{2} {3}:00".format(day, months[month_name], year, time)
    except (IndexError, KeyError):
        # Unexpected layout: keep the value as is rather than guessing.
        return date_string

In [None]:
# Run clean_string over every value of each named column.
for column_name in frame_columns_names:
    cleaned_column = df[column_name].map(clean_string)
    df[column_name] = cleaned_column

In [None]:
# New column: the address part extracted from the appeal subject.
df["Адрес"] = df['Тема обращения'].map(search_address)

In [None]:
# New column: the «…»-quoted part extracted from the appeal subject.
df["Тема обращения2"] = df['Тема обращения'].map(search_theme)

In [None]:
# New column: the appeal body after preprocessingText normalization.
df['Обращение нормализованное'] = df['Суть обращения'].map(preprocessingText)

In [None]:
# Rewrite the submission date column using prepare_date.
df["Дата подачи заявления"] = df['Дата подачи заявления'].map(prepare_date)

In [None]:
# Dump the processed frame to 'test.csv' with '|' as the separator.
# NOTE(review): the same frame is also written to 'clean_frame.csv' further down — this looks like a scratch copy; confirm it is still needed.
df.to_csv('test.csv', encoding='utf-8', sep='|')

In [None]:
# Preview the first two processed rows.
df.head(2)

In [None]:
# Persist the processed frame; the later sections of the notebook reload this file.
# The row index is written as the first column because index=False is not passed.
df.to_csv('clean_frame.csv',sep='|',encoding='utf-8')

Вторая часть (после загрузки и обработки данных): подтягиваем адреса с помощью Mapbox API

In [None]:
# Reload the processed frame saved earlier in the notebook.
n_df = pd.read_csv('clean_frame.csv', sep='|', encoding='utf-8')

In [None]:
# Distinct address values, so that each one is geocoded only once.
unique_addresses = pd.unique(n_df['Адрес']).tolist()

In [None]:
# The Mapbox access token is read from the MAPBOX_ACCESS_TOKEN environment
# variable so that the credential is not stored in the notebook.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# leaked and rotate it in the Mapbox account.
mapbox_token = os.environ.get("MAPBOX_ACCESS_TOKEN", "")
if not mapbox_token:
    print("MAPBOX_ACCESS_TOKEN is not set; geocoding requests will be rejected")

In [None]:
def search_district(address, timeout=10):
    """Geocode an address with the Mapbox places endpoint.

    "Москва " is prepended to the address before it is sent.

    Parameters
    ----------
    address : object
        Address text; converted with str().
    timeout : float, optional
        Seconds to wait for the HTTP response, so that one stalled request
        cannot hang the whole geocoding loop.

    Returns
    -------
    str
        'place_name' of the first returned feature, or "" when the request
        fails or returns no features (the error is printed, not raised).
    """
    # safe='' percent-encodes "/" as well; otherwise a "/" inside an address
    # would be read as a URL path separator.
    query = urllib.parse.quote("Москва " + str(address), safe='')
    url = "https://api.mapbox.com/geocoding/v5/mapbox.places/{0}.json".format(query)
    district = ""
    try:
        response = requests.get(url, params={"access_token": mapbox_token}, timeout=timeout)
        # Surface HTTP errors (bad token, rate limit) as such instead of a
        # confusing KeyError on the missing 'features' key.
        response.raise_for_status()
        district = response.json()['features'][0]['place_name']
    except Exception as error:
        # Best effort: report the failure and fall through to the empty result.
        print("Возникла ошибка-{0}, адрес - {1}".format(str(error),str(address)) )
    return district

In [None]:
# Geocode every unique address and store "address | response" pairs.
# csv.writer quotes values that contain the delimiter, a quote character or a
# line break, so such values can no longer corrupt the file layout.
with open('unique_addresses_frame.csv','w',encoding='utf-8',newline='') as file:
    writer = csv.writer(file, delimiter='|', lineterminator='\n')
    writer.writerow(['Адрес', 'Полный ответ'])
    for address in tqdm(unique_addresses):
        full_response = search_district(address)
        writer.writerow([address, full_response])

Объединяем найденные адреса с «чистым фреймом»

In [None]:
# Load the address -> geocoder response table written by the geocoding loop.
addresses_frame = pd.read_csv('unique_addresses_frame.csv',encoding='utf-8',sep='|')

In [None]:
# Reload the processed appeals frame.
clean_frame = pd.read_csv('clean_frame.csv', encoding='utf-8',sep='|')

In [None]:
# Attach the geocoder response to every appeal by its address.
# Both frames name the key column 'Адрес'; a column called 'address' does not
# exist in clean_frame, so joining on it raised KeyError.
res_frame = clean_frame.merge(addresses_frame, on='Адрес', how='left')

In [None]:
# Left join: keep every appeal and add the geocoder response for its address.
res_frame = clean_frame.merge(addresses_frame, how='left', on="Адрес")

In [None]:
# Preview the merged frame.
res_frame.head()

In [None]:
# Number of rows per official. The frame has no 'theme' column (that raised
# KeyError); the appeal subject column is counted instead.
# NOTE(review): assumes 'Тема обращения' is the column meant by 'theme' — confirm.
pivot_frame = pd.pivot_table(res_frame, index=['Чиновник'], values=['Тема обращения'], aggfunc=[len])

In [None]:
def search_district(full_address):
    """Extract the text between "округ" and the next comma of a geocoder reply.

    NOTE: this reuses the name of the geocoding function defined earlier in
    the notebook and shadows it once this cell has run.

    Parameters
    ----------
    full_address : object
        Geocoder response text. Non-string values (e.g. NaN for an address
        with no response) yield "" instead of raising TypeError in the regex.

    Returns
    -------
    str
        The captured text, the unchanged input when "округ …," is not found,
        or "" for non-string input.
    """
    if not isinstance(full_address, str):
        return ""
    found = re.search(r'округ\s+?(.+?),', full_address)
    return found.group(1) if found else full_address

In [None]:
# Work on an independent copy: plain assignment only creates a second name for
# the same frame, so columns added to res_frame2 would also appear in res_frame.
res_frame2 = res_frame.copy()

In [None]:
# New column: the district parsed out of the geocoder response.
res_frame2['Округ'] = res_frame2['Полный ответ'].map(search_district)

In [None]:
# Distinct values of the district column.
districts = pd.unique(res_frame2['Округ']).tolist()

In [None]:
# Inspect a slice (items 50-99) of the extracted district values.
districts[50:100]

Тематическое моделирование

In [None]:
# Reload the processed appeals frame for topic modelling (rebinds the name df).
df = pd.read_csv('clean_frame.csv',encoding='utf-8',sep='|')

In [None]:
# Normalized appeal texts. The frame has no 'Obr_clean' column (that raised
# KeyError); the normalized text is stored under 'Обращение нормализованное'.
# Missing values are dropped so that they do not end up in the corpus as the
# literal token "nan" after str() conversion.
data = df['Обращение нормализованное'].dropna().tolist()

In [None]:
# Tokenize every document; str() guards against non-string entries.
data_words = [nltk.word_tokenize(str(line)) for line in tqdm(data)]

In [None]:
# Tokenized documents, kept under a second name.
texts = data_words
# Token <-> integer id mapping built from all documents.
id2word = corpora.Dictionary(texts)
# Bag-of-words representation of each document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# number of topics
num_topics = 10
# A fixed random_state seeds the model so that re-running the notebook does not
# start from a different random initialization each time.
lda_model = gensim.models.LdaMulticore(corpus = corpus, id2word=id2word, num_topics = num_topics, random_state = 42)

In [None]:
# Let pyLDAvis render its visualization inline in the notebook.
pyLDAvis.enable_notebook()

In [None]:
# Build the pyLDAvis data for the trained model and display it as the cell output.
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models — confirm against the installed version.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared