In [None]:
import warnings
warnings.filterwarnings("ignore")

import csv
import os
import urllib
import urllib.parse

import matplotlib.pyplot as plt
import pandas as pd
import requests

In [None]:
from TextWorker import *

In [None]:
# Fields in the dump are separated by the four-character sequence
# quote-comma-space-quote; the backslashes only escape the quotes for the regex.
separator = r'\', \''
frame_columns_names = ["Тема обращения","Суть обращения",
                        "Чиновник","Должность",
                        "Дата подачи заявления"]
# A separator longer than one character is interpreted as a regular expression,
# which only the Python parsing engine supports, so that engine is requested
# explicitly instead of relying on the automatic fallback.
# NOTE(review): skiprows=1 drops the first line and, with names=None, the next
# line is consumed as the header (and replaced below) - confirm the file really
# has two non-data lines at the top.
df = pd.read_csv('Mosru_dump_all_pages.csv',
                    encoding = 'utf-8',
                    sep = separator,
                    engine = 'python',
                    names = None,
                    skiprows = 1)
df.columns = frame_columns_names

In [None]:
# Pass every value of every named column through clean_string
# (presumably provided by the TextWorker star import - verify).
for column in frame_columns_names:
    df[column] = df[column].apply(clean_string)

In [None]:
# Build the "Адрес" column by applying search_address to the topic text.
# This has to run before the topic column is overwritten by search_theme.
df["Адрес"] = df['Тема обращения'].apply(search_address)

In [None]:
# Replace the topic column in place with the result of search_theme.
# Not idempotent: re-running the cell applies search_theme to its own output.
df["Тема обращения"] = df['Тема обращения'].apply(search_theme)

In [None]:
# Store the output of preprocessingText for each appeal body in a new column;
# the topic-modelling cells later read this column.
df['Обращение нормализованное'] = df['Суть обращения'].apply(preprocessingText)

In [None]:
# Overwrite the submission-date column with the result of prepare_date.
df["Дата подачи заявления"] = df['Дата подачи заявления'].apply(prepare_date)

In [None]:
# Persist the cleaned frame as a '|'-separated file; later sections reload it.
# The row index is written too (no index=False), so it comes back as an extra
# unnamed column when the file is read.
df.to_csv('clean_frame.csv',sep='|',encoding='utf-8')

Вторая часть: после загрузки и обработки данных подтягиваем адреса с помощью Mapbox API

In [None]:
# Reload the cleaned frame from disk so this section can start without re-running the cleaning cells.
n_df = pd.read_csv('clean_frame.csv', sep='|', encoding='utf-8')

In [None]:
# Distinct address values as a plain list, so each one is geocoded only once.
# unique() keeps a missing value (NaN) as one of the entries if the column has any.
unique_addresses = n_df['Адрес'].unique().tolist()

In [None]:
# Take the Mapbox access token from the MAPBOX_TOKEN environment variable so the
# real secret never has to be saved inside the notebook; the original
# placeholder string remains the fallback when the variable is not set.
mapbox_token = os.environ.get("MAPBOX_TOKEN", "YOUR TOKEN")

In [None]:
def search_district(address):
    """Look an address up through the Mapbox forward-geocoding endpoint.

    The text sent is "Москва " followed by ``str(address)``. The module-level
    ``mapbox_token`` is used as the access token.

    Returns:
        The ``place_name`` of the first feature in the JSON response, or an
        empty string when the request fails or the response has no usable
        first feature. Failures are printed, not raised.
    """
    # safe="" also percent-encodes '/', which would otherwise split the URL path.
    query = urllib.parse.quote("Москва " + str(address), safe="")
    url = "https://api.mapbox.com/geocoding/v5/mapbox.places/{0}.json".format(query)
    district = ""
    try:
        # The timeout keeps one stalled connection from blocking a long batch forever.
        response = requests.get(url, params={"access_token": mapbox_token}, timeout=10)
        district = response.json()['features'][0]['place_name']
    except requests.RequestException as error:
        # Only the exception type is printed here: the text of a requests error
        # may include the request URL, and with it the access token.
        print("Возникла ошибка-{0}, адрес - {1}".format(type(error).__name__,str(address)) )
    except Exception as error:
        print("Возникла ошибка-{0}, адрес - {1}".format(str(error),str(address)) )
    # A plain return (rather than one inside `finally`) lets KeyboardInterrupt
    # and other non-Exception errors propagate instead of being discarded.
    return district

In [None]:
# Geocode every unique address once and store "address|response" rows on disk.
# csv.writer quotes any field that contains the '|' delimiter, a quote character
# or a line break, so such values cannot corrupt the row structure; plain values
# are written exactly as before.
with open('unique_addresses_frame.csv','w',encoding='utf-8',newline='') as file:
    writer = csv.writer(file, delimiter='|', lineterminator='\n')
    writer.writerow(['Адрес','Полный ответ'])
    for address in tqdm(unique_addresses):
        full_response = search_district(address)
        writer.writerow([address, full_response])

Объединяем геокодированные адреса с «чистым фреймом»

In [None]:
# Load the address -> geocoder response table written by the geocoding loop.
addresses_frame = pd.read_csv('unique_addresses_frame.csv',encoding='utf-8',sep='|')

In [None]:
# Second read of the cleaned appeals file (the same file n_df was loaded from).
clean_frame = pd.read_csv('clean_frame.csv', encoding='utf-8',sep='|')

In [None]:
# Left join on "Адрес": every appeal is kept, and appeals whose address has no
# match in addresses_frame get a missing "Полный ответ".
res_frame = pd.merge(clean_frame, addresses_frame,on="Адрес",how='left')

In [None]:
# Feed each stored geocoder response back into search_district and keep the
# result as the district ("Округ").
# NOTE(review): search_district issues one HTTP request per call, so this cell
# makes a request for every row (duplicates included) - confirm this second
# geocoding pass is intended.
res_frame['Округ'] = res_frame['Полный ответ'].apply(search_district)

In [None]:
# Flag rows whose district text contains a comma: 1 when it does, 0 otherwise.
res_frame['Проверка округа'] = res_frame['Округ'].map(
    lambda district_text: int(',' in str(district_text))
)

In [None]:
# Keep only the rows whose district text has no comma (flag == 0).
# .copy() makes this an independent frame: later cells add columns to it, and
# doing that on a filtered selection of res_frame is ambiguous for pandas.
res_adr_frame = res_frame[res_frame['Проверка округа']==0].copy()

Тематическое моделирование

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim
import nltk

In [None]:
# Reload the cleaned appeals for topic modelling.
# Note: this rebinds the name `df` that the cleaning section used for the raw frame.
df = pd.read_csv('clean_frame.csv',encoding='utf-8',sep='|')

In [None]:
# Normalised appeal texts as a plain Python list, one entry per row.
data = df['Обращение нормализованное'].values.tolist()

In [None]:
# Tokenise every document with NLTK. str() guards against non-string entries;
# a missing value therefore turns into the literal text "nan" before tokenising.
data_words = [nltk.word_tokenize(str(line)) for line in data]

In [None]:
# Token -> integer id mapping built from all tokenised documents.
id2word = corpora.Dictionary(data_words)
# `texts` is another name for the same token lists, not a copy.
texts = data_words
# Bag-of-words encoding of every document.
corpus = list(map(id2word.doc2bow, texts))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """Train one LDA model per topic count and score each with c_v coherence.

    Args:
        dictionary: gensim dictionary used both for training and for scoring.
        corpus: bag-of-words corpus the models are trained on.
        texts: tokenised documents handed to the coherence model.
        limit: exclusive upper bound of the topic counts to try.
        start: first topic count to try.
        step: increment between consecutive topic counts.
        random_state: optional seed forwarded to every LDA model; the default
            None leaves the models unseeded, as before.

    Returns:
        (model_list, coherence_values): the trained models and their coherence
        scores, both in the order of range(start, limit, step).
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Train with the `dictionary` argument; the previous code silently used
        # the notebook-global `id2word` here and ignored the parameter.
        model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
# Scan topic counts 2, 8, 14, ... (below 40) and keep the models with their scores.
# One LDA model is trained per topic count inside compute_coherence_values.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words, start=2, limit=40, step=6)

In [None]:
# Show graph: coherence score against the number of topics.
# These three values must match the ones passed to compute_coherence_values,
# otherwise the x axis no longer lines up with coherence_values.
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence: ("coherence_values") is just a parenthesised
# string, which legend() iterates character by character, labelling the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
num_topics = 3 # number of topics
# A fixed random_state seeds the model so a re-run starts from the same state.
lda_model = gensim.models.LdaMulticore(corpus = corpus, id2word=id2word, num_topics = num_topics, random_state = 42)

In [None]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

In [None]:
# Build the pyLDAvis data for the trained model; the bare name on the last line
# makes the notebook display it.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

Работа с картой и итоговые графики

In [None]:
import geopandas as gpd
from MapWorker import *

In [None]:
# Load the GeoJSON file with geopandas.
# Note: `df` is rebound once more - from here on it is this geo frame, not the appeals table.
df = gpd.read_file("mo.geojson")

In [None]:
# Replace 'ё' with 'е' in the names. The vectorised string accessor leaves
# missing names as missing, where calling .replace on each value would raise.
df['NAME'] = df['NAME'].str.replace('ё', 'е', regex=False)

In [None]:
# First letter upper-case, the rest lower-case, so names compare equal to the
# capitalised district names of the appeals; missing names stay missing.
df['NAME'] = df['NAME'].str.capitalize()

In [None]:
# Add the join key NAME (capitalised district text). assign() returns a new
# frame, which avoids writing a column into a filtered selection of res_frame.
res_adr_frame = res_adr_frame.assign(NAME=res_adr_frame['Округ'].str.capitalize())

In [None]:
def create_statistic_df(df,column_for_metric,headers):
    """Count how often each value occurs in one column.

    Args:
        df: frame to take the column from.
        column_for_metric: name of the column whose values are counted.
        headers: two column names for the result - value first, count second.

    Returns:
        A frame with a fresh 0..n-1 index and one row per distinct value, in the
        order produced by ``value_counts`` (most frequent first).
    """
    occurrences = df[column_for_metric].value_counts()
    # reset_index turns the counted values from the index into a regular column.
    statistic = occurrences.to_frame().reset_index()
    statistic.columns = headers
    return statistic

In [None]:
# Number of appeals per district name.
df_districts_counts = create_statistic_df(res_adr_frame,'NAME',['NAME','COUNT'])

In [None]:
# Horizontal bar chart of appeals per district; the tall figure leaves room for one bar per district.
df_districts_counts.plot(x='NAME', y='COUNT', kind='barh',figsize=(15,30))

In [None]:
# Attach the counts to the geo frame by NAME. The left join keeps every row of
# the geo frame; rows without a matching count get a missing COUNT.
districts_statistic = pd.merge(df,df_districts_counts, on='NAME', how='left')

In [None]:
# Rows that had no match in the join are treated as having zero appeals.
districts_statistic['COUNT'] = districts_statistic['COUNT'].fillna(0)

In [None]:
# GeoMap presumably comes from the MapWorker star import; the two numbers look
# like a latitude/longitude for the initial view - TODO confirm against MapWorker.
map_obj = GeoMap(55.75,37.61)

In [None]:
# Draw the choropleth layer. The same frame is passed for the first two
# arguments (presumably geometry source and data source - verify in MapWorker),
# followed by the key/value columns, the GeoJSON property to match on and the
# legend caption.
map_obj.create_coropleth(districts_statistic,
                        districts_statistic,
                        ['NAME','COUNT'],
                        'feature.properties.NAME',
                        'Количество обращений по округам')

In [None]:
# Add per-district information to the map: the second argument lists the
# columns to show, the third their display captions (semantics defined by
# MapWorker - verify there).
map_obj.add_info_on_map(districts_statistic,
                        ['NAME','COUNT'],
                        ['Название округа','Количество обращений'])

In [None]:
# Bare expression as the last line of the cell: the notebook renders the map object.
map_obj.geo_map

In [None]:
def _year_of(date_value):
    """Return the third '.'-separated part of the text before the first space, or None."""
    date_part = str(date_value).split(' ')[0]
    pieces = date_part.split('.')
    # Values without three dot-separated parts (for example a missing date,
    # which str() turns into "nan") yield None instead of raising IndexError.
    return pieces[2] if len(pieces) > 2 else None

# Presumably the dates look like "dd.mm.yyyy ..." so the third part is the year - verify.
# assign() returns a new frame instead of writing into a filtered selection.
res_adr_frame = res_adr_frame.assign(
    **{'Год': res_adr_frame['Дата подачи заявления'].apply(_year_of)}
)

In [None]:
# Number of appeals per year value.
years = create_statistic_df(res_adr_frame,'Год',['Год','Количество обращений'])

In [None]:
# Bar chart of appeals per year; bars follow the row order of `years`
# (most frequent first), not chronological order.
years.plot(x='Год', y='Количество обращений', kind='bar')

In [None]:
# Number of appeals per topic.
applications = create_statistic_df(res_adr_frame,'Тема обращения',['Обращение','Количество'])

In [None]:
# The first ten rows are the ten most frequent topics, because
# create_statistic_df returns rows in value_counts order.
applications[:10].plot(x='Обращение',y='Количество', kind='barh')

In [None]:
# Number of appeals per official.
chin = create_statistic_df(res_adr_frame, 'Чиновник',['Чиновник','Количество обращений'])

In [None]:
# Ten officials with the most appeals (rows are already in value_counts order).
chin[:10].plot(x='Чиновник',y='Количество обращений',kind='barh')