In [None]:
import requests
import re
from bs4 import BeautifulSoup
import datetime
from bs4 import SoupStrainer
import pandas as pd
import time
from datetime import date
import csv
from googletrans import Translator
from textblob import TextBlob
import nltk
from textblob.sentiments import NaiveBayesAnalyzer
import os
import geopandas as gpd
from pyproj import CRS

In [None]:
# Region identifiers (used as kp.ru sub-domains below) are loaded from a comma-separated file.
with open("D:/RW/all_regions.txt", "r") as f:
    # Strip surrounding whitespace and drop empty entries so that a trailing newline
    # or comma in the file cannot produce an invalid host name such as "msk\n".
    region_url = [name.strip() for name in f.read().split(",") if name.strip()]
print(region_url)
len(region_url)

# 1 этап. Сбор ссылок на оперативные новости

In [None]:
def create_folder(folder_path):
    """Create (if missing) a sub-folder named after today's date and return its path.

    Args:
        folder_path: Directory inside which the dated sub-folder is created.

    Returns:
        The path ``<folder_path>/<DD_MM_YYYY>`` for the current date.
    """
    dated_dir = os.path.join(folder_path, datetime.date.today().strftime("%d_%m_%Y"))
    os.makedirs(dated_dir, exist_ok=True)
    return dated_dir

def get_news_links(url):
    """Collect the distinct news links found on the given page.

    Args:
        url: Address of the page to scan.

    Returns:
        Sorted list of distinct ``href`` values that start with ``/online/news/``
        followed by a digit.

    Raises:
        requests.RequestException: If the page cannot be fetched (including a timeout).
    """
    # A timeout keeps one unresponsive host from hanging the whole crawl.
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.text, 'lxml')
    news_pattern = re.compile(r"\A/online/news/\d")
    # href=True skips anchors that have no href attribute; their value would be None
    # and make the regex search raise TypeError.
    hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
    # Sorting the de-duplicated set makes the saved link order reproducible.
    return sorted({href for href in hrefs if news_pattern.search(href)})

def save_news_links(folder_path, region, all_news):
    """Append the collected links for one region to its CSV file.

    Args:
        folder_path: Folder that holds the per-region link files.
        region: Region identifier used in the file name.
        all_news: List of link strings to store.

    The file ``new_results_only_news_<region>.csv`` has the columns ``link`` and
    ``datetime`` (the current date formatted as ``"%B %d, %Y"``).
    """
    now_datetime = datetime.datetime.now().strftime("%B %d, %Y")
    news_df = pd.DataFrame({'link': all_news, 'datetime': [now_datetime] * len(all_news)})
    file_path = os.path.join(folder_path, f"new_results_only_news_{region}.csv")
    # Write the header only when starting a new file: in append mode a repeated header
    # would later be read back as a data row whose "link" value is the text "link".
    write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    news_df.to_csv(file_path, mode='a', index=False, header=write_header, encoding='utf-8')

def main(base_path, region_url):
    """Stage 1 driver: collect and store the news links of every region.

    Args:
        base_path: Folder in which today's dated sub-folder is created.
        region_url: Iterable of region identifiers used as kp.ru sub-domains.
    """
    dated_folder = create_folder(base_path)
    for region in region_url:
        links = get_news_links(f"https://www.{region}.kp.ru/online")
        save_news_links(dated_folder, region, links)
        # Pause between regions.
        time.sleep(15)

if __name__ == "__main__":
    # region_url is the list loaded from all_regions.txt in an earlier cell.
    base_path = "D:/RW/data_diplom/new_ssilki/"
    main(base_path, region_url)

# 2 и 3 этапы. Сбор текстовых материалов, тегов, перевод текста и проведение анализа тональности

In [None]:
def create_folder_text(base_path, folder_name, month='mar'):
    """Create the output folders for article texts and for sentiment results.

    Args:
        base_path: Root data folder.
        folder_name: Name of the dated sub-folder.
        month: Month sub-folder placed between the category and the dated folder.
            Defaults to ``'mar'``, the value that was previously hard-coded.

    Returns:
        Tuple ``(text_folder, ton_folder)`` with the paths
        ``<base_path>/texts/<month>/<folder_name>`` and
        ``<base_path>/ton/<month>/<folder_name>``; both are created if missing.
    """
    text_folder = os.path.join(base_path, 'texts', month, folder_name)
    ton_folder = os.path.join(base_path, 'ton', month, folder_name)
    for path in (text_folder, ton_folder):
        os.makedirs(path, exist_ok=True)
    return text_folder, ton_folder


def extract_text_and_tags(link, region):
    """Download one article and return its body text and its tag text.

    Args:
        link: Site-relative article path (appended to the region host).
        region: Region identifier used as the kp.ru sub-domain.

    Returns:
        Tuple ``(result, res_tags)``. ``result`` is the text of the matching ``<p>``
        elements, each followed by a newline. ``res_tags`` is the concatenated text of
        the matching ``<a>`` elements. Both are empty strings when the request fails
        or nothing matches.
    """
    urls = 'https://www.' + region + '.kp.ru' + link
    print(urls)
    result = ""
    res_tags = ""
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(urls, timeout=30)
        print(response.status_code)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "nothing downloaded";
        # programming errors and KeyboardInterrupt are no longer swallowed.
        print('Ничего не получилось')
    else:
        response.encoding = "utf-8"
        # Explicit parser, the same one get_news_links uses; avoids bs4 guessing.
        soup = BeautifulSoup(response.text, 'lxml')
        # NOTE(review): these look like generated CSS class names; confirm that they
        # still match the site's current markup.
        body_tags = soup.find_all("p", class_="sc-1wayp1z-16 dqbiXu")
        pattern_tags = soup.find_all("a", class_='sc-1vxg2pp-0 cXMtmu')
        result = "".join(paragraph.text + "\n" for paragraph in body_tags)
        # NOTE(review): tag texts are joined without a separator (unchanged behavior);
        # confirm that articles carry a single tag.
        res_tags = "".join(tag.text for tag in pattern_tags)
    return result, res_tags


def translation_text(ru_text):
    """Translate Russian text to English with googletrans.

    Args:
        ru_text: Source text in Russian.

    Returns:
        The translated text as ``str``. An empty or whitespace-only input returns
        ``""`` without contacting the translation service.
    """
    # extract_text_and_tags returns "" when a download fails; there is nothing to
    # translate in that case, so skip the network call.
    if not ru_text or not ru_text.strip():
        return ""
    translator = Translator()
    translated = translator.translate(ru_text, src='ru', dest='en')
    return str(translated.text)

def save_texts(text_folder, result, region, en_text, res_tags, delay=8):
    """Append one article (Russian text, English text, tags) to the region's CSV file.

    Args:
        text_folder: Folder that holds the per-region text files.
        result: Article text in Russian.
        region: Region identifier used in the file name.
        en_text: Article text translated to English.
        res_tags: Tag text of the article.
        delay: Seconds to sleep after writing. Defaults to 8, the value that was
            previously hard-coded.
    """
    article_df = pd.DataFrame({'ru_text': [result], 'en_text': [en_text], 'tags': [res_tags]})
    file_path = os.path.join(text_folder, f"new_results_texts_{region}.csv")
    # The header is deliberately written on every append: sent_analysis reads this file
    # and relies on header rows alternating with data rows.
    article_df.to_csv(file_path, mode='a', index=False, encoding='utf-8')
    print('Скачалось')
    time.sleep(delay)

                         
def sent_analysis(ton_folder, text_folder, region, folder_name):
    """Score the sentiment of a region's translated texts and append the results to CSV.

    Reads ``new_results_texts_<region>.csv`` from ``text_folder``, classifies every
    ``en_text`` with TextBlob's NaiveBayesAnalyzer and appends the rows, without a
    header, to ``results_ton_texts_<region>.csv`` in ``ton_folder``. The output columns
    are the input columns followed by ``ton`` (the full sentiment tuple),
    ``ton_kachestvo`` (classification), ``pol`` (positive probability) and ``neg``
    (negative probability).

    Args:
        ton_folder: Folder for the sentiment result files.
        text_folder: Folder with the per-region text files.
        region: Region identifier used in both file names.
        folder_name: Unused; kept so existing callers keep working.

    Any exception is reported with a printed message instead of being raised.
    """
    texts_file = os.path.join(text_folder, f"new_results_texts_{region}.csv")
    try:
        texts_df = pd.read_csv(texts_file)
        # The texts file repeats its header before every data row. Drop those rows by
        # value rather than by position, so a file with a single header also works.
        is_header_repeat = texts_df['en_text'] == 'en_text'
        # Rows without a translation cannot be scored; skipping them keeps one failed
        # download from aborting the analysis of the whole region.
        texts_df = texts_df[~is_header_repeat].dropna(subset=['en_text']).reset_index(drop=True)

        # One shared analyzer: the classifier is trained once instead of once per row.
        analyzer = NaiveBayesAnalyzer()
        sentiments = [TextBlob(str(text), analyzer=analyzer).sentiment for text in texts_df['en_text']]

        ton_df = pd.concat([texts_df, pd.DataFrame({'ton': sentiments})], axis=1)
        ton_df['ton_kachestvo'] = [s.classification for s in sentiments]
        ton_df['pol'] = [s.p_pos for s in sentiments]
        ton_df['neg'] = [s.p_neg for s in sentiments]

        ton_file = os.path.join(ton_folder, f"results_ton_texts_{region}.csv")
        ton_df.to_csv(ton_file, mode='a', header=False, index=False, encoding="utf-8")
        print('ура, скачалось')
    except Exception as e:
        print(f"Ошибка при анализе тональности: {e}")
                         
def main(base_path, folder_name, region_url):
    """Stages 2-3 driver: download, translate and store every article, then score sentiment.

    Args:
        base_path: Root data folder; link files are read from
            ``<base_path>/new_ssilki/<folder_name>/``.
        folder_name: Name of the dated sub-folder to process.
        region_url: Iterable of region identifiers.
    """
    text_folder, ton_folder = create_folder_text(base_path, folder_name)
    for region in region_url:
        links_file = os.path.join(base_path, 'new_ssilki', folder_name, f"new_results_only_news_{region}.csv")
        links_list = pd.read_csv(links_file).link.to_list()
        for link in links_list:
            result, res_tags = extract_text_and_tags(link, region)
            en_text = translation_text(result)
            save_texts(text_folder, result, region, en_text, res_tags)
        # Run the analysis once per region, after all of its texts are saved.
        # sent_analysis re-reads the whole texts file and appends every row to the
        # results file, so calling it after each link wrote earlier articles again
        # and again and inflated the later per-tag counts and means.
        if links_list:
            sent_analysis(ton_folder, text_folder, region, folder_name)

if __name__ == '__main__':
    # region_url is the list loaded from all_regions.txt in an earlier cell.
    base_path = "D:/RW/data_diplom/"
    folder_name = datetime.date.today().strftime("%d_%m_%Y")
    main(base_path, folder_name, region_url)

# 4 этап. Приведение данных к виду, удобному для картографирования, и подсчёт новостей

In [None]:
def process_region(folder_name, tag_values, result_, tags, region):
    """Aggregate one region's sentiment results for a single tag and append them to the map file.

    Reads ``results_ton_texts_<region>.csv`` for the given dated folder, takes the
    columns at positions 2, 6 and 7 as tag, positive score and negative score,
    averages the scores per tag, keeps the row for ``tag_values`` and appends it
    (without a header) to ``results_for_map_<tags>.csv``. The written columns are
    tag, mean_pos, mean_neg, count and region.

    Args:
        folder_name: Dated sub-folder name.
        tag_values: Tag text to keep (compared for equality).
        result_: Output sub-folder under ``for_map``.
        tags: Short tag name used in the output file name.
        region: Region identifier; also written to the ``region`` column.

    Failures are reported with a printed message and the region is skipped.
    """
    try:
        # NOTE(review): read_csv treats the first line as a header here; confirm that
        # the input files really start with a header row.
        df = pd.read_csv(f"d:/rw/data_diplom/ton/dec/{folder_name}/results_ton_texts_{region}.csv")
        new_df = df.iloc[:, [2, 6, 7]].copy()
        new_df.columns = ['tag', 'pos', 'neg']

        res_pos = new_df.groupby('tag')['pos'].mean()
        res_neg = new_df.groupby('tag')['neg'].mean()

        meanneg_df = pd.DataFrame({'tag': res_neg.index, 'mean': res_neg.values})
        meanpos_df = pd.DataFrame({'tag': res_pos.index, 'mean': res_pos.values})

        obsh_df = pd.merge(meanpos_df, meanneg_df, on='tag')
        obsh_df.columns = ['tag', 'mean_pos', 'mean_neg']

        # .copy() so the column assignments below act on an independent frame.
        obsh_df = obsh_df[obsh_df['tag'].isin([tag_values])].copy()
        obsh_df['count'] = len(new_df[new_df['tag'] == tag_values])
        # A scalar is broadcast to every row. The previous itertools.cycle call was
        # never imported, so it raised NameError and the file was never written.
        obsh_df['region'] = region

        obsh_df = obsh_df.reset_index(drop=True)
        os.makedirs(f"D:/RW/data_diplom/for_map/{result_}/tags/{folder_name}", exist_ok=True)
        obsh_df.to_csv(f"D:/RW/data_diplom/for_map/{result_}/tags/{folder_name}/results_for_map_{tags}.csv", mode='a', index=False, encoding="utf-8", header=False)
    except Exception as e:
        # Still best-effort per region, but the reason is reported instead of hidden.
        print(f"Ошибка при обработке региона {region}: {e}")

if __name__ == "__main__":
    # Parameters of this run: the dated folder to read, the tag text to keep,
    # the output sub-folder under for_map, and the short tag name used in the
    # output file name.
    folder_name = '31_12_2023'
    tag_values = 'Происшествия'
    result_ = 'res_dec'
    tags = 'prois'

    # Process every region in turn; all regions append to the same per-tag file.
    for region in region_url:
        process_region(folder_name, tag_values, result_, tags, region)

In [None]:
# Root folder with the map-ready data, and the month sub-folder to aggregate.
workspace=r'D:\RW\data_diplom\for_map'
month=r'res_dec/tags'
# Short tag names.
# NOTE(review): get_mean_by_month does not read this list; confirm whether it is still needed.
tags=['ekon', 'obsh', 'pol', 'prois', 'sport']
def _read_daily_tag_file(path, columns):
    """Read one per-day tag CSV; accepts header-less files and files with repeated headers."""
    try:
        daily = pd.read_csv(path, header=None, names=columns)
    except pd.errors.EmptyDataError:
        # A file that was created but never received a row contributes nothing.
        return pd.DataFrame(columns=columns)
    # Header lines (if any) are read as data rows; drop them by value.
    return daily[daily['tag'] != 'tag']


def get_mean_by_month(workspace, month, year='2023'):
    """Aggregate the per-day tag files of one month into one CSV per tag.

    Every dated sub-folder of ``<workspace>/<month>`` whose name contains ``year`` and
    which holds exactly five files is used. The five files are taken in sorted name
    order as the tags ``ekon``, ``obsh``, ``pol``, ``prois`` and ``sport``. For each
    tag the rows of all days are grouped by ``region``: ``count`` is summed,
    ``mean_pos`` and ``mean_neg`` are averaged, and ``mean_mon`` is their difference.
    The result is written to ``<workspace>/<month>/<tag>.csv``.

    Args:
        workspace: Root folder with the map-ready data.
        month: Sub-folder (relative to ``workspace``) holding the dated folders.
        year: Text that a dated folder name must contain. Defaults to ``'2023'``,
            the value that was previously hard-coded.
    """
    columns = ['tag', 'mean_pos', 'mean_neg', 'count', 'region']
    # Alphabetical order, so position i matches the i-th file of a sorted listing.
    tag_order = ['ekon', 'obsh', 'pol', 'prois', 'sport']
    month_dir = os.path.join(workspace, month)
    # Match on the folder name only, so a year in the workspace path has no effect.
    day_folders = sorted(
        entry.path for entry in os.scandir(month_dir)
        if entry.is_dir() and year in entry.name
    )

    frames = {tag: [] for tag in tag_order}
    for folder in day_folders:
        # os.listdir returns names in arbitrary order; sort so the positional
        # file-to-tag mapping is deterministic.
        files = sorted(f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f)))
        if len(files) != len(tag_order):
            continue
        for tag, file_name in zip(tag_order, files):
            frames[tag].append(_read_daily_tag_file(os.path.join(folder, file_name), columns))

    for tag in tag_order:
        if frames[tag]:
            tag_df = pd.concat(frames[tag], ignore_index=True)
        else:
            tag_df = pd.DataFrame(columns=columns)
        # Values may have been read as text (e.g. next to header rows); coerce to numbers.
        for col in ('count', 'mean_pos', 'mean_neg'):
            tag_df[col] = pd.to_numeric(tag_df[col], errors='coerce')
        grouped = tag_df.groupby(['region']).agg({'count': 'sum', 'mean_pos': 'mean', 'mean_neg': 'mean'})
        grouped['mean_mon'] = grouped['mean_pos'] - grouped['mean_neg']
        grouped.to_csv(month_dir + f'/{tag}.csv', mode='w', index=True, encoding="utf-8")
# Aggregate the configured month and write one CSV per tag.
get_mean_by_month(workspace, month)

In [None]:
def convert_to_geojson(shp_file, csv_file, region_col, name_col, output_file):
    """Join a CSV table to a boundaries layer and save the result as GeoJSON in EPSG:4326.

    Args:
        shp_file: Already loaded boundaries layer (the object returned by
            ``gpd.read_file``), not a path.
        csv_file: Path of the CSV file to join.
        region_col: Key column in the CSV; it is dropped from the output.
        name_col: Key column in the boundaries layer.
        output_file: Path of the GeoJSON file to write.

    The join is a left join, so every boundary is kept even without a CSV match.
    """
    csv_data = pd.read_csv(csv_file)
    merged = shp_file.merge(csv_data, left_on=name_col, right_on=region_col, how='left')
    merged = merged.drop([region_col], axis=1)

    # Create the target folder first so a month without a "gis" folder does not fail.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    merged.to_crs(CRS.from_epsg(4326)).to_file(output_file, driver='GeoJSON')

# Months and tags for which a GeoJSON layer is produced.
name_mon = ['dec', 'jan', 'feb', 'mar']
name_tag = ['sport', 'ekon', 'prois', 'obsh', 'pol']
path_name='D:/RW/data_diplom/for_map'
# The boundaries layer is read once and reused for every month/tag pair.
shp_file = gpd.read_file(r'D:\RW\data_diplom\karta\Redakcion_borders.shp')

# Join each monthly per-tag CSV (key column 'region') to the boundaries
# (key column 'gn_name') and write it to that month's "gis" folder.
for a in name_mon:
    for i in name_tag:
        csv_file = f'{path_name}/res_{a}/tags/{i}.csv'
        output_file = f'{path_name}/res_{a}/gis/{i}.geojson'
        convert_to_geojson(shp_file, csv_file, 'region', 'gn_name', output_file)

### Вспомогательный код

In [None]:
# Find the index of an element, e.g. to resume after the loop was interrupted.
value = input("Введите значение элемента: ")

# Check whether the element is in the list.
if value in region_url:
    # Look up the index of the element.
    index = region_url.index(value)
    print("Индекс элемента", value, ":", index)
else:
    # Give explicit feedback instead of silently printing nothing.
    print("Элемент", value, "не найден в списке")

In [None]:
# Open the first (December) GeoJSON file as a table.
df = gpd.read_file(r'D:\RW\data_diplom\for_map\res_dec\gis\obrazi_reg_dec_2.geojson')

# Join the second (January) GeoJSON file on the 'gn_name' column.
df2 = gpd.read_file(r'D:\RW\data_diplom\for_map\res_jan\gis\obrazi_reg_jan.geojson')
merged = df.merge(df2, on='gn_name', how='left')

# Join the third (February) GeoJSON file on the 'gn_name' column.
df3 = gpd.read_file(r'D:\RW\data_diplom\for_map\res_feb\gis\obrazi_reg_feb.geojson')
merged = merged.merge(df3, on='gn_name', how='left')

# Save the three-month table to Excel.
merged.to_excel(r'D:\RW\data_diplom\for_map\all_im.xlsx')

# NOTE(review): the intermediate table is read back from Excel before the March
# layer is joined; confirm that this round trip is intentional.
df = pd.read_excel(r'D:\RW\data_diplom\for_map\all_im.xlsx')
df4 = gpd.read_file(r'D:\RW\data_diplom\for_map\res_mar\gis\obrazi_reg_mar.geojson')
merged = df.merge(df4, on='gn_name', how='left')

# Save the four-month table to a second Excel file.
merged.to_excel(r'D:\RW\data_diplom\for_map\all_im_2.xlsx')