In [2]:
#import
import os
import numpy as np
import pandas as pd
import re
import jieba
import jieba.posseg as pseg

In [3]:
df_texts = pd.read_csv('../DATAFRAMES/df_texts_with_index.csv') #reading the dataframe with texts

In [4]:
df_texts.head()

Unnamed: 0.1,Unnamed: 0,id,texts
0,0,1,个临时发往武汉的口罩\n个临时发往武汉的口罩个临时发往武汉的口罩个临时发往武汉的口罩吴呈杰报...
1,1,2,武汉隔离疫区信息孤岛与一辆鄂车的漂流\n武汉隔离疫区信息孤岛与一辆鄂车的漂流武汉隔离疫区信息...
2,2,3,我家离华南海鲜市场很近返乡封城过年一位武汉大学生的过去一周\n我家离华南海鲜市场很近返乡封城...
3,3,4,疫情危机中不被看见的人们武汉周边城市百姓的自救行动\n疫情危机中不被看见的人们武汉周边城市百...
4,4,5,孝感前线医生武汉更难我们下面不好意思提要求\n孝感前线医生武汉更难我们下面不好意思提要求孝感...


Defining the function to keep only places mentioned in the texts

In [5]:
#function to keep words based on jieba tagging, here we only keep places (ns)

def keep_words(s, to_keep=['ns']):
    s_ = []
    pos_list = pseg.lcut(s)
    for item in pos_list:
        if item.flag in to_keep:
            s_.append(item.word)
    return s_

In [6]:
df_texts['places'] = df_texts.texts.apply(keep_words)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.487 seconds.
Prefix dict has been built successfully.


In [7]:
df_texts.head()

Unnamed: 0.1,Unnamed: 0,id,texts,places
0,0,1,个临时发往武汉的口罩\n个临时发往武汉的口罩个临时发往武汉的口罩个临时发往武汉的口罩吴呈杰报...,"[武汉, 武汉, 武汉, 武汉, 武汉, 广州, 中康, 中国, 美国, 河南, 中断, 江..."
1,1,2,武汉隔离疫区信息孤岛与一辆鄂车的漂流\n武汉隔离疫区信息孤岛与一辆鄂车的漂流武汉隔离疫区信息...,"[武汉, 鄂车, 武汉, 鄂车, 武汉, 鄂车, 武汉, 鄂车, 武汉, 鄂州, 东西, 武..."
2,2,3,我家离华南海鲜市场很近返乡封城过年一位武汉大学生的过去一周\n我家离华南海鲜市场很近返乡封城...,"[华南, 海鲜, 武汉, 华南, 海鲜, 武汉, 华南, 海鲜, 武汉, 华南, 海鲜, 武..."
3,3,4,疫情危机中不被看见的人们武汉周边城市百姓的自救行动\n疫情危机中不被看见的人们武汉周边城市百...,"[武汉, 周边城市, 武汉, 周边城市, 武汉, 周边城市, 武汉, 周边城市, 武汉, 华..."
4,4,5,孝感前线医生武汉更难我们下面不好意思提要求\n孝感前线医生武汉更难我们下面不好意思提要求孝感...,"[武汉, 武汉, 武汉, 武汉, 武安, 孝感市, 武汉, 武汉, 武汉, 武汉, 武安, ..."


Now we have extracted all the places, we need to count them. To do that we need first make one list out of the list of list.

In [8]:
from  itertools import chain

In [9]:
fp = pd.Series(list(chain.from_iterable(df_texts['places'])))
fp.head(5)

0    武汉
1    武汉
2    武汉
3    武汉
4    武汉
dtype: object

In [10]:
cp = fp.value_counts()
cp.head(50)

武汉     44339
中国     23038
湖北     10436
美国      9838
武汉市     7931
意大利     6849
北京      6619
城市      5219
上海      4912
湖北省     4072
韩国      3945
日本      3588
英国      3174
伊朗      3123
东西      2764
欧洲      2595
德国      2269
香港      2039
海鲜      1942
华南      1906
回国      1853
广州      1670
深度      1491
钟南山     1488
黄冈      1353
广东      1330
纽约      1289
法国      1288
西班牙     1287
深圳      1244
汉口      1103
浙江      1100
神山      1028
中东       961
武昌       903
东京       899
新加坡      860
温州       853
中南       831
泉州       816
上海市      806
印度       774
杭州       760
大邱       759
米兰       751
京        742
河南       739
黄冈市      706
山东       697
伦敦       685
dtype: int64

Wuhan is definitely on top of that list and by a lot of counts. However, there are also a lot of trash: 东西 is not a place f.e. If we want to use this places to put them on a map we need to clean them a little or edit them manually. We are first going to try to do some cleaning, removing 东西, removing 市 from 武汉市, 省 from 湖北省 etc.

In [11]:
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('市', '', regex=True)
#I am not sure how to write the character to replace at once, so we can just do one after the other. It works for sure
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('东西', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('省', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('回国', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('深度', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('神山', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('哥哥', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('太阳', '', regex=True)
df_texts['texts'] = pd.Series(df_texts.texts).str.replace('上岗 ', '', regex=True)


In [12]:
df_texts['places'] = df_texts.texts.apply(keep_words)
fp = pd.Series(list(chain.from_iterable(df_texts['places'])))
cp = fp.value_counts()
cp.head(100)

武汉      52335
中国      23035
湖北      14979
美国       9936
北京       7374
意大利      6849
上海       5813
韩国       3945
日本       3601
英国       3174
伊朗       3123
欧洲       2595
德国       2269
黄冈       2058
香港       2054
海鲜       1942
华南       1906
广州       1862
广东       1698
纽约       1512
钟南山      1488
浙江       1469
深圳       1454
西班牙      1287
法国       1287
汉口       1103
温州        973
天津        973
泉州        964
中东        961
        ...  
潜江        458
埃         443
东湖        440
福建        437
长江        426
天门        425
贵州        406
亚洲        405
纽约州       389
京东        388
巴西        376
加州        374
郑州        372
加拿大       368
宜昌        360
十堰        350
河北        346
越南        340
三联        339
上岗        337
亚太地区      336
咸宁        335
北非        332
瑞士        324
德塞        323
甘肃        322
新鲜        320
苏州        319
关联        318
黑龙江       318
Length: 100, dtype: int64

There are still some mistakes such as 海鲜, 三联, 上岗, but we can delete them directly. The biggest problem was removing 市 from 武汉市, etc because in those cases, the occurences total was separated in two.

In [13]:
dcp = cp.to_frame().reset_index()

In [14]:
dcp.rename(columns = {'index':'places', 0: 'freq'}, inplace = True)

In [15]:
dcp.head(30)

Unnamed: 0,places,freq
0,武汉,52335
1,中国,23035
2,湖北,14979
3,美国,9936
4,北京,7374
5,意大利,6849
6,上海,5813
7,韩国,3945
8,日本,3601
9,英国,3174


We are going to keep the first 100 values. Let's select the first 200 values clean it and take the first 100 based on that

In [16]:
dcpf = dcp.head(200)
dcpf

Unnamed: 0,places,freq
0,武汉,52335
1,中国,23035
2,湖北,14979
3,美国,9936
4,北京,7374
5,意大利,6849
6,上海,5813
7,韩国,3945
8,日本,3601
9,英国,3174


Deleting the problematic values. I did not keep places abbreviation like '京' or '美' as their meaning can vary depending on the context.

In [17]:
mask = dcpf['places'].isin(['海鲜', '三联', '上岗', '钟南山','京', '美', '英雄','西韦','上门','云','海关','商城','南','埃','京东','德塞','新鲜','关联','立马','德','中断','鄂','吉利','离汉','后湖','北'])
df_test = dcpf[mask]# testing the mask
print('values to delete frome places')
df_test

values to delete frome places


Unnamed: 0,places,freq
15,海鲜,1942
20,钟南山,1488
41,京,743
45,美,653
47,英雄,620
50,西韦,598
51,上门,591
57,云,544
60,海关,536
62,商城,490


to clean properly our dataframe, this time we need to see it in full.

In [18]:
pd.get_option("display.max_row")

60

In [19]:
pd.set_option("display.max_rows", 200)

In [20]:
cleaned_dcpf = dcpf[~mask]
cleaned_dcpf

Unnamed: 0,places,freq
0,武汉,52335
1,中国,23035
2,湖北,14979
3,美国,9936
4,北京,7374
5,意大利,6849
6,上海,5813
7,韩国,3945
8,日本,3601
9,英国,3174


We can check the places' name. If I have a doubt about a place, I will search it with google map, if that brings no result, I delete it by adding it to the mask and so on. I had to clean until 123 to keep the first 100.

In [21]:
selected = cleaned_dcpf.head(100)
selected

Unnamed: 0,places,freq
0,武汉,52335
1,中国,23035
2,湖北,14979
3,美国,9936
4,北京,7374
5,意大利,6849
6,上海,5813
7,韩国,3945
8,日本,3601
9,英国,3174


If we look at the end of the dataframe, we do not see any problematic values. We can go further and eventually we will still have to correct a few thing while displaying it on the map.

In [22]:
selected = selected.sort_values(by = 'freq', ascending = False ).reset_index(drop=True)
selected

Unnamed: 0,places,freq
0,武汉,52335
1,中国,23035
2,湖北,14979
3,美国,9936
4,北京,7374
5,意大利,6849
6,上海,5813
7,韩国,3945
8,日本,3601
9,英国,3174


In [23]:
from geopy.geocoders import Nominatim
import time

In [24]:
geolocator = Nominatim(user_agent="Your_Name")

def ex_lat(address):
    time.sleep(1) #adding 1 second delay before each request to avoid making too many request to nominatim servers
    location = geolocator.geocode(address)
    return location.latitude
    
def ex_long(address):
    time.sleep(1)
    location = geolocator.geocode(address)
    return location.longitude

In [25]:
selected['lat'] = selected['places'].apply(ex_lat)
selected['long'] = selected['places'].apply(ex_long)
selected

Unnamed: 0,places,freq,lat,long
0,武汉,52335,30.595105,114.299935
1,中国,23035,35.000074,104.999927
2,湖北,14979,31.151725,112.878322
3,美国,9936,39.78373,-100.445882
4,北京,7374,39.906217,116.391276
5,意大利,6849,42.638426,12.674297
6,上海,5813,31.232276,121.469207
7,韩国,3945,36.638392,127.696119
8,日本,3601,36.574844,139.239418
9,英国,3174,54.702354,-3.276575


In [26]:
# import folium
import folium

In [27]:
# Make an empty map
m = folium.Map(location=[32.0609736, 118.7916458], tiles="OpenStreetMap", zoom_start=4)

# Show the map
m

In [28]:
for places, freq, lat, long in zip(
    selected.places, 
    selected.freq, 
    selected.lat,
    selected.long):
    
    label= '{}, {}'.format(places, freq) # Define label here to reuse after
    
    m.add_child(
        folium.Circle(
            [lat,long],
            label = label,
            popup = label,
            radius = float(freq)*7, # define how big is the circle
            color = 'crimson',
            fill = True,
            fill_color = 'crimson',
            fill_opacity = 0.3
        ).add_to(m)
    )

In [29]:
m

In [30]:
#save the map to inspect it
#m.save("maps/places_1st100freq.html")

With this map, we could see that there was still a few problem with our data. 中南 South central Chine and 中东 Middle east are wrongly located, same for 加州 JiaZhou California showing in Japan. Also concerned are Yataidiqu: Asia-pacific area, Huanan, Dacheng, Beifei. 

In [31]:
print(geolocator.geocode('中东'))
print(geolocator.geocode('الشرق الاوسط'))
#only giving a hint with viewbox actually help a little to locate middle east properly,
#we will have to enter the coordinates manually

geolocator.geocode('Middle East', viewbox = ((28,30),(41,43)) ) 


中垌, 南宁市, 广西壮族自治区, 中国
الشرق الاوسط, باب الخان(امام حسن), محافظة كربلاء, كربلاء, ناحية مرکز قضاء الکربلاء, قضاء کربلاء, محافظة كربلاء, 56001, العراق / عێراق


Location(Middle East, Baltimore City 13th Council District, Baltimore, Maryland, United States, (39.3014159, -76.5888477, 0.0))

In [32]:
print(geolocator.geocode('中南'))
print(geolocator.geocode('中南地区'))

충청남도, 대한민국
충청남도, 대한민국


No success here as well. Let's just enter manually the coordinate of Changsha, which is more or less in the center of the area voir wiki: https://fr.wikipedia.org/wiki/Zhongnan coordinates: 29.00244602807003, 112.96734232025437

However, Changsha is also in the dataframe and if two coordinates are the same, the map only keep the last one. We need to pay attention to that and enter coordinates nearby but not the same.
We can print only the location we want to edit first to know their index number.

In [33]:
mask = selected['places'].isin(['华南','中东','中南', '北非', '亚太地区' ,'加州', '大城'])
df_look = selected[mask]# check
df_look

Unnamed: 0,places,freq,lat,long
15,华南,1906,46.235427,130.557065
27,中东,961,23.7265,108.735
35,中南,831,36.6593,126.6729
68,加州,374,35.726318,138.793924
75,亚太地区,336,31.932,120.915383
77,北非,332,47.858433,-3.605622
92,大城,271,14.353543,100.564568


Let's edit manually our dataframe.

In [34]:
selected.at[15, 'lat'] = '24.990994022242646' #edit 华南
selected.at[15, 'long'] = '112.86380926909548' #edit 华南

selected.at[27, 'lat'] = '30.980457' #edit 中东
selected.at[27, 'long'] = '40.857491' #edit 中东

selected.at[35, 'lat'] = '27.865395854384015' #edit 中南
selected.at[35, 'long'] = '112.37254013136757' #edit 中南

selected.at[68, 'lat'] = '36.48091646860382' #edit 加州, California
selected.at[68, 'long'] = '-119.51141777744824' #edit 加州, California

selected.at[75, 'lat'] =  '10.381340' #edit 亚太地区, Asia-Pacific
selected.at[75, 'long'] = '112.615423' #edit 亚太地区, Asia-Pacific

selected.at[77, 'lat'] = '19.54918' #edit 北非
selected.at[77, 'long'] = '-7.7429868' #edit 北非

selected.at[92, 'lat'] = '38.70030284043825'  #edit 大城
selected.at[92, 'long'] = '116.64556403067809' #edit 大城

selected

Unnamed: 0,places,freq,lat,long
0,武汉,52335,30.595105,114.299935
1,中国,23035,35.000074,104.999927
2,湖北,14979,31.151725,112.878322
3,美国,9936,39.78373,-100.445882
4,北京,7374,39.906217,116.391276
5,意大利,6849,42.638426,12.674297
6,上海,5813,31.232276,121.469207
7,韩国,3945,36.638392,127.696119
8,日本,3601,36.574844,139.239418
9,英国,3174,54.702354,-3.276575


We can create a new column for easier readibility while displaying the popup. The column freq is used for computing the radius of the circle and freq2 for display.

In [35]:
def coupe(number):
    x = "{:,}".format(number)
    return x

In [36]:
selected['freq2'] = selected.freq.apply(coupe)
selected

Unnamed: 0,places,freq,lat,long,freq2
0,武汉,52335,30.595105,114.299935,52335
1,中国,23035,35.000074,104.999927,23035
2,湖北,14979,31.151725,112.878322,14979
3,美国,9936,39.78373,-100.445882,9936
4,北京,7374,39.906217,116.391276,7374
5,意大利,6849,42.638426,12.674297,6849
6,上海,5813,31.232276,121.469207,5813
7,韩国,3945,36.638392,127.696119,3945
8,日本,3601,36.574844,139.239418,3601
9,英国,3174,54.702354,-3.276575,3174


In [37]:
#reset the map
m = folium.Map(location=[32.0609736, 118.7916458], tiles="OpenStreetMap", zoom_start=4)

#add a title
title = "First 100 places' occurence in nCovMemory corpus"
title_html = '''
             <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(title)   


m.get_root().html.add_child(folium.Element(title_html))

#write again the circles with the changes made
for places, freq, lat, long, freq2 in zip(
    selected.places, 
    selected.freq, 
    selected.lat,
    selected.long,
    selected.freq2):
    
    label= '{}: {}'.format(places, freq2) # Define label here to reuse after
    
    m.add_child(
        folium.Circle(
            [lat,long],
            label = label,
            popup = label,
            radius = float(freq)*7, # define how big is the circle
            color = 'crimson',
            fill = True,
            fill_color = 'crimson',
            fill_opacity = 0.3
        ).add_to(m)
    )

In [38]:
m

In [39]:
#save the map
#m.save("maps/places_1st100freq.html")

et voilà!!