In [22]:
import os

import pandas as pd
import geopandas as gpd
import folium
from folium.plugins import HeatMap
%matplotlib inline

In [120]:
# District whose posts are extracted; it also names the input and output folders.
name = 'Horoshevo-Mnevniki'
district = gpd.read_file('../Data dive/dd2/{}/{}_districts.geojson'.format(name, name))

# Keep only the posts whose geometry intersects the first feature of the district file.
# NOTE: `geo_df` must already exist in the kernel; in this notebook it is built
# in the "Mapping" section further down, so that cell has to be run first.
pointInPolys = geo_df.intersects(district.iloc[0].geometry)
neigh_posts = geo_df[pointInPolys]

# makedirs(..., exist_ok=True) creates missing parents and tolerates re-runs,
# whereas a plain mkdir raises as soon as the folder already exists.
out_dir = 'social_media/{}'.format(name)
os.makedirs(out_dir, exist_ok=True)
neigh_posts.to_csv('{}/vk.csv'.format(out_dir))

# MODELING

In [135]:
from tqdm import tqdm
from collections import defaultdict
import gensim
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models import LdaModel
import pyLDAvis.gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from pymystem3 import Mystem

In [124]:
# One row per user: all of that user's post texts joined into a single
# space-separated document stored in the 'post' column.
vk_users = (
    neigh_posts
    .groupby('userId')['text']
    .apply(' '.join)
    .rename('post')
    .reset_index()
)

In [127]:
# Extra stopwords appended to the NLTK lists inside process_docs.
# They are compared against lemmatized tokens, so entries are written as lemmas.
extra_words = [
    'http', 'br', 'id', 'com', 'www',
    'instagram', 'vsco', 'https', 'instasize', 'repost',
    'whatsapp', 'вотсап', 'repostapp',
    'маникюр', 'бровь', 'ресница', 'губа',
    'instacollage', 'опубликовывать',
    'фото',
    'новосибирск', 'novosibirsk', 'novosibirsk_russia', 'russia', 'новосиб',
    'москва', 'moscow',
]
def process_docs(docs):
    """
    Tokenize, clean and lemmatize raw documents.

    Steps, in order:

    1. Lowercase every document and split it into word tokens.
    2. Drop tokens that contain any digit.
    3. Lemmatize each token with Mystem (the first returned lemma is kept).
    4. Drop stopwords (NLTK Russian + English lists plus ``extra_words``).
    5. Drop tokens of three characters or fewer.

    Args:
    ----------
    docs: sequence of raw document strings. The input is not modified.

    Returns:
    -------
    docs: list of documents, each a list of cleaned, lemmatized tokens.
    """
    m = Mystem()
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    # A set makes the per-token stopword lookup O(1) instead of scanning a list.
    stops = set(stopwords.words('russian') + stopwords.words('english') + extra_words)

    # Build a new list rather than assigning into `docs`, so the caller's
    # array/Series keeps its raw strings and the call can be repeated.
    docs = [tokenizer.tokenize(doc.lower()) for doc in tqdm(docs)]

    # Remove every token that contains a digit.
    # NOTE(review): the second clause only rejects tokens that contain 'id',
    # 'club' AND 'ru' all at once, so it almost never fires. It is kept
    # unchanged here; confirm whether `and` between the three tests was intended.
    docs = [[token for token in doc
             if not any(c.isdigit() for c in token)
             and ('id' not in token or 'club' not in token or 'ru' not in token)]
            for doc in tqdm(docs)]
    # Lemmatize words.
    docs = [[m.lemmatize(token)[0] for token in doc] for doc in tqdm(docs)]
    # Remove stopwords (after lemmatization, so lemmas are what gets compared).
    docs = [[token for token in doc if token not in stops] for doc in tqdm(docs)]
    # Remove short words: only tokens longer than three characters survive.
    docs = [[token for token in doc if len(token) > 3] for doc in tqdm(docs)]
    return docs
    
def get_corpus(docs, rare_threshold=3, bigram_min_count=20):
    """Drop rare tokens, add detected bigrams, and build the training corpus.

    Args:
        docs: list of tokenized and cleaned texts (list of lists of str).
        rare_threshold: a token is kept only if it occurs MORE than this many
            times across all documents (default 3, the original setting).
        bigram_min_count: ``min_count`` passed to gensim ``Phrases``
            (default 20, the original setting).
    Returns:
        corpus: list of bag-of-words documents; each is a list of
            ``(token_id, count)`` tuples where ``count`` is the number of
            occurrences of that token in that document.
        dictionary: gensim.corpora.dictionary.Dictionary built from the
            filtered, bigram-augmented documents.
    """

    # Corpus-wide token frequencies.
    frequency = defaultdict(int)
    for text in tqdm(docs):
        for token in text:
            frequency[token] += 1

    # New lists are built here, so the caller's `docs` are left untouched.
    texts = [[token for token in text if frequency[token] > rare_threshold]
             for text in tqdm(docs)]

    # Detect bigrams and append them to the document they were found in.
    bigram = Phrases(texts, min_count=bigram_min_count)
    for idx in tqdm(range(len(texts))):
        # Tokens that already contained '_' before phrase detection (the
        # tokenizer's \w+ pattern keeps underscores) are not new bigrams;
        # skipping them avoids counting such tokens twice.
        unigrams = set(texts[idx])
        detected = [token for token in bigram[texts[idx]]
                    if '_' in token and token not in unigrams]
        texts[idx].extend(detected)

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(texts)
    # Document-frequency filtering is currently disabled; enable the line below
    # to drop tokens in fewer than 20 documents or in more than 50% of them.
    #dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in tqdm(texts)]

    print('Number of unique tokens: {}'.format(len(dictionary)))
    print('Number of documents: {}'.format(len(corpus)))

    return corpus, dictionary

In [128]:
# Independent (deep) copy of the per-user documents, so later preprocessing
# works on its own data rather than on the vk_users column.
texts = vk_users['post'].copy(deep=True)

In [133]:
# Hand process_docs a throwaway list instead of the Series' underlying array:
# the function may assign into its argument, which would replace the raw
# strings in `texts` with token lists and make this cell fail on a second run.
docs = process_docs(texts.tolist())

100%|██████████| 13100/13100 [00:00<00:00, 33847.56it/s]
100%|██████████| 13100/13100 [00:00<00:00, 16385.15it/s]
100%|██████████| 13100/13100 [00:45<00:00, 285.53it/s]
100%|██████████| 13100/13100 [00:01<00:00, 8310.26it/s]
100%|██████████| 13100/13100 [00:00<00:00, 180642.16it/s]


In [136]:
# Bag-of-words corpus and its token dictionary, both used for LDA training.
corpus, dictionary = get_corpus(docs=docs)

100%|██████████| 13100/13100 [00:00<00:00, 99553.16it/s]
100%|██████████| 13100/13100 [00:00<00:00, 131964.78it/s]
100%|██████████| 13100/13100 [00:01<00:00, 9768.57it/s] 
100%|██████████| 13100/13100 [00:00<00:00, 31987.62it/s]

Number of unique tokens: 8684
Number of documents: 13100





In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


#The training model - we use online LDA model which allows to update the model 
#and the following parameters should be defined
num_topics = 10 # number of topics
chunksize = 1000 
passes = 10
iterations = 400
eval_every = 10  #evaluate model perplexity.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha=0.001, update_every = 1, \
                       num_topics=num_topics,\
                       eval_every=eval_every, passes = passes)


#%time model = HdpModel(corpus=corpus_tfidf, id2word=id2word)

In [None]:
# import nltk
# nltk.download()

In [142]:
# Prepare the trained topics for pyLDAvis and render the interactive view inline.
data = pyLDAvis.gensim.prepare(topic_model=model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(data)

# Mapping

In [33]:
from geopandas import GeoDataFrame
from shapely.geometry import Point

# df = pd.read_csv('social_media/vk.csv')

# NOTE(review): `df` is not defined in the visible cells; presumably it comes
# from the commented-out read_csv just above. Confirm how it is loaded.
# Coordinates are taken as (lon, lat) pairs and tagged as EPSG:4326.
crs = {'init': 'epsg:4326'}
geometry = [Point(lon, lat) for lon, lat in zip(df.lon, df.lat)]
geo_df = GeoDataFrame(df, geometry=geometry, crs=crs)

In [122]:
# Map centre: the centroid's first coordinate pair is (x, y), i.e. (lon, lat).
center_lon, center_lat = district.centroid[0].coords[0]

In [123]:
# Base map centred on the district centroid.
# NOTE(review): newer folium releases may no longer bundle the 'Stamen Toner'
# tileset; confirm against the installed version.
map_places = folium.Map([center_lat, center_lon], tiles='Stamen Toner', zoom_start=14, control_scale=True)

# Define style for geojson objects
style_function = lambda feature: dict(fillColor='#AECCAE',
                                      color='#AECCAE',
                                      weight=1,
                                      opacity=0.3)

# NOTE(review): this path is hard-coded to the Nagornyj folder, while
# `district` is loaded from the folder given by `name`. Confirm which
# houses file is meant to be shown on this map.
houses = gpd.read_file('../Data dive/dd2/Nagornyj/Nagornyj_chkruchevki.geojson')
points = folium.features.GeoJson(houses, name='Khurshevki houses')
polygon = folium.features.GeoJson(district, style_function=style_function, name='district boundary')

# Adding the vector layers to the map
map_places.add_child(points, name='Khurshevki houses')
map_places.add_child(polygon, name='district boundary')

# Single source of truth for the heat-map colours; it is reused for the legend
# so the two cannot drift apart.
gradient = {0.0: 'pink', 0.3: 'blue', 0.5: 'green', 1.0: 'red'}

# (lat, lon) pairs of the posts that fall inside the district.
transport_coords = list(zip(neigh_posts.lat, neigh_posts.lon))

HeatMap(transport_coords,
        name='transport',
        radius=10,
        min_opacity=0.8,
        gradient=gradient).add_to(map_places)

# Legend with the same colours and stops as the heat map. It used to be built
# and captioned but never attached to the map, so it was never displayed.
colormap = folium.LinearColormap(colors=list(gradient.values()),
                                 index=list(gradient.keys()),
                                 vmin=0.0, vmax=1.0)
colormap.caption = 'Density of places'
colormap.add_to(map_places)

# Switch between layers
folium.LayerControl().add_to(map_places)
map_places