In [1]:
from pprint import pprint
from collections import Counter
import os
import re
import logging
import string
import pickle
import numpy as np
import pandas as pd
import smart_open
import multiprocessing
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

# Gensim
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser

# NLTK
import nltk
from nltk.corpus import stopwords
'''nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')'''
from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Spacy
import spacy

# Plotting
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Clustering
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn.manifold import TSNE

# Suppressing warnings
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [46]:
def clean_text(text):
    # Normalize tabs and remove newlines
    no_tabs = text.replace('\t', ' ').replace('\n', '').replace('Belt and Road', 'BRI').replace('- ', '').replace('globaltimes.com.cn', '').replace('SOURCE BUSINESS SPOTLIGHT','')
    # Remove all characters except A-Z and a dot.
    no_url = re.sub('Online: ((www\.[^\s]+)|(https?://[^\s]+))', '', no_tabs)
    alphas_only = re.sub("[^a-zA-Z\.]", " ", no_url);
    # Normalize spaces to 1
    multi_spaces = re.sub(" +", " ", alphas_only);
    # Strip trailing and leading spaces
    no_spaces = multi_spaces.strip();

    return no_spaces


def sentence_tokenize(text):
    sentence_doc = sent_tokenize(text)
    sentences = [gensim.utils.simple_preprocess(str(doc), deacc=True) for doc in
                 sentence_doc]  # deacc=True removes punctuations
    stop = set(stopwords.words('english') + ['factiva', 'asianreview', 'viewpoint', 'sourceupdate', 'stimes', 'prn', 'st'])
    no_stop = [[word for word in sentence if word not in stop] for sentence in sentences]

    return no_stop


def lemmatization(texts, allowed_postags=['NOUN']):
    """https://spacy.io/api/annotation"""
    texts_out = [[token.lemma_ for token in text if token.pos_ in allowed_postags] for text in texts]
    return texts_out

In [4]:
import os
path=os.listdir(r"C:\Users\Li\Desktop\中国（txt）")
datalist=[]

for i in path:
    domain= r"C:\\Users\\Li\\Desktop\\中国（txt）\\"+i
    #print(domain)
    with open(domain,"r",encoding="utf-8") as f:
        data=f.read()
        datalist.append(data)


In [101]:
text_li=[clean_text(i) for i in datalist]
text_li[0]

'ASIANREVIEW China seeks pragmatic cooperation despite US Indo Pacific StrategyThe divergences between China and the US have been going on for some time with the West Pacificregion in the throes of the tension between the two major powers. As China US competitionintensifies the US is becoming increasingly anxious about its global hegemony.The US has labeled China as a revisionist power and a strategic competitor. Some Americanpeople have a deep belief in the Thucydides Trap which leads to increasing US strategic suspicion toward China.In addition the US believes that China s theory of peaceful development is not logicallyvalid.Many countries have their own versions of the so called Indo Pacific Strategy such asAustralia Japan India and the US. They all more or less aim at countering or coordinating againstthe China proposed BRI Initiative BRI which reflects their anxiety over China s rise.Of all the versions the US Indo Pacific Strategy is the most systematic and the most offensiveone.

In [49]:
com_sent_li = [sentence_tokenize(text) for text in text_li]
com_sent_li[0]

[['china',
  'seeks',
  'pragmatic',
  'cooperation',
  'despite',
  'us',
  'indo',
  'pacific',
  'strategythe',
  'divergences',
  'china',
  'us',
  'going',
  'time',
  'west',
  'pacificregion',
  'throes',
  'tension',
  'two',
  'major',
  'powers'],
 ['china',
  'us',
  'us',
  'becoming',
  'increasingly',
  'anxious',
  'global',
  'hegemony',
  'us',
  'labeled',
  'china',
  'revisionist',
  'power',
  'strategic',
  'competitor'],
 ['americanpeople',
  'deep',
  'belief',
  'thucydides',
  'trap',
  'leads',
  'increasing',
  'us',
  'strategic',
  'suspicion',
  'toward',
  'china',
  'addition',
  'us',
  'believes',
  'china',
  'theory',
  'peaceful',
  'development',
  'logicallyvalid',
  'many',
  'countries',
  'versions',
  'called',
  'indo',
  'pacific',
  'strategy',
  'asaustralia',
  'japan',
  'india',
  'us'],
 ['less',
  'aim',
  'countering',
  'coordinating',
  'againstthe',
  'china',
  'proposed',
  'bri',
  'initiative',
  'bri',
  'reflects',
  'anxi

In [77]:
sent_li = []
for sentence in com_sent_li:
    for tokens in sentence:
        sent_li.append(tokens)

In [99]:
sent_li = [tokens for sentence in com_sent_li for tokens in sentence]
len(sent_li)

5119

In [79]:
'''with open(r"C:\Users\Li\Desktop\name.txt",'a') as f:
    for i in sent_li:
        for b in i:
            f.write(str(b)+'\n')'''

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 14-15: truncated \UXXXXXXXX escape (<ipython-input-79-efa0683b1aff>, line 1)

In [90]:
bigram = Phrases(sent_li, min_count=5, threshold=80)
trigram = Phrases(bigram[sent_li], threshold=80)  
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)
trigrams = [trigram_mod[bigram_mod[sentence]] for sentence in sent_li]

In [93]:
num_features = 50        # Word vector dimensionality (how many features each word will be given)
min_word_count = 2        # Minimum word count to be taken into account
num_workers = 8       # Number of threads to run in parallel (equal to your amount of cores)
context = 5              # Context window size
downsampling = 0 #1e-2    # Downsample setting for frequent words
seed_n = 1                # Seed for the random number generator (to create reproducible results) 
sg_n = 1                  # Skip-gram = 1, CBOW = 0

model = Word2Vec(trigrams, workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, seed=seed_n, sg=sg_n)
model.init_sims(replace=True)

model.save("China_model.model")