In [133]:
import numpy as np 
import pandas as pd 
import nltk 
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from tqdm import tqdm 
from sklearn.metrics.pairwise import cosine_similarity 
stopwords = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [134]:
with open(r'input.txt') as fp:
    data = fp.read()

In [135]:
sentences = nltk.sent_tokenize(data)

In [136]:
sentences

["Millions go missing at China bank\nTwo senior officials at one of China's top commercial banks have reportedly disappeared after funds\nworth up to $120m (£64m) went missing.",
 'The pair both worked at Bank of China in the northern city of Harbin, the South China Morning Post\nreported.',
 "The latest scandal at Bank of China will do nothing to reassure foreign investors that China's\nbig four banks are ready for international listings.",
 'Government policy sees the bank listings as vital\neconomic reforms.',
 'Bank of China is one of two frontrunners in the race to list overseas.',
 'The other is\nChina Construction Bank.',
 'Both are expected to list abroad during 2005.',
 'They shared a $45bn state bailout in 2003, to help clean up their balance sheets in preparation for a\nforeign stock market debut.',
 'However, a report in the China-published Economic Observer said on Monday that the two banks may\nhave scrapped plans to list in New York because of the cost of meeting regulat

In [137]:
def process_data(sentence):

    sentence = sentence.strip()
    lower = sentence.lower()
    words = nltk.word_tokenize(lower)

    words = [word for word in words if word not in stopwords]
    lemm_words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(lemm_words)

In [138]:
processed_sentences = []
for sentence in sentences:
    processed_sentences.append(process_data(sentence))

In [139]:
processed_sentences

["million go missing china bank two senior official one china 's top commercial bank reportedly disappeared fund worth $ 120m ( £64m ) went missing .",
 'pair worked bank china northern city harbin , south china morning post reported .',
 "latest scandal bank china nothing reassure foreign investor china's big four bank ready international listing .",
 'government policy see bank listing vital economic reform .',
 'bank china one two frontrunners race list overseas .',
 'china construction bank .',
 'expected list abroad 2005 .',
 'shared $ 45bn state bailout 2003 , help clean balance sheet preparation foreign stock market debut .',
 'however , report china-published economic observer said monday two bank may scrapped plan list new york cost meeting regulatory requirement imposed since enron scandal .',
 "bank china country 's biggest foreign exchange dealer , china construction bank largest deposit holder .",
 "china 's banking sector burdened least $ 190bn bad debt according official

In [140]:
cv = CountVectorizer()
tfidf = TfidfVectorizer()

In [141]:
x_cf_train = cv.fit_transform(processed_sentences)
x_tf_train = tfidf.fit_transform(processed_sentences)

In [142]:
word2vec_model = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4)
cbow = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4 , sg = 0)
skipgram = Word2Vec(sentences=processed_sentences, vector_size=100, window=5, min_count=1, workers=4 , sg = 1)

In [143]:
def calculate_mean_embedding(tokens , model):
    embeddings = [model.wv[token] for token in tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0).astype(np.float32)
    else:
        return np.zeros(model.vector_size).astype(np.float32)

In [144]:
tqdm.pandas(desc="Calculating embeddings")
x_w2v_train = [calculate_mean_embedding(sentence , word2vec_model) for sentence in processed_sentences]

In [145]:
tqdm.pandas(desc="Calculating embeddings")
x_cbow_train = [calculate_mean_embedding(sentence , cbow) for sentence in processed_sentences]

In [146]:
tqdm.pandas(desc="Calculating embeddings")
x_sg_train = [calculate_mean_embedding(sentence , skipgram) for sentence in processed_sentences]

In [147]:
model_cv = KMeans()
model_tf = KMeans()
model_w2v = KMeans()
model_cbow = KMeans()
model_sg = KMeans()

In [148]:
x_w2v_train = [np.array(i) for i in x_w2v_train]
x_cbow_train = [np.array(i) for i in x_cbow_train]
x_sg_train = [np.array(i) for i in x_sg_train]

In [149]:
model_cv.fit(x_cf_train.toarray())
model_tf.fit(x_tf_train.toarray())
model_w2v.fit(x_w2v_train)
model_cbow.fit(x_cbow_train)
model_sg.fit(x_sg_train)



In [150]:
def find_closest(center , points , seen):
    min_dist = float('inf')
    ind = -1
    for i in range(len(points)):

        cs_dst = cosine_similarity(center.reshape(1 , -1) , points[i].reshape(1 , -1))

        if cs_dst <= min_dist:
            min_dist = cs_dst 
            if i not in seen:
                ind = i 
    
    return ind

In [151]:
summary_cv = ""
seen = []
for center in model_cv.cluster_centers_:
    sentence_index = find_closest(center , x_cf_train.toarray() , seen)
    seen.append(sentence_index)
    summary_cv+=sentences[sentence_index]

In [152]:
summary_tfidf = ""
seen = []
for center in model_tf.cluster_centers_:
    sentence_index = find_closest(center , x_tf_train.toarray() , seen)
    seen.append(sentence_index)
    summary_tfidf+=sentences[sentence_index]

In [153]:
summary_w2v = ""
seen = []
for center in model_w2v.cluster_centers_:
    sentence_index = find_closest(center , x_w2v_train , seen)
    seen.append(sentence_index)
    summary_w2v+=sentences[sentence_index]

In [154]:
summary_cbow = ""
seen = []
for center in model_cbow.cluster_centers_:
    sentence_index = find_closest(center , x_cbow_train , seen)
    seen.append(sentence_index)
    summary_cbow+=sentences[sentence_index]

In [155]:
summary_sg = ""
seen = []
for center in model_sg.cluster_centers_:
    sentence_index = find_closest(center , x_sg_train  , seen)
    seen.append(sentence_index)
    summary_sg+=sentences[sentence_index]

In [156]:
import textwrap

In [157]:
print(textwrap.fill(summary_cv , 100))

Another high-profile financial firm, China Life, is facing shareholder lawsuits and a probe by the
US Securities and Exchange Commission following its 2004 New York listing over its failure to
disclose accounting irregularities at its parent company.First, inefficient state enterprises
continue to receive protection from bankruptcy because they employ large numbers of people.Second,
many questionable loans come not from the big four, but from smaller banks.But two problems
persist.They shared a $45bn state bailout in 2003, to help clean up their balance sheets in
preparation for a foreign stock market debut.Officially, one in five loans is not being repaid.Both
are expected to list abroad during 2005.China's leaders see reforming the top four banks as vital to
distribute capital to profitable companies and protect the health of China's economic boom.


In [158]:
print(textwrap.fill(summary_tfidf , 100))

Second, many questionable loans come not from the big four, but from smaller banks.First,
inefficient state enterprises continue to receive protection from bankruptcy because they employ
large numbers of people.But two problems persist.Officially, one in five loans is not being
repaid.Both are expected to list abroad during 2005.They shared a $45bn state bailout in 2003, to
help clean up their balance sheets in preparation for a foreign stock market debut.The pair both
worked at Bank of China in the northern city of Harbin, the South China Morning Post
reported.Millions go missing at China bank Two senior officials at one of China's top commercial
banks have reportedly disappeared after funds worth up to $120m (£64m) went missing.


In [159]:
print(textwrap.fill(summary_w2v , 100))

But two problems persist.The other is China Construction Bank.Both are expected to list abroad
during 2005.The pair both worked at Bank of China in the northern city of Harbin, the South China
Morning Post reported.The latest scandal at Bank of China will do nothing to reassure foreign
investors that China's big four banks are ready for international listings.Bank of China is one of
two frontrunners in the race to list overseas.Millions go missing at China bank Two senior officials
at one of China's top commercial banks have reportedly disappeared after funds worth up to $120m
(£64m) went missing.Government policy sees the bank listings as vital economic reforms.


In [160]:
print(textwrap.fill(summary_cbow , 100))

The other is China Construction Bank.The latest scandal at Bank of China will do nothing to reassure
foreign investors that China's big four banks are ready for international listings.Bank of China is
one of two frontrunners in the race to list overseas.The pair both worked at Bank of China in the
northern city of Harbin, the South China Morning Post reported.Millions go missing at China bank Two
senior officials at one of China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing.But two problems persist.Both are expected to list abroad
during 2005.Government policy sees the bank listings as vital economic reforms.


In [161]:
print(textwrap.fill(summary_sg , 100))

But two problems persist.The other is China Construction Bank.Government policy sees the bank
listings as vital economic reforms.The pair both worked at Bank of China in the northern city of
Harbin, the South China Morning Post reported.Millions go missing at China bank Two senior officials
at one of China's top commercial banks have reportedly disappeared after funds worth up to $120m
(£64m) went missing.The latest scandal at Bank of China will do nothing to reassure foreign
investors that China's big four banks are ready for international listings.Both are expected to list
abroad during 2005.Another high-profile financial firm, China Life, is facing shareholder lawsuits
and a probe by the US Securities and Exchange Commission following its 2004 New York listing over
its failure to disclose accounting irregularities at its parent company.
