In [181]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from textblob import Word

In [182]:
# Read the article file; every line of the file becomes one list entry
# (the trailing newline of each line is kept).
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm it matches the encoding the file was saved with.
with open('C:\\Users\\Deepak\\Downloads\\WikiArticles.txt') as article_file:
    articles = article_file.readlines()
len(articles)

15

In [183]:
# Single-column frame: one row per line read from the article file.
WikiDesc = pd.DataFrame(articles, columns=['Desc'])
WikiDesc

Unnamed: 0,Desc
0,Education is the process of facilitating learn...
1,Sachin Ramesh Tendulkar (/?s?t??n t?n?du?lk?r/...
2,Information technology (IT) is the use of comp...
3,The Graduate Aptitude Test in Engineering (GAT...
4,The Ranji Trophy is a domestic first-class cri...
5,"Gravity (from Latin gravitas, meaning 'weight'..."
6,A school is an institution designed to provide...
7,A computer is a device that can be instructed ...
8,Physics (from Ancient Greek: ?????? (????????)...
9,Sir Isaac Newton FRS PRS (25 December 1642 – 2...


In [184]:
# Read the label file; every line of the file becomes one list entry
# (the trailing newline of each line is kept and stripped in a later cell).
with open('C:\\Users\\Deepak\\Downloads\\Labels.txt') as label_file:
    labels = label_file.readlines()
len(labels)

15

In [185]:
# Single-column frame: one row per line read from the label file.
WikiLabels = pd.DataFrame(labels, columns=['labels'])
WikiLabels

Unnamed: 0,labels
0,Education\n
1,Sachin Tendulkar\n
2,Information Technology\n
3,GATE\n
4,Ranji Trophy\n
5,Gravity\n
6,School\n
7,Computer\n
8,Physics\n
9,Newton\n


In [186]:
# Remove the newline characters that the labels kept from the file read.
WikiLabels['labels'] = WikiLabels['labels'].replace(
    to_replace='\n', value='', regex=True
)
WikiLabels['labels']

0                  Education
1           Sachin Tendulkar
2     Information Technology
3                       GATE
4               Ranji Trophy
5                    Gravity
6                     School
7                   Computer
8                    Physics
9                     Newton
10                   Student
11          Wankhede Stadium
12                     Dhoni
13           Albert Einstein
14                       IBM
Name: labels, dtype: object

In [187]:
# The 30 most frequent whitespace-separated tokens across all raw articles.
raw_tokens = ' '.join(WikiDesc['Desc']).split()
freq = pd.Series(raw_tokens).value_counts().head(30)
freq

the          133
of           112
and           72
in            58
is            47
to            45
a             38
for           24
as            24
by            22
The           19
or            16
his           15
which         13
that          13
also          12
most          12
one           11
with          11
an            10
He            10
Indian        10
who            9
was            9
theory         8
on             8
has            8
cricket        7
such           7
education      7
dtype: int64

In [188]:
# TF-IDF matrix of the raw (uncleaned) article text, built with the
# vectorizer's default settings.
tfidf_vec = TfidfVectorizer()
test_tfidf = tfidf_vec.fit_transform(WikiDesc.loc[:, 'Desc'])
print(test_tfidf.shape)
test_tfidf

(15, 938)


<15x938 sparse matrix of type '<class 'numpy.float64'>'
	with 1453 stored elements in Compressed Sparse Row format>

In [189]:
# Number of distinct terms the vectorizer learned from the raw text.
vocabulary_size = len(tfidf_vec.vocabulary_)
vocabulary_size

938

### Removing punctuation and stop words

In [190]:
# NLTK's stop-word lists are stored under lowercase language names, so the
# file id must be 'english'; a capitalised id only resolves on
# case-insensitive filesystems.
stop = stopwords.words('english')

In [191]:
# Lower-case the text and collapse every run of whitespace into one space.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc'].apply(lambda text: " ".join(text.lower().split()))
# Remove every character that is neither a word character nor whitespace.
# The pattern is a raw string (no invalid "\w" / "\s" escapes) and regex=True
# is explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave all punctuation in place.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'[^\w\s]', '', regex=True)

In [192]:
# Drop every token that appears in the stop-word list.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop)
)
# Replace digit runs with a space, then collapse repeated whitespace.
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, in which case neither r'\d+' nor r'\s+' would match anything.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'\d+', ' ', regex=True)
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'\s+', ' ', regex=True)

In [193]:
# The 30 most frequent tokens after cleaning; used to spot leftover filler words.
clean_tokens = ' '.join(WikiDesc['Desc_Transformed']).split()
freq = pd.Series(clean_tokens).value_counts().head(30)
freq

cricket          17
education        13
one              12
also             12
indian           10
theory           10
physics          10
international    10
gravity          10
india             8
information       7
technology        7
relativity        7
stadium           6
gate              6
computers         6
time              6
natural           6
known             6
mechanics         6
school            6
university        5
ibm               5
general           5
business          5
philosophy        5
research          5
master            5
devices           5
motion            5
dtype: int64

In [194]:
# Extra high-frequency words to drop in addition to the stop-word list.
To_remove = ['one', 'also', 'known', 'general']
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].map(
    lambda text: " ".join(token for token in text.split() if token not in To_remove)
)

### Lemmatization

In [195]:
# Lemmatize token by token with TextBlob's Word.lemmatize(), called with its
# default arguments, and re-join the tokens with single spaces.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].map(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)

In [196]:
# Raw text of the article at index label 1, for comparison with its cleaned form.
WikiDesc.loc[1, 'Desc']

'Sachin Ramesh Tendulkar (/?s?t??n t?n?du?lk?r/ (About this soundlisten); born 24 April 1973) is a former Indian international cricketer and a former captain of the Indian national team, regarded as the greatest batsman of all time.[4] He is the highest run scorer of all time in International cricket. Tendulkar took up cricket at the age of eleven, made his Test debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for close to twenty-four years. He is the only player to have scored one hundred international centuries, the first batsman to score a double century in a ODI, the holder of the record for the most number of runs in both Test and ODI, and the only player to complete more than 30,000 runs in international cricket.[5] He is colloquially known as Little Master or Master Blaster,[6][7][8][9] and often referred to as the God of Cricket by Indian cricket followers.[10][11] Despite his rep

In [197]:
# Cleaned and lemmatized text of the article at index label 1.
WikiDesc.loc[1, 'Desc_Transformed']

'sachin ramesh tendulkar stn tndulkr soundlisten born april former indian international cricketer former captain indian national team regarded greatest batsman time highest run scorer time international cricket tendulkar took cricket age eleven made test debut november pakistan karachi age sixteen went represent mumbai domestically india internationally close twentyfour year player scored hundred international century first batsman score double century odi holder record number run test odi player complete run international cricket colloquially little master master blaster often referred god cricket indian cricket follower despite reputation modesty humility stating god cricket make mistake god doesnt'

In [198]:
# Raw text of the article at index label 14, for comparison with its cleaned form.
WikiDesc.loc[14, 'Desc']

'International Business Machines Corporation (IBM) is an American multinational information technology company headquartered in Armonk, New York, with operations in over 170 countries. The company began in 1911 as the Computing-Tabulating-Recording Company (CTR) and was renamed "International Business Machines" in 1924. IBM produces and sells computer hardware, middleware and software, and provides hosting and consulting services in areas ranging from mainframe computers to nanotechnology. IBM is also a major research organization, holding the record for most U.S. patents generated by a business (as of 2018) for 25 consecutive years.[5] Inventions by IBM include the automated teller machine (ATM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, the UPC barcode, and dynamic random-access memory (DRAM). The IBM mainframe, exemplified by the System/360, was the dominant computing platform during the 1960s and 1970s.\n'

In [199]:
# Cleaned and lemmatized text of the article at index label 14.
WikiDesc.loc[14, 'Desc_Transformed']

'international business machine corporation ibm american multinational information technology company headquartered armonk new york operation country company began computingtabulatingrecording company ctr renamed international business machine ibm produce sell computer hardware middleware software provides hosting consulting service area ranging mainframe computer nanotechnology ibm major research organization holding record u patent generated business consecutive year invention ibm include automated teller machine atm floppy disk hard disk drive magnetic stripe card relational database sql programming language upc barcode dynamic randomaccess memory dram ibm mainframe exemplified system dominant computing platform s s'

In [200]:
# Re-fit a fresh TF-IDF vectorizer on the cleaned text; this rebinds both
# `tfidf_vec` and `test_tfidf` from the earlier raw-text run.
tfidf_vec = TfidfVectorizer()
test_tfidf = tfidf_vec.fit_transform(WikiDesc.loc[:, 'Desc_Transformed'])
print(test_tfidf.shape)
test_tfidf

(15, 762)


<15x762 sparse matrix of type '<class 'numpy.float64'>'
	with 985 stored elements in Compressed Sparse Row format>

In [201]:
# Terms of the fitted vocabulary, in feature-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps `words` a plain list of Python strings as before.
words = tfidf_vec.get_feature_names_out().tolist()
words[0: 20]

['ability',
 'academic',
 'account',
 'accounted',
 'accurately',
 'acquisition',
 'act',
 'activity',
 'additionally',
 'admission',
 'affair',
 'age',
 'agency',
 'albert',
 'albt',
 'alongside',
 'although',
 'american',
 'ancient',
 'another']

In [202]:
# Reduce the TF-IDF matrix with truncated SVD, then cluster the reduced
# vectors into 4 groups with K-Means.
#
# The rank of the matrix is bounded by its smaller dimension, so requesting
# more components than that cannot add information; cap the original 50.
n_components = min(50, min(test_tfidf.shape))
# Both estimators are randomized; fixed seeds make the cluster assignment
# reproducible across kernel restarts.
svd = TruncatedSVD(n_components=n_components, random_state=42)
# n_init is pinned to 10 (the long-standing default) because newer
# scikit-learn releases changed the default number of initialisations.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
pipeline = make_pipeline(svd, kmeans)
pipeline.fit(test_tfidf)
# NOTE: this rebinds `labels`, which earlier held the raw lines read from
# Labels.txt; from here on it is the array of cluster ids, one per article.
labels = pipeline.predict(test_tfidf)
labels

array([1, 0, 3, 1, 0, 2, 1, 3, 2, 2, 1, 0, 0, 2, 3])

In [203]:
# Number of distinct terms the vectorizer learned from the cleaned text.
vocabulary_size = len(tfidf_vec.vocabulary_)
vocabulary_size

762

In [204]:
# The vocabulary shrank from 938 to 762 terms after lower-casing, removing punctuation, stop words and digits, and lemmatizing

In [205]:
# Put each article title next to its cluster id and list the titles grouped
# by cluster.
df = WikiLabels['labels'].to_frame(name='article')
df.insert(0, 'label', labels)
print(df.sort_values('label'))

    label                 article
1       0        Sachin Tendulkar
4       0            Ranji Trophy
11      0        Wankhede Stadium
12      0                   Dhoni
0       1               Education
3       1                    GATE
6       1                  School
10      1                 Student
5       2                 Gravity
8       2                 Physics
9       2                  Newton
13      2         Albert Einstein
2       3  Information Technology
7       3                Computer
14      3                     IBM


In [207]:
# K-Means clustering grouped related articles together: each of the 4 clusters holds articles on one theme (cricket, education, physics, computing)