In [283]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import Word
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans

In [284]:
# Read the article file: every line of the file becomes one list element
# (each element keeps its trailing newline).
with open('C:\\Users\\Deepak\\Downloads\\WikiArticles.txt') as f:
    articles = f.readlines()
len(articles)

15

In [285]:
# One-column frame ('Desc') holding the raw article text, one row per article
WikiDesc = pd.Series(articles, name='Desc').to_frame()
WikiDesc

Unnamed: 0,Desc
0,Education is the process of is am are facilita...
1,Sachin Ramesh Tendulkar (/?s?t??n t?n?du?lk?r/...
2,Information technology (IT) is the use of comp...
3,The Graduate Aptitude Test in Engineering (GAT...
4,The Ranji Trophy is a domestic first-class cri...
5,"Gravity (from Latin gravitas, meaning 'weight'..."
6,A school is an institution designed to provide...
7,A computer is a device that can be instructed ...
8,Physics (from Ancient Greek: ?????? (????????)...
9,Sir Isaac Newton FRS PRS (25 December 1642 – 2...


In [286]:
# Read the label file line by line; each entry still carries its newline.
with open('C:\\Users\\Deepak\\Downloads\\Labels.txt') as f:
    labels = [line for line in f]
len(labels)

15

In [287]:
# One-column frame ('labels') with the article titles, in file order
WikiLabels = pd.DataFrame(labels, columns=['labels'])
WikiLabels.head(5)

Unnamed: 0,labels
0,Education\n
1,Sachin Tendulkar\n
2,Information Technology\n
3,GATE\n
4,Ranji Trophy\n


In [288]:
# Every label still contains the newline read from the file; remove it.
WikiLabels['labels'] = WikiLabels['labels'].replace(to_replace='\n', value='', regex=True)
WikiLabels['labels']

0                  Education
1           Sachin Tendulkar
2     Information Technology
3                       GATE
4               Ranji Trophy
5                    Gravity
6                     School
7                   Computer
8                    Physics
9                     Newton
10                   Student
11          Wankhede Stadium
12                     Dhoni
13           Albert Einstein
14                       IBM
Name: labels, dtype: object

### Removing punctuation and stop words

In [289]:
# NLTK stores each stop-word list under a lowercase file id ('english');
# the capitalised spelling only resolves on case-insensitive file systems.
stop = stopwords.words('english')

In [290]:
# Lower-case every token; joining on single spaces also collapses runs of whitespace.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc'].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)
# Strip special characters. regex=True is required: pandas >= 2.0 treats the
# pattern as a literal string by default, which would remove nothing here.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'[^\w\s]', '', regex=True)

In [291]:
# Stop words removal (set lookup instead of scanning the list for every token)
stop_words = set(stop)
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_words)
)
# regex=True is passed explicitly: pandas >= 2.0 would otherwise look for the
# literal text '\d+' / '\s+' and leave digits and repeated spaces in place.
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'\d+', ' ', regex=True)  # Digits removal
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].str.replace(r'\s+', ' ', regex=True)  # Extra space removal

In [292]:
# Count every token across all cleaned documents and keep the 30 most frequent.
all_tokens = ' '.join(WikiDesc['Desc_Transformed']).split()
freq = pd.Series(all_tokens).value_counts().head(30)
freq

cricket          17
education        13
one              12
also             12
international    10
indian           10
physics          10
theory           10
gravity          10
india             8
technology        7
relativity        7
information       7
school            6
known             6
stadium           6
gate              6
time              6
computers         6
natural           6
mechanics         6
general           5
higher            5
systems           5
university        5
ibm               5
motion            5
research          5
business          5
first             5
dtype: int64

In [293]:
# Frequent words picked from the counts above to be dropped from the text
To_remove = ['one', 'also', 'known', 'general']
WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].apply(
    lambda text: " ".join([token for token in text.split() if token not in To_remove])
)

### Lemmatization

In [294]:
def _lemmatize_text(text):
    """Return `text` with each whitespace-separated token passed through Word.lemmatize()."""
    return " ".join([Word(token).lemmatize() for token in text.split()])


WikiDesc['Desc_Transformed'] = WikiDesc['Desc_Transformed'].apply(_lemmatize_text)

In [295]:
# Raw text of the article at row label 1, for comparison with its cleaned form
WikiDesc.loc[1, 'Desc']

'Sachin Ramesh Tendulkar (/?s?t??n t?n?du?lk?r/ (About this soundlisten); born 24 April 1973) is a former Indian international cricketer and a former captain of the Indian national team, regarded as the greatest batsman of all time.[4] He is the highest run scorer of all time in International cricket. Tendulkar took up cricket at the age of eleven, made his Test debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for close to twenty-four years. He is the only player to have scored one hundred international centuries, the first batsman to score a double century in a ODI, the holder of the record for the most number of runs in both Test and ODI, and the only player to complete more than 30,000 runs in international cricket.[5] He is colloquially known as Little Master or Master Blaster,[6][7][8][9] and often referred to as the God of Cricket by Indian cricket followers.[10][11] Despite his rep

In [296]:
# Cleaned text of the article at row label 1
WikiDesc.loc[1, 'Desc_Transformed']

'sachin ramesh tendulkar stn tndulkr soundlisten born april former indian international cricketer former captain indian national team regarded greatest batsman time highest run scorer time international cricket tendulkar took cricket age eleven made test debut november pakistan karachi age sixteen went represent mumbai domestically india internationally close twentyfour year player scored hundred international century first batsman score double century odi holder record number run test odi player complete run international cricket colloquially little master master blaster often referred god cricket indian cricket follower despite reputation modesty humility stating god cricket make mistake god doesnt'

In [297]:
# Raw text of the article at row label 14, for comparison with its cleaned form
WikiDesc.loc[14, 'Desc']

'International Business Machines Corporation (IBM) is an American multinational information technology company headquartered in Armonk, New York, with operations in over 170 countries. The company began in 1911 as the Computing-Tabulating-Recording Company (CTR) and was renamed "International Business Machines" in 1924. IBM produces and sells computer hardware, middleware and software, and provides hosting and consulting services in areas ranging from mainframe computers to nanotechnology. IBM is also a major research organization, holding the record for most U.S. patents generated by a business (as of 2018) for 25 consecutive years.[5] Inventions by IBM include the automated teller machine (ATM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, the UPC barcode, and dynamic random-access memory (DRAM). The IBM mainframe, exemplified by the System/360, was the dominant computing platform during the 1960s and 1970s.\n'

In [298]:
# Cleaned text of the article at row label 14
WikiDesc.loc[14, 'Desc_Transformed']

'international business machine corporation ibm american multinational information technology company headquartered armonk new york operation country company began computingtabulatingrecording company ctr renamed international business machine ibm produce sell computer hardware middleware software provides hosting consulting service area ranging mainframe computer nanotechnology ibm major research organization holding record u patent generated business consecutive year invention ibm include automated teller machine atm floppy disk hard disk drive magnetic stripe card relational database sql programming language upc barcode dynamic randomaccess memory dram ibm mainframe exemplified system dominant computing platform s s'

In [299]:
# Learn the vocabulary from the cleaned text and build the TF-IDF matrix
# (one row per document, one column per vocabulary term).
corpus = WikiDesc['Desc_Transformed']
tfidf_vec = TfidfVectorizer()
test_tfidf = tfidf_vec.fit_transform(corpus)
print(test_tfidf.shape)
test_tfidf

(15, 762)


<15x762 sparse matrix of type '<class 'numpy.float64'>'
	with 985 stored elements in Compressed Sparse Row format>

In [300]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on old releases that do not provide it yet.
if hasattr(tfidf_vec, 'get_feature_names_out'):
    words = tfidf_vec.get_feature_names_out().tolist()  # plain list of str, as before
else:
    words = tfidf_vec.get_feature_names()
print(len(words))
words[0:10]

762


['ability',
 'academic',
 'account',
 'accounted',
 'accurately',
 'acquisition',
 'act',
 'activity',
 'additionally',
 'admission']

In [301]:
# Non-Negative Matrix Factorization
# Four components: every document is described by 4 weights instead of the full TF-IDF vocabulary.
# random_state is fixed so that re-running the notebook reproduces the same factorization.
model = NMF(n_components=4, random_state=42)
model.fit(test_tfidf)
nmf_features = model.transform(test_tfidf)

In [302]:
# Shapes, in order: the cleaned text column, the TF-IDF matrix (15 samples x 762
# features, which needs dimensionality reduction), the NMF output (15 x 4) and
# the NMF components (4 components x 762 features).
for shape in (
    WikiDesc['Desc_Transformed'].shape,
    test_tfidf.shape,
    nmf_features.shape,
    model.components_.shape,
):
    print(shape)

(15,)
(15, 762)
(15, 4)
(4, 762)


In [303]:
# Looking at these nmf_features lets us spot patterns across the 4 components.
# The analysis works like this:
# out of the 762 features, words such as 'cricket', 'international', 'indian', 'team', 'test'
# occur together in records like 'Dhoni' and 'Sachin', so they end up in one component
# (['cricket', 'team', ...]), while another component groups ['student', 'institution', ...].
df = pd.DataFrame(data=nmf_features, index=WikiLabels['labels'])
df
# Example: the 'Sachin' document contains the words of component 0, so it gets a score in that column.

Unnamed: 0_level_0,0,1,2,3
labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Education,0.0,0.697134,0.0,0.0
Sachin Tendulkar,0.543518,0.0,0.0,0.002309
Information Technology,0.0,0.0,0.004336,0.671488
GATE,0.132291,0.268194,0.009429,0.106107
Ranji Trophy,0.406181,0.0,0.0,0.0
Gravity,0.0,0.0,0.464894,0.0
School,0.0,0.784458,0.0,0.0
Computer,0.0,0.0,0.0,0.54266
Physics,0.0,0.025788,0.441423,0.010966
Newton,0.006363,0.0,0.603913,0.004468


In [304]:
# Component weights of two cricket-related articles, printed one row per article
for article in ('Sachin Tendulkar', 'Dhoni'):
    print(list(df.loc[article]))

[0.5435177409054822, 0.0, 0.0, 0.0023092218122147874]
[0.5504743278144836, 0.0, 0.006552739755561959, 0.0]


In [305]:
# The first column (component 0) holds the largest value for both articles, so clustering will group them on that basis.

In [306]:
# Component weights of the physics-related articles, printed one row per article
for article in ('Physics', 'Newton', 'Gravity', 'Albert Einstein'):
    print(list(df.loc[article]))

[0.0, 0.02578798377097337, 0.4414228183155255, 0.010966310492415764]
[0.006362700159756607, 0.0, 0.6039130579603365, 0.004468111529097032]
[0.0, 0.0, 0.46489429126541293, 0.0]
[0.0, 0.0, 0.6343605979311828, 0.0]


In [307]:
# The third column (component 2) holds the largest value for all four articles, so clustering will group them on that basis.

In [308]:
# Weight of every feature (columns) within each NMF component (rows)
components_df = pd.DataFrame(data=model.components_)
components_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,752,753,754,755,756,757,758,759,760,761
0,0.0,0.0,0.000167,0.000167,0.0,0.0,0.0,0.0,0.043785,0.00948,...,0.070171,0.000167,0.043785,0.0,0.164007,0.0,0.100863,0.0,0.0,0.0
1,0.0,0.002313,0.0,0.0,0.0,0.04943,0.04943,0.038615,0.0,0.013302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060752
2,0.0,0.061764,0.043809,0.043809,0.027345,0.0,0.0,0.0,0.0,0.000308,...,0.038212,0.043809,0.0,0.040461,0.030757,0.0,0.0,0.0,0.0,0.0
3,0.041942,0.000863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006088,...,0.0,0.0,0.0,0.0,0.0,0.055214,0.038045,0.055214,0.051075,0.0


In [309]:
# Same component weights, with the columns named after the vocabulary terms
components_df = pd.DataFrame(data=model.components_, columns=words)
components_df

Unnamed: 0,ability,academic,account,accounted,accurately,acquisition,act,activity,additionally,admission,...,widely,wilhelm,witnessed,work,world,writing,year,yet,york,young
0,0.0,0.0,0.000167,0.000167,0.0,0.0,0.0,0.0,0.043785,0.00948,...,0.070171,0.000167,0.043785,0.0,0.164007,0.0,0.100863,0.0,0.0,0.0
1,0.0,0.002313,0.0,0.0,0.0,0.04943,0.04943,0.038615,0.0,0.013302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060752
2,0.0,0.061764,0.043809,0.043809,0.027345,0.0,0.0,0.0,0.0,0.000308,...,0.038212,0.043809,0.0,0.040461,0.030757,0.0,0.0,0.0,0.0,0.0
3,0.041942,0.000863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006088,...,0.0,0.0,0.0,0.0,0.0,0.055214,0.038045,0.055214,0.051075,0.0


In [310]:
# First component: show the features carrying the greatest weights.
# 'cricket' has the highest weight, possibly because it occurs more often than any other word (feature).
component = components_df.iloc[0]
print(component.nlargest(n=5))

cricket          0.618692
international    0.285682
indian           0.273586
test             0.227700
team             0.225801
Name: 0, dtype: float64


In [311]:
# Second component: show the features carrying the greatest weights.
component = components_df.iloc[1]
print(component.nlargest(n=5))

education      0.449623
school         0.401763
student        0.206097
institution    0.191788
university     0.156740
Name: 1, dtype: float64


In [312]:
# Array returned by model.transform(test_tfidf): one row per document, one column per NMF component.
nmf_features

array([[0.        , 0.69713357, 0.        , 0.        ],
       [0.54351774, 0.        , 0.        , 0.00230922],
       [0.        , 0.        , 0.00433635, 0.67148785],
       [0.13229142, 0.2681941 , 0.00942917, 0.10610663],
       [0.40618131, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.46489429, 0.        ],
       [0.        , 0.78445763, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.54265988],
       [0.        , 0.02578798, 0.44142282, 0.01096631],
       [0.0063627 , 0.        , 0.60391306, 0.00446811],
       [0.        , 0.53197035, 0.        , 0.        ],
       [0.43795715, 0.        , 0.        , 0.        ],
       [0.55047433, 0.        , 0.00655274, 0.        ],
       [0.        , 0.        , 0.6343606 , 0.        ],
       [0.00675387, 0.        , 0.        , 0.66493922]])

In [313]:
# random_state pins the centroid initialisation so the cluster numbering is the same on every run;
# n_init is given explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# fit_predict fits the model and returns the cluster index of every row in one call.
# NOTE: this rebinds `labels`, which earlier held the raw lines read from Labels.txt.
labels = kmeans.fit_predict(nmf_features)

In [317]:
# Pair every article title with the cluster number assigned to it.
df = (
    WikiLabels[['labels']]
    .rename(columns={'labels': 'Articles'})
    .assign(Category=labels)
)

In [323]:
# List the articles grouped by their assigned cluster number
df.sort_values(by=['Category'], ascending=True)

Unnamed: 0,Articles,Category
0,Education,0
3,GATE,0
6,School,0
10,Student,0
5,Gravity,1
8,Physics,1
9,Newton,1
13,Albert Einstein,1
2,Information Technology,2
7,Computer,2
