# Text preprocessing and grouping by topic

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
os.chdir('gdrive/My Drive/text_mining/Projet')

In [0]:
!ls

'BBC News Sample Solution.csv'	'BBC News Train.csv'
'BBC News Test.csv'		 my-submission.csv


In [0]:

import numpy as np
import pandas as pd

In [0]:
news = pd.read_csv ('BBC News Train.csv')



In [0]:
news.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [0]:
news=news[['Category','Text']]

In [0]:
import re
import nltk
nltk.download('stopwords')
#used once to download stopwords.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
#cleaning and preprocessing

for i in range(0,len(news['Text'])):

  feed = re.sub('[^a-zA-Z]',' ',news['Text'][i])
  feed = feed.lower()
  feed = feed.split()
  ps = PorterStemmer()
  feed = [ps.stem(word) for word in feed if not word in set(stopwords.words('english'))]
  feed = ' '.join(feed)

  news['Text'][i]=feed



In [0]:
groups = news.groupby('Category') 


In [0]:
groups.first()

Unnamed: 0_level_0,Text
Category,Unnamed: 1_level_1
business,worldcom ex boss launch defenc lawyer defend f...
entertainment,french honour director parker british film dir...
politics,howard truant play snooker conserv leader mich...
sport,wale silent grand slam talk rhi william say wa...
tech,lifestyl govern mobil choic faster better funk...


In [0]:
tech=groups.get_group('tech') 
business=groups.get_group('business') 
entertainment=groups.get_group('entertainment') 
politics=groups.get_group('politics') 
sport=groups.get_group('sport') 


In [0]:
len(tech),len(business),len(entertainment),len(politics),len(sport)

(261, 336, 273, 274, 346)

# topic modeling for tech articals

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [0]:
count_vect = CountVectorizer()
tech_term_matrix = count_vect.fit_transform(tech['Text'])

In [0]:
tech_term_matrix.shape

(261, 6243)

In [0]:
LDA_tech = LatentDirichletAllocation(n_components=5, random_state=42)
LDA_tech.fit(tech_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [0]:
for i,topic in enumerate(LDA_tech.components_):
  print(f'Top 10 words for topic #{i}:')
  print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
  print('\n')

Top 10 words for topic #0:
['betsen', 'auxerr', 'jorg', 'newham', 'function', 'fifa', 'romanian', 'paphiti', 'sloppi', 'llewellyn']


Top 10 words for topic #1:
['paphiti', 'verg', 'comeback', 'schuettler', 'santini', 'simpli', 'unsubstanti', 'overhaul', 'unstructur', 'romanian']


Top 10 words for topic #2:
['trickl', 'wreck', 'peck', 'swipe', 'overhaul', 'desper', 'mid', 'mcenro', 'romanian', 'function']


Top 10 words for topic #3:
['wreck', 'unruffl', 'swipe', 'betray', 'paphiti', 'sampl', 'mistim', 'overhaul', 'unstructur', 'romanian']


Top 10 words for topic #4:
['unruffl', 'correctli', 'overhaul', 'dwarf', 'schuettler', 'indiscret', 'unveil', 'share', 'swipe', 'unstructur']




# Topic modeling for sports articals

In [0]:
count_vect = CountVectorizer()
sport_term_matrix = count_vect.fit_transform(sport['Text'])

In [0]:
sport_term_matrix.shape

(346, 6282)

In [0]:
LDA_sport = LatentDirichletAllocation(n_components=3, random_state=42)
LDA_sport.fit(sport_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [0]:
for i,topic in enumerate(LDA_sport.components_):
  print(f'Top 10 words for topic #{i}:')
  print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
  print('\n')

Top 10 words for topic #0:
['time', 'new', 'england', 'team', 'game', 'wale', 'win', 'world', 'said', 'year']


Top 10 words for topic #1:
['final', 'open', 'england', 'time', 'year', 'first', 'game', 'win', 'play', 'said']


Top 10 words for topic #2:
['olymp', 'athlet', 'ferguson', 'ban', 'drug', 'arsen', 'test', 'game', 'unit', 'said']




In [0]:
de meme ...

SyntaxError: ignored