##### Import Necessary Libraries

In [7]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

data = pd.read_csv("articles.csv", encoding = 'latin1')
print(data.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thars\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thars\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thars\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thars\AppData\Roaming\nltk_data...


                                             Article  \
0  Data analysis is the process of inspecting and...   
1  The performance of a machine learning algorith...   
2  You must have seen the news divided into categ...   
3  When there are only two classes in a classific...   
4  The Multinomial Naive Bayes is one of the vari...   

                                               Title  
0                  Best Books to Learn Data Analysis  
1         Assumptions of Machine Learning Algorithms  
2          News Classification with Machine Learning  
3  Multiclass Classification Algorithms in Machin...  
4        Multinomial Naive Bayes in Machine Learning  


In [14]:
#As we are working on a Natural Language Processing problem, we need to clean the textual 
# content by removing punctuation and stopwords. Here’s how we can clean the textual data:
def preprocess_text(text):
    #convert text to lower
    text = text.lower()
    #remove punctution
    text = text.translate(str.maketrans("", "", string.punctuation))
    #tokenize text
    tokens = nltk.word_tokenize(text)
    #remove punctuation
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    #lemmatize tokens
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in tokens]
    #join tokens to form preprocessed text
    preprocessed_text = " ".join(tokens)
    return preprocessed_text
    
data["Article"] = data["Article"].apply(preprocess_text)

In [18]:
#Now we need to convert the textual data into a numerical representation. We can use text vectorization here:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(data["Article"].values)

In [27]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(x)

topic_modelling = lda.transform(x)

topic_labels = np.argmax(topic_modelling, axis=1)
data["topic_labels"] = topic_labels

#now here's the final data with topic label's
print(data.head())

                                             Article  \
0  data analysis process inspecting exploring dat...   
1  performance machine learning algorithm particu...   
2  must seen news divided category go news websit...   
3  two class classification problem problem binar...   
4  multinomial naive bayes one variant naive baye...   

                                               Title  topic_labels  
0                  Best Books to Learn Data Analysis             2  
1         Assumptions of Machine Learning Algorithms             3  
2          News Classification with Machine Learning             1  
3  Multiclass Classification Algorithms in Machin...             3  
4        Multinomial Naive Bayes in Machine Learning             1  
