In [10]:
import os
from collections import Counter

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from wordcloud import WordCloud

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import mlflow
import mlflow.sklearn
import joblib

In [11]:
# Move the working directory up one level before importing the project-local
# `src` package (presumably the notebook lives in a sub-folder of the project
# root — confirm against the repository layout).
# The guard makes this cell idempotent: an unconditional os.chdir('..') climbs
# one more directory on every re-run, which silently breaks relative paths
# while the already-cached `src` imports keep succeeding.
if not os.path.isdir('src'):
    os.chdir('..')
from src.loader import NewsDataLoader
from src.config import cfg
import src.utils as utils

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [13]:
# Instantiate the project's data loader (imported from src.loader) with no arguments.
loader = NewsDataLoader()

In [14]:
# Fetch the news dataset through the loader. The result is assumed to be a
# pandas DataFrame, given the .head() call below — TODO confirm in src.loader.
news_data = loader.get_news_data()

# Preview the first two rows.
news_data.head(2)

FileNotFoundError: [Errno 2] No such file or directory: '../data/rating.csv'

In [None]:
# Download the NLTK resources requested by this notebook:
# the 'punkt' tokenizer data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Run utils.get_clean_text over every value of the 'content' column and store
# the result in a new 'clean_content' column.
news_data['clean_content'] = news_data['content'].apply(utils.get_clean_text)

# Drop rows whose 'description' is missing. inplace=True mutates news_data
# itself, so every later cell sees the reduced frame.
news_data.dropna(subset=['description'], inplace=True)
# Last expression of the cell: per-column count of the remaining missing values.
news_data.isna().sum()

In [None]:
# Show the column labels of news_data to see which fields are available.
print(news_data.columns)

In [None]:
# Keep only the two columns that the train/test split below reads:
# 'title' and 'description'.
columns_selected = ['description', 'title']
news_data_dt = news_data[columns_selected]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# 'title' is passed as the model input and 'description' as the prediction target.
# NOTE(review): 'description' looks like free text rather than a categorical
# label — confirm this is the intended target before trusting the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    news_data_dt['title'],
    news_data_dt['description'],
    test_size=0.2,
    random_state=42,
)
# Report the shape of each of the four partitions on one line.
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

In [None]:
# Bag-of-words features: learn the vocabulary from the training split only,
# then encode the held-out split with that same fitted vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [None]:
# Fit a multinomial Naive Bayes model on the vectorized training data.
# The fit call stays as the cell's final expression so the fitted estimator
# is still rendered as the cell output.
classifier = MultinomialNB()
classifier.fit(X=X_train_vec, y=y_train)

In [None]:
# Score the fitted classifier on the held-out split.
predictions = classifier.predict(X=X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy : {accuracy}")
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Topic modelling: fit a 10-component Latent Dirichlet Allocation model on the
# training count matrix (seeded for repeatable results) and keep the
# transformed output of that fit in message_topics.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
)
message_topics = lda.fit_transform(X=X_train_vec)