<a href="https://colab.research.google.com/github/AdamSimion/NLP/blob/main/NLP_BEADAND%C3%93.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# News Classifier

This notebook analyzes the sentiment of news headlines and visualizes the results by news category.

Installations and Imports



In [None]:
!pip install nltk
!pip install imageio

In [36]:
import re
from collections import Counter

import imageio
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Győződjünk meg róla, hogy a szükséges NLTK erőforrások letöltve vannak

In [None]:
# Fetch the NLTK resources this notebook relies on: the VADER lexicon,
# the punkt tokenizer data (both variants) and the stop-word lists.
for resource in ('vader_lexicon', 'punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# Data

# Dataset [News Dataset](https://www.kaggle.com/datasets/setseries/news-category-dataset)

# **Dataset** containing categorized news articles

# 1. ADAT BETÖLTÉSE

In [None]:
from google.colab import drive

# Make the user's Google Drive available under /content/drive.
drive.mount('/content/drive')

# Location of the dataset CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/NewsCategorizer.csv'

# Attempt the load and report the outcome as a readable message instead of a
# traceback; on failure `df` is simply left undefined.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the file path.")
except pd.errors.EmptyDataError:
    print(f"Error: The file at {file_path} is empty.")
except pd.errors.ParserError:
    print(f"Error: Unable to parse the file at {file_path}. Check the file format.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
else:
    print("File loaded successfully.")

In [None]:
# Read the working frame from the same location as the load check above;
# reusing `file_path` keeps the two cells from drifting apart.
data = pd.read_csv(file_path)
data

# Csak a 'headline' és 'category' oszlopokat tartjuk meg

In [None]:
# Keep only the headline text and its category label.
data = data.loc[:, ['headline', 'category']]
# Remove rows without a headline. The explicit .copy() makes the result an
# independent frame, so adding columns to it later cannot trigger a
# SettingWithCopyWarning or write into a temporary slice.
data = data.dropna(subset=['headline']).copy()
data

# 2. ELŐFELDOLGOZÁS


*   Text lowercasing
*   Special character removal
*   Tokenization
*   Stop word removal
*   Token joining





In [41]:
# English stop words as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a headline for bag-of-words style processing.

    Steps: lowercase, replace every character that is not an ASCII letter or
    whitespace with a space, tokenize, drop English stop words, and join the
    remaining tokens with single spaces.

    Args:
        text: The raw headline as a ``str``.

    Returns:
        The cleaned headline as a single space-separated string (possibly
        empty if nothing survives the filtering).
    """
    # Text lowercasing
    text = text.lower()

    # Special character removal. Substituting a space (rather than deleting)
    # keeps words apart that were only separated by punctuation
    # ("well-known" -> "well known", not "wellknown") and splits contractions
    # into fragments ("don't" -> "don t") that the stop-word filter can drop.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stop word removal and token joining
    return ' '.join(word for word in tokens if word not in stop_words)

In [None]:
# Clean every headline (coerced to str first) and store the result in a new
# column next to the original text.
data['cleaned_headline'] = [preprocess_text(str(headline)) for headline in data['headline']]
data

# 3. Szentimentelemzés

---

# Egyszerű megközelítés: Használjunk egy előképzett szentimentelemző modellt / lexikont

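>  Pl. VADER (Valence Aware Dictionary and sEntiment Reasoner): szabályalapú, lexikonra épülő modell, amelyet eredetileg közösségimédia-szövegekre hangoltak, de rövid szövegeken, például hírcímeken is jól használható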


  





In [43]:
# Single shared analyzer instance; constructing it loads the VADER lexicon.
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Classify a text as 'Positive', 'Negative' or 'Neutral' using VADER.

    The decision is based on the ``compound`` score returned by
    ``sia.polarity_scores``. The cut-offs follow the thresholds recommended
    by the VADER authors and are inclusive: ``>= 0.05`` is positive,
    ``<= -0.05`` is negative, anything in between is neutral.

    Args:
        text: The text to score.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    compound = sia.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

In [None]:
# Score the ORIGINAL headline, not the cleaned one: VADER uses negations
# ("not", "no"), intensifiers, capitalisation and punctuation as signals, and
# the preprocessing step strips all of them (negations are stop words).
data['sentiment'] = data['headline'].map(lambda headline: analyze_sentiment(str(headline)))
data

# 5. EREDMÉNYEK VIZUALIZÁLÁSA
# Szentimentek eloszlása

In [None]:
# Number of headlines per sentiment label (sorted by frequency).
sentiment_counts = data['sentiment'].value_counts()

# Colour each bar by its label rather than by its position: value_counts()
# orders by frequency, so a positional colour list would attach colours to
# whichever sentiment happens to be most common.
sentiment_colors = {'Neutral': 'lightgray', 'Positive': 'lightgreen', 'Negative': 'salmon'}
bar_colors = [sentiment_colors.get(label, 'lightgray') for label in sentiment_counts.index]

ax = sentiment_counts.plot(kind='bar', color=bar_colors, title='Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

# Téma szerinti szentiment eloszlás

In [None]:
# Headline counts per (category, sentiment) pair, pivoted so that categories
# are rows and sentiments are columns; missing combinations become 0.
pair_counts = data.groupby(['category', 'sentiment']).size()
topic_sentiment = pair_counts.unstack(fill_value=0)

# One stacked bar per category.
ax = topic_sentiment.plot(kind='bar', stacked=True, title='Sentiment by Topic', figsize=(10, 6))
ax.set_xlabel('Topic')
ax.set_ylabel('Count')
plt.show()

# Kategóriánként százalékosan melyik mennyire pozitiv/negativ/semleges

In [None]:
# Share (in percent) of each sentiment within every category; rows sum to 100.
topic_sentiment_percentage = topic_sentiment.div(topic_sentiment.sum(axis=1), axis=0) * 100

# Colours are looked up by sentiment label. The columns of the pivot are not
# in Neutral/Positive/Negative order, so a positional colour list would paint
# the wrong sentiment with each colour.
sentiment_colors = {'Neutral': 'lightgray', 'Positive': 'lightgreen', 'Negative': 'salmon'}

# Size the grid from the number of categories so that none is dropped.
n_topics = len(topic_sentiment_percentage)
n_cols = 4
n_rows = max(1, -(-n_topics // n_cols))  # ceiling division, at least one row

# squeeze=False guarantees a 2-D array of axes even for a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
axes = axes.ravel()

# One pie chart per category.
for ax, (topic, row) in zip(axes, topic_sentiment_percentage.iterrows()):
    colors = [sentiment_colors.get(label, 'lightgray') for label in row.index]
    ax.pie(row, labels=row.index, autopct='%1.1f%%', startangle=90, colors=colors)
    ax.set_title(f"***{topic}***", fontstyle='italic', fontweight='bold')

# Remove the unused axes at the end of the grid.
for ax in axes[n_topics:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

# Kategóriánként mely szentiment a legdominánsabb

In [None]:
def most_predominant_sentiment(df: pd.DataFrame) -> pd.Series:
    """Return the most frequent sentiment label for every category.

    Args:
        df: Frame with a 'category' and a 'sentiment' column.

    Returns:
        A Series indexed by category whose values are the sentiment label
        with the highest row count in that category.
    """
    # Row counts per (category, sentiment) pair ...
    pair_counts = df.groupby(['category', 'sentiment']).size()
    # ... pivoted to a category x sentiment table, absent pairs counted as 0.
    count_table = pair_counts.unstack(fill_value=0)
    # Column label holding the row-wise maximum.
    return count_table.idxmax(axis=1)

# Dominant sentiment label per category.
most_predominant = most_predominant_sentiment(data)

# Percentage of each sentiment within every category (rows sum to 100).
topic_sentiment_percentage = data.groupby(['category', 'sentiment']).size().unstack(fill_value=0)
topic_sentiment_percentage = topic_sentiment_percentage.div(topic_sentiment_percentage.sum(axis=1), axis=0) * 100

category_order = topic_sentiment_percentage.index

# Build the summary frame in one step instead of concatenating row by row
# onto an empty frame (quadratic, and it leaves every column as object dtype).
filtered_df = pd.DataFrame({
    'category': list(category_order),
    'sentiment': [most_predominant[category] for category in category_order],
    'percentage': [
        topic_sentiment_percentage.loc[category, most_predominant[category]]
        for category in category_order
    ],
})

# Same colour convention as the original: gray = Neutral, green = Positive,
# salmon for anything else (i.e. Negative).
sentiment_colors = {'Neutral': 'lightgray', 'Positive': 'lightgreen'}
bar_colors = [sentiment_colors.get(sentiment, 'salmon') for sentiment in filtered_df['sentiment']]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(filtered_df['category'], filtered_df['percentage'], color=bar_colors)
# Rotate the tick labels that already exist rather than calling
# set_xticklabels, which warns when the tick positions were not fixed first.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Percentage %')
ax.set_xlabel('Category')
ax.set_title('Most Predominant Sentiment by Category')

plt.tight_layout()
plt.show()

# 6. Mentés feldolgozott adatokkal


In [None]:
# Optional export (disabled): uncomment the line below to write the processed
# frame, including the derived columns, to a CSV file in the working directory.
# data.to_csv("processed_news.csv", index=False)

# Köszönet a figyelemért  ✌

In [None]:
# Plain white RGB canvas (300 px high, 500 px wide) used as a background.
img = np.full((300, 500, 3), 255, dtype=np.uint8)
height, width = img.shape[:2]

plt.figure(figsize=(10, 6))
plt.imshow(img)
# ha='center' centres the text on the given x coordinate, so anchor both
# lines at the horizontal midpoint of the canvas.
plt.text(width / 2, 150, 'Thank you for the attention!', fontsize=30, color='purple', fontweight='bold', ha='center')
plt.text(width / 2, 200, '(づ｡◕‿‿◕｡)づ', fontsize=50, color='orange', ha='center')

plt.axis('off')
plt.show()