In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import files
uploaded = files.upload()

Saving blogs.csv to blogs.csv


In [6]:
df = pd.read_csv('blogs.csv')

In [7]:
# Data exploration
print("Dataset Information:")
print(df.info())
print("\nCategories Distribution:")
print(df['Labels'].value_counts())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None

Categories Distribution:
Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: coun

In [14]:
# Data preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization using NLTK
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

df['Data'] = df['Data'].apply(preprocess_text)

In [15]:
# Naive Bayes Model for Text Classification
# Split data into training and test sets
X = df['Data']
y = df['Labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
# Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)


In [18]:
# Predictions
y_pred = nb_classifier.predict(X_test_tfidf)

In [19]:
# Evaluation
print("\nNaive Bayes Classifier Metrics:")
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Naive Bayes Classifier Metrics:
Accuracy: 0.84
                          precision    recall  f1-score   support

             alt.atheism       0.55      0.94      0.69        18
           comp.graphics       0.80      0.89      0.84        18
 comp.os.ms-windows.misc       0.87      0.91      0.89        22
comp.sys.ibm.pc.hardware       0.88      0.84      0.86        25
   comp.sys.mac.hardware       0.79      0.90      0.84        21
          comp.windows.x       1.00      0.76      0.86        25
            misc.forsale       1.00      0.61      0.76        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.83      0.94      0.88        16
      rec.sport.baseball       0.74      0.94      0.83        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.86      1.00      0.93        19
         sci.electronics       0.76      0.81      0.79        16
                 sci.med   

In [20]:
# Sentiment Analysis using TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Data'].apply(get_sentiment)

In [21]:
# Sentiment analysis summary
print("\nSentiment Analysis Summary:")
sentiment_counts = df.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print(sentiment_counts)


Sentiment Analysis Summary:
Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                     36        0        64
comp.graphics                   28        0        72
comp.os.ms-windows.misc         22        0        78
comp.sys.ibm.pc.hardware        17        0        83
comp.sys.mac.hardware           26        0        74
comp.windows.x                  20        2        78
misc.forsale                    20        0        80
rec.autos                       24        0        76
rec.motorcycles                 28        0        72
rec.sport.baseball              38        0        62
rec.sport.hockey                44        0        56
sci.crypt                       22        0        78
sci.electronics                 24        0        76
sci.med                         34        0        66
sci.space                       29        0        71
soc.religion.christian          24        0        76

In [23]:
df.to_csv('blogs_preprocessed.csv', index=False)

print("\nPreprocessed dataset with sentiments saved to 'blogs_preprocessed.csv'")


Preprocessed dataset with sentiments saved to 'blogs_preprocessed.csv'
