In [15]:
import pandas as pd
import re

In [18]:
data=pd.read_csv('blogs.csv')
data.head(10)

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
5,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,alt.atheism
6,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
7,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
8,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,alt.atheism
9,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism


## Data Exploration and Preprocessing

In [19]:
data.shape

(2000, 2)

In [20]:
data['Labels'].value_counts()

Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: count, dtype: int64

In [21]:
data['Labels'].unique()

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

In [22]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

data['Data'] = data['Data'].apply(preprocess_text)

In [24]:
data

Unnamed: 0,Data,Labels
0,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
1,newsgroups altatheism\npath cantaloupesrvcscmu...,alt.atheism
2,path cantaloupesrvcscmuedudasnewsharvardedunoc...,alt.atheism
3,path cantaloupesrvcscmuedumagnesiumclubcccmued...,alt.atheism
4,xref cantaloupesrvcscmuedu altatheism53485 tal...,alt.atheism
...,...,...
1995,xref cantaloupesrvcscmuedu talkabortion120945 ...,talk.religion.misc
1996,xref cantaloupesrvcscmuedu talkreligionmisc837...,talk.religion.misc
1997,xref cantaloupesrvcscmuedu talkorigins41030 ta...,talk.religion.misc
1998,xref cantaloupesrvcscmuedu talkreligionmisc836...,talk.religion.misc


In [23]:
# Feature Extraction using TF-IDF
X = data['Data']
y = data['Labels']

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [26]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

 ## Naive Bayes Model for Text Classification

In [30]:
#Import Categorical Naive Bayes model
from sklearn.naive_bayes import MultinomialNB

#Create a Categorical Classifier
model = MultinomialNB()

# Train the model using the training sets
model.fit(X_train,y_train)
y_pred = model.predict(X_test)


In [31]:
from sklearn.metrics import confusion_matrix, classification_report

In [32]:
print("\nClassification Report:")
print(classification_report(y_test,y_pred))


Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.45      0.72      0.55        18
           comp.graphics       0.68      0.72      0.70        18
 comp.os.ms-windows.misc       0.80      0.73      0.76        22
comp.sys.ibm.pc.hardware       0.74      0.56      0.64        25
   comp.sys.mac.hardware       0.59      0.62      0.60        21
          comp.windows.x       0.82      0.72      0.77        25
            misc.forsale       0.83      0.56      0.67        18
               rec.autos       0.85      0.94      0.89        18
         rec.motorcycles       0.57      0.81      0.67        16
      rec.sport.baseball       0.76      0.89      0.82        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.78      0.74      0.76        19
         sci.electronics       0.41      0.56      0.47        16
                 sci.med       0.62      0.76      

## Sentiment Analysis

In [33]:
!pip install nltk



In [34]:
import nltk

In [35]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [36]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [37]:
sia = SentimentIntensityAnalyzer()


In [38]:
sentiments = []
for post in data['Data']:
    sentiment_score = sia.polarity_scores(post)
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    sentiments.append(sentiment)

data['Sentiment'] = sentiments

## Evaluation

In [39]:
# Evaluate sentiment distribution across different categories
sentiment_category_distribution = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("\nSentiment Distribution Across Categories:")
print(sentiment_category_distribution)


Sentiment Distribution Across Categories:
Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                     42        1        57
comp.graphics                   14        2        84
comp.os.ms-windows.misc         25        1        74
comp.sys.ibm.pc.hardware        20        3        77
comp.sys.mac.hardware           27        3        70
comp.windows.x                  19        3        78
misc.forsale                     8       11        81
rec.autos                       26        2        72
rec.motorcycles                 32        3        65
rec.sport.baseball              26        2        72
rec.sport.hockey                33        1        66
sci.crypt                       27        2        71
sci.electronics                 19        4        77
sci.med                         34        1        65
sci.space                       34        4        62
soc.religion.christian          26     

In [1]:
#end