# 1. Data Exploration and Preprocessing

In [34]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load dataset
df = pd.read_csv(r"D:\Excelr\Assignments\Nlp and Naive Bayes\blogs.csv")

# Display basic information about the dataset
print(df.info())
print(df.head())

# Clean the text data (remove punctuation, convert to lowercase)
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)
    
from nltk.tokenize import word_tokenize

# Apply preprocessing function to your 'Data' column
def preprocess_text(text):
    text = text.lower()
    # Use the correct tokenizer explicitly
    tokens = word_tokenize(text)
    # Remove stopwords, etc.
    return " ".join(tokens)

df['cleaned_text'] = df['Data'].apply(preprocess_text)
# Apply preprocessing to the 'Data' column
df['cleaned_text'] = df['Data'].apply(preprocess_text)

# Check cleaned text
print(df['cleaned_text'].head())

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['Labels']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB
None
                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism
0    path : cantaloupe.srv.cs.cmu.edu ! magnesium.c...
1    newsgroups : alt.atheism path : cantaloupe.srv...
2    path : cantaloupe.srv.cs.cmu.edu ! das-news.ha...
3    path : cantaloupe.srv.cs.cmu.edu ! magnesium.c...
4    xref : cantaloupe.srv.cs.cmu.edu alt.atheism:5...
Name: cleaned_text, dtype: object

# 2. Naive Bayes Model for Text Classification

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on test set
y_pred = nb_model.predict(X_test)

# Evaluate model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.665
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.38      0.96      0.54        24
           comp.graphics       0.86      0.64      0.73        28
 comp.os.ms-windows.misc       0.88      0.85      0.86        33
comp.sys.ibm.pc.hardware       0.80      0.56      0.66        36
   comp.sys.mac.hardware       0.93      0.39      0.55        36
          comp.windows.x       1.00      0.14      0.24        36
            misc.forsale       0.76      0.79      0.78        24
               rec.autos       0.89      0.81      0.85        31
         rec.motorcycles       0.62      0.91      0.74        22
      rec.sport.baseball       0.95      0.59      0.73        32
        rec.sport.hockey       0.60      0.96      0.74        25
               sci.crypt       0.36      1.00      0.53        24
         sci.electronics       0.74      0.63      0.68        27
                 sci.med       0.94

# 3. Sentiment Analysis

In [44]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to categorize sentiment
def get_sentiment(text):
    sentiment_score = sia.polarity_scores(text)
    if sentiment_score['compound'] >= 0.05:
        return 'positive'
    elif sentiment_score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to the 'cleaned_text' column
df['sentiment'] = df['cleaned_text'].apply(get_sentiment)

# Display sentiment distribution
print(df['sentiment'].value_counts())

# Optionally, analyze sentiment per category
sentiment_by_category = df.groupby('Labels')['sentiment'].value_counts().unstack().fillna(0)
print(sentiment_by_category)

sentiment
positive    1323
negative     648
neutral       29
Name: count, dtype: int64
sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                   43.0      1.0      56.0
comp.graphics                 15.0      2.0      83.0
comp.os.ms-windows.misc       24.0      2.0      74.0
comp.sys.ibm.pc.hardware      22.0      0.0      78.0
comp.sys.mac.hardware         26.0      3.0      71.0
comp.windows.x                21.0      2.0      77.0
misc.forsale                  10.0      8.0      82.0
rec.autos                     28.0      1.0      71.0
rec.motorcycles               31.0      1.0      68.0
rec.sport.baseball            24.0      1.0      75.0
rec.sport.hockey              31.0      1.0      68.0
sci.crypt                     29.0      0.0      71.0
sci.electronics               21.0      2.0      77.0
sci.med                       34.0      1.0      65.0
sci.space                     34.0      2.0      

# 4. Evaluation

In [47]:
# Sentiment analysis evaluation (e.g., distribution by category)
print(df.groupby('Labels')['sentiment'].value_counts())


Labels                    sentiment
alt.atheism               positive     56
                          negative     43
                          neutral       1
comp.graphics             positive     83
                          negative     15
                          neutral       2
comp.os.ms-windows.misc   positive     74
                          negative     24
                          neutral       2
comp.sys.ibm.pc.hardware  positive     78
                          negative     22
comp.sys.mac.hardware     positive     71
                          negative     26
                          neutral       3
comp.windows.x            positive     77
                          negative     21
                          neutral       2
misc.forsale              positive     82
                          negative     10
                          neutral       8
rec.autos                 positive     71
                          negative     28
                          neutral       