### TEXT CLASSIFICATION USING NAIVE BAYES AND SENTIMENT ANALYSIS ON BLOG POSTS

In [None]:
import pandas as pd                         # importing essential libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch only the NLTK resources this notebook uses instead of the whole 'all'
# collection, which downloads every corpus/tagger/model and floods the output.
#   - 'stopwords': the English stop-word list used during preprocessing
#   - 'punkt' / 'punkt_tab': tokenizer data needed by word_tokenize
#     (which of the two is looked up depends on the installed NLTK version)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\nakul\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\nakul\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\nakul\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\nakul\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\nakul\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

True

#### Data Exploration and Preprocessing

In [None]:
# Load the blog posts from "blogs.csv" (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="blogs.csv")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [None]:
# Preprocessing
import string
def preprocess_text(text):
    """Normalize one document for bag-of-words feature extraction.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with NLTK's ``word_tokenize``, drop
    English stop words, and join the remaining tokens with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    # Imported locally under a private alias: a later cell rebinds the
    # module-level name `stopwords` to a plain list, after which
    # `stopwords.words(...)` would no longer work here.
    from nltk.corpus import stopwords as nltk_stopwords

    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation in one pass (deletes each char in string.punctuation)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords (set -> constant-time membership checks)
    stop_words = set(nltk_stopwords.words('english'))
    # Join the surviving tokens back into a string
    return ' '.join(word for word in tokens if word not in stop_words)

In [None]:
# Removing Punctuation
def remove_punctuation(text):
  """Return `text` with each punctuation character replaced by one space.

  "Punctuation" means the characters in ``string.punctuation``; all other
  characters (including newlines and existing spaces) are left as they are.
  """
  # Build a translation table sending every punctuation mark to ' ' and
  # apply it in a single pass over the string.
  to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
  return text.translate(to_space)

# Strip punctuation from every post, overwriting the "Data" column.
df["Data"] = df["Data"].map(remove_punctuation)
df.head()          # df after removing punctuation

Unnamed: 0,Data,Labels
0,Path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
1,Newsgroups alt atheism\nPath cantaloupe srv ...,alt.atheism
2,Path cantaloupe srv cs cmu edu das news harva...,alt.atheism
3,Path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
4,Xref cantaloupe srv cs cmu edu alt atheism 53...,alt.atheism


In [None]:
# tokenization
def preprocess_text(text):
    """Tokenize one document, print its tokens and their count.

    NOTE: this reuses the name of the `preprocess_text` function defined in
    an earlier cell, so running this cell replaces that earlier definition.

    Args:
        text: The document to tokenize, as a string.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize``.
    """
    # Tokenize the argument itself. The previous version ignored `text` and
    # passed the entire df["Data"] column to word_tokenize, which only
    # accepts a single string.
    tokens = word_tokenize(text)
    print(tokens)
    print(len(tokens))
    return tokens

In [None]:
# Convert every post to lowercase using pandas' vectorized string accessor
# instead of a per-row lambda; missing values pass through as NaN rather
# than raising on `.lower()`.
df["Data"] = df["Data"].str.lower()
df.head()

Unnamed: 0,Data,Labels
0,path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
1,newsgroups alt atheism\npath cantaloupe srv ...,alt.atheism
2,path cantaloupe srv cs cmu edu das news harva...,alt.atheism
3,path cantaloupe srv cs cmu edu magnesium club...,alt.atheism
4,xref cantaloupe srv cs cmu edu alt atheism 53...,alt.atheism


In [None]:
# Removing the stopwords
# The word list gets its own name (not `stopwords`) so the
# `nltk.corpus.stopwords` object imported at the top of the notebook is not
# overwritten, and it is stored as a set so each membership test below is a
# hash lookup instead of a scan through a list.
english_stopwords = set(nltk.corpus.stopwords.words("english"))
df["Data"] = df["Data"].apply(
    lambda post: ' '.join(word for word in post.split() if word not in english_stopwords)
)

df.head()

Unnamed: 0,Data,Labels
0,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism
1,newsgroups alt atheism path cantaloupe srv cs ...,alt.atheism
2,path cantaloupe srv cs cmu edu das news harvar...,alt.atheism
3,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism
4,xref cantaloupe srv cs cmu edu alt atheism 534...,alt.atheism


#### Naive Bayes Model for Text Classification

In [None]:
# Split the raw text BEFORE extracting features so the test posts stay unseen
# while the vectorizer is fitted. test_size and random_state are unchanged.
y = df['Labels']
text_train, text_test, y_train, y_test = train_test_split(
    df['Data'], y, test_size=0.2, random_state=42
)

# Feature extraction: learn the vocabulary and IDF weights from the training
# posts only, then apply that fitted vectorizer to the test posts. Fitting on
# the full corpus would leak test-set statistics into the features and make
# the evaluation optimistic.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [None]:
# Predicted labels for the training split.
ypred_train = nb_classifier.predict(X=X_train)
print(ypred_train)

['rec.sport.baseball' 'comp.os.ms-windows.misc' 'rec.motorcycles' ...
 'rec.motorcycles' 'sci.space' 'sci.crypt']


In [None]:
# Predicted labels for the held-out test split.
y_predtest = nb_classifier.predict(X=X_test)
print(y_predtest)

['talk.politics.misc' 'comp.sys.ibm.pc.hardware' 'sci.med'
 'rec.sport.baseball' 'comp.sys.ibm.pc.hardware' 'sci.electronics'
 'rec.sport.baseball' 'talk.politics.mideast' 'alt.atheism' 'sci.med'
 'alt.atheism' 'sci.med' 'sci.crypt' 'rec.sport.baseball'
 'comp.sys.ibm.pc.hardware' 'comp.os.ms-windows.misc' 'rec.autos'
 'comp.graphics' 'talk.politics.guns' 'talk.politics.misc' 'misc.forsale'
 'talk.politics.misc' 'talk.religion.misc' 'rec.sport.hockey'
 'alt.atheism' 'sci.crypt' 'sci.crypt' 'rec.sport.baseball' 'rec.autos'
 'alt.atheism' 'misc.forsale' 'rec.sport.hockey' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.hockey' 'comp.sys.mac.hardware' 'sci.med'
 'rec.sport.hockey' 'rec.sport.hockey' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'talk.religion.misc' 'comp.os.ms-windows.misc'
 'comp.graphics' 'soc.religion.christian' 'rec.motorcycles' 'sci.crypt'
 'rec.motorcycles' 'alt.atheism' 'talk.politics.guns'
 'soc.religion.christian' 'sci.crypt' 'sci.space'
 'comp.os.ms-wind

####  Sentiment Analysis

In [None]:
# %pip install textblob   # uncomment and run once if TextBlob is not installed

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
   ------------- -------------------------- 204.8/626.3 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------  614.4/626.3 kB 4.8 MB/s eta 0:00:01
   ---------------------------------------- 626.3/626.3 kB 4.4 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Classify `text` as 'Positive', 'Neutral' or 'Negative'.

    The label follows the sign of TextBlob's polarity score for the text:
    greater than zero -> 'Positive', exactly zero -> 'Neutral',
    anything else -> 'Negative'.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Add a "Sentiment" column holding the label computed for each post.
# NOTE(review): by this point earlier cells have overwritten df['Data'] with
# lowercased text stripped of punctuation and stopwords, so the polarity is
# scored on the cleaned text rather than the original posts.
df['Sentiment'] = df['Data'].map(get_sentiment)
df.head()

Unnamed: 0,Data,Labels,Sentiment
0,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism,Positive
1,newsgroups alt atheism path cantaloupe srv cs ...,alt.atheism,Negative
2,path cantaloupe srv cs cmu edu das news harvar...,alt.atheism,Positive
3,path cantaloupe srv cs cmu edu magnesium club ...,alt.atheism,Positive
4,xref cantaloupe srv cs cmu edu alt atheism 534...,alt.atheism,Positive


#### Evaluation

In [None]:
# Evaluate the Naive Bayes classifier on the test split.
# Precision, recall and F1 are all computed with average='macro'.
accuracy = accuracy_score(y_test, y_predtest)
precision, recall, f1 = (
    metric(y_test, y_predtest, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

# Print one "<name>: <value>" line per metric.
for label, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, score)

Accuracy: 0.7875
Precision: 0.8015665398581167
Recall: 0.7982219039130805
F1 Score: 0.7789734897625344


In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Test split: print the classification report, then the confusion matrix.
for report in (classification_report, confusion_matrix):
    print(report(y_test, y_predtest))

                          precision    recall  f1-score   support

             alt.atheism       0.60      0.83      0.70        18
           comp.graphics       0.72      0.72      0.72        18
 comp.os.ms-windows.misc       0.75      0.95      0.84        22
comp.sys.ibm.pc.hardware       0.75      0.84      0.79        25
   comp.sys.mac.hardware       0.88      0.67      0.76        21
          comp.windows.x       1.00      0.28      0.44        25
            misc.forsale       0.74      0.78      0.76        18
               rec.autos       0.77      0.94      0.85        18
         rec.motorcycles       0.81      0.81      0.81        16
      rec.sport.baseball       0.83      0.83      0.83        18
        rec.sport.hockey       0.65      1.00      0.79        15
               sci.crypt       0.68      1.00      0.81        19
         sci.electronics       0.75      0.56      0.64        16
                 sci.med       0.88      0.88      0.88        17
         

In [None]:
# Training split: print the classification report, then the confusion matrix.
for report in (classification_report, confusion_matrix):
    print(report(y_train, ypred_train))

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.99      0.94        82
           comp.graphics       0.99      0.99      0.99        82
 comp.os.ms-windows.misc       0.99      1.00      0.99        78
comp.sys.ibm.pc.hardware       0.99      1.00      0.99        75
   comp.sys.mac.hardware       1.00      1.00      1.00        79
          comp.windows.x       1.00      0.99      0.99        75
            misc.forsale       1.00      1.00      1.00        82
               rec.autos       1.00      1.00      1.00        82
         rec.motorcycles       1.00      1.00      1.00        84
      rec.sport.baseball       1.00      0.99      0.99        82
        rec.sport.hockey       0.99      1.00      0.99        85
               sci.crypt       1.00      1.00      1.00        81
         sci.electronics       1.00      0.99      0.99        84
                 sci.med       1.00      0.99      0.99        83
         