# Task - 1 : Data Selection

In [1]:
from sklearn.datasets import fetch_20newsgroups
import warnings
warnings.filterwarnings('ignore')
from time import time
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

### Load dataset

In [2]:
task1 = fetch_20newsgroups(subset='all',categories=['rec.motorcycles','sci.electronics'])

# Task 2: Vectorization and Building the Classifier Model

1.Define a common function to meet the following criterion:
- Convert the text data into vector form using the TF-IDF
vectorization method.
- Use the MultinomialNB() as classification method.
- Find the confusion matrix and classification report.
- Return the first 30 samples of predicted and actual output
as two lists.

In [3]:
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

def classify_text(text_data, labels):

    docs_train, docs_test, y_train, y_test = train_test_split(text_data, labels, test_size = 0.2, random_state = 42)
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs_train)
    
    clf = MultinomialNB()
    clf.fit(X, y_train)
    
    y_pred = clf.predict(X)
    cm = confusion_matrix(y_test, y_pred[:396])
    cr = classification_report(y_test, y_pred[:396])
    print("Confusion matrix:\n\n", cm,'\n')
    print("Classification report:\n\n", cr)
    print('Actual :\n',y_test[:30])
    print('Predicted :\n',y_pred[:30])

In [4]:
classify_text(task1.data,task1.target)

Confusion matrix:

 [[ 87 114]
 [101  94]] 

Classification report:

               precision    recall  f1-score   support

           0       0.46      0.43      0.45       201
           1       0.45      0.48      0.47       195

    accuracy                           0.46       396
   macro avg       0.46      0.46      0.46       396
weighted avg       0.46      0.46      0.46       396

Actual :
 [1 1 1 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 0 0 1]
Predicted :
 [0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 1 1]


### 2. Invoke the method above by passing the classifier object, trian, and test datasets.
### 3. Print the first 30 samples of predicted and actual output as a table.


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def text_classification(classifier, train_data, test_data):
    
    TFIDF = TfidfVectorizer(stop_words='english')
    X_train = TFIDF.fit_transform(train_data.data)
    X_test = TFIDF.transform(test_data.data)

    
    clf = classifier.fit(X_train, train_data.target)
    y_pred = clf.predict(X_test)

    
    cm = confusion_matrix(test_data.target, y_pred)
    cr = classification_report(test_data.target, y_pred)

    
    pred_output = list(y_pred[:30])
    actual_output = list(test_data.target[:30])

    return cm, cr, pred_output, actual_output

task1 = fetch_20newsgroups(subset='all', categories=['rec.motorcycles', 'sci.electronics'])
train = fetch_20newsgroups(subset='train', categories=['rec.motorcycles', 'sci.electronics'])
test = fetch_20newsgroups(subset='test', categories=['rec.motorcycles', 'sci.electronics'])

classifier = MultinomialNB()

cm, cr, pred_output, actual_output = text_classification(classifier, train, test)
print("Confusion matrix:\n", cm)
print("\nClassification report:\n", cr)
print("\nPredicted output:\n", pred_output)
print("\nActual output:\n", actual_output)

Confusion matrix:
 [[395   3]
 [ 13 380]]

Classification report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       398
           1       0.99      0.97      0.98       393

    accuracy                           0.98       791
   macro avg       0.98      0.98      0.98       791
weighted avg       0.98      0.98      0.98       791


Predicted output:
 [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0]

Actual output:
 [1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0]


# Task - 3 : Classification on Additional Categories
1. Add the following two categories to the previous input data:
'rec.sport.baseball,
'comp.graphics'
2. Repeat all the steps 

In [6]:
categories = ['rec.motorcycles', 'sci.electronics', 'rec.sport.baseball', 'comp.graphics']
task2 = fetch_20newsgroups(subset='all', categories=categories)

In [7]:
def classify_text(text_data, labels):

    docs_train, docs_test, y_train, y_test = train_test_split(text_data, labels, test_size = 0.2, random_state = 42)
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(docs_train)
    
    clf = MultinomialNB()
    clf.fit(X, y_train)
    
    y_pred = clf.predict(X)
    cm = confusion_matrix(y_test, y_pred[:790])
    cr = classification_report(y_test, y_pred[:790])
    print("Confusion matrix:\n\n", cm,'\n')
    print("Classification report:\n\n", cr)
    print('Actual :\n',y_test[:30])
    print('Predicted :\n',y_pred[:30])

In [8]:
classify_text(task2.data,task2.target)

Confusion matrix:

 [[50 57 42 49]
 [44 61 45 53]
 [58 52 46 48]
 [56 41 44 44]] 

Classification report:

               precision    recall  f1-score   support

           0       0.24      0.25      0.25       198
           1       0.29      0.30      0.29       203
           2       0.26      0.23      0.24       204
           3       0.23      0.24      0.23       185

    accuracy                           0.25       790
   macro avg       0.25      0.25      0.25       790
weighted avg       0.25      0.25      0.25       790

Actual :
 [1 0 1 2 0 0 0 0 3 2 1 1 3 1 0 1 1 2 2 0 0 0 3 1 0 1 2 2 1 2]
Predicted :
 [1 1 1 1 1 1 0 1 3 0 1 1 2 2 2 1 3 2 3 2 0 1 2 2 3 3 1 0 3 0]


In [9]:
def text_classification(classifier, train_data, test_data):
    
    TFIDF = TfidfVectorizer(stop_words='english')
    X_train = TFIDF.fit_transform(train_data.data)
    X_test = TFIDF.transform(test_data.data)

    
    clf = classifier.fit(X_train, train_data.target)
    y_pred = clf.predict(X_test)

    
    cm = confusion_matrix(test_data.target, y_pred)
    cr = classification_report(test_data.target, y_pred)

    
    pred_output = list(y_pred[:30])
    actual_output = list(test_data.target[:30])

    return cm, cr, pred_output, actual_output

task1 = fetch_20newsgroups(subset='all', categories=['rec.motorcycles', 'sci.electronics', 'rec.sport.baseball', 'comp.graphics'])
train = fetch_20newsgroups(subset='train', categories=['rec.motorcycles', 'sci.electronics', 'rec.sport.baseball', 'comp.graphics'])
test = fetch_20newsgroups(subset='test', categories=['rec.motorcycles', 'sci.electronics', 'rec.sport.baseball', 'comp.graphics'])

classifier = MultinomialNB()

cm, cr, pred_output, actual_output = text_classification(classifier, train, test)
print("Confusion matrix:\n", cm)
print("\nClassification report:\n", cr)
print("\nPredicted output:\n", pred_output)
print("\nActual output:\n", actual_output)

Confusion matrix:
 [[354   3   9  23]
 [  0 395   1   2]
 [  0   3 394   0]
 [ 31  12   2 348]]

Classification report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.91       389
           1       0.96      0.99      0.97       398
           2       0.97      0.99      0.98       397
           3       0.93      0.89      0.91       393

    accuracy                           0.95      1577
   macro avg       0.94      0.95      0.94      1577
weighted avg       0.94      0.95      0.94      1577


Predicted output:
 [2, 2, 2, 1, 1, 3, 2, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 2, 3, 0, 1, 1, 1, 0, 0, 2, 1, 0, 2, 3]

Actual output:
 [2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 0, 2, 0, 1, 0, 0, 1, 2, 3, 0, 1, 1, 1, 0, 0, 2, 1, 0, 2, 0]


# Task 4 : Predictions of Unseen Data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(task2.data, task2.target, test_size=0.2, random_state=42)

In [11]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

CV = CountVectorizer(stop_words=stopwords.words('english'))
X_train_vectors = CV.fit_transform(X_train)

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>


In [12]:
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train_vectors, y_train)

MultinomialNB()

In [13]:
test_data = [
    "What are the different parts of a computer?",
    "Playing baseball is good for one's health.",
    "In which games are you interested?",
    "It is the unknown around the corner that turns my wheels.",
    "I am interested in increasing the picture resolution of my computer.",
    "The team might not win if there is rain."]

In [14]:
test_data_vectors = CV.transform(test_data)
predicted_labels = NB_classifier.predict(test_data_vectors)

In [15]:
for i in range(len(test_data)):
    print(f"{test_data[i]} ==> {task1.target_names[predicted_labels[i]]}")

What are the different parts of a computer? ==> sci.electronics
Playing baseball is good for one's health. ==> rec.sport.baseball
In which games are you interested? ==> rec.sport.baseball
It is the unknown around the corner that turns my wheels. ==> rec.motorcycles
I am interested in increasing the picture resolution of my computer. ==> comp.graphics
The team might not win if there is rain. ==> rec.sport.baseball
