In [None]:
!pip install scikit-multilearn

In [None]:
import nltk
import unidecode
import pandas as pd
import os, re, string, gzip, itertools
import numpy as np
import tensorflow as tf
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from collections import defaultdict
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import LabelPowerset,  ClassifierChain, BinaryRelevance

In [None]:
# Load the train and test splits of the 20 newsgroups corpus (train first,
# then test), shuffling each split.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [None]:
# Fetch the NLTK stop-word corpus (no-op when it is already present locally).
nltk.download('stopwords')
from nltk.corpus import stopwords

# Keep the stop words as a de-duplicated, deterministically ordered *list*:
# recent scikit-learn releases validate `stop_words` and reject a `set`,
# while a list is accepted by old and new releases alike.
stop_words = sorted(set(stopwords.words('english')))



In [None]:
def calc_accuracy_score(clf, train_data, train_target, test_data, test_target, gs = True):
    """Fit ``clf`` and print its accuracy on the training and the test data.

    Parameters
    ----------
    clf : estimator
        Object whose ``fit(train_data, train_target)`` returns the fitted
        estimator and which exposes ``predict``.
    train_data, train_target
        Samples and labels used for fitting and for the train accuracy.
    test_data, test_target
        Samples and labels used for the test accuracy.
    gs : bool, default True
        When true and the fitted estimator exposes a non-empty ``steps``
        attribute (as a ``Pipeline`` does), only the final step is named in
        the report. Otherwise the whole fitted estimator is printed.

    Returns
    -------
    None
        The two accuracy lines are printed, nothing is returned.
    """
    clf_ = clf.fit(train_data, train_target)
    predicted_test_target = clf_.predict(test_data)
    predicted_train_target = clf_.predict(train_data)

    # Previously this was hard-wired to `if True:`, which ignored `gs` and
    # raised AttributeError for any estimator without `.steps`.
    steps = getattr(clf_, 'steps', None)
    label = steps[-1][-1] if gs and steps else clf_

    print('For {} classifier accuracy on train data is {}'.format(label, accuracy_score(train_target, predicted_train_target)))
    print('For {} classifier accuracy on test data is {}'.format(label, accuracy_score(test_target, predicted_test_target)))
        

In [None]:
# TF-IDF features feeding scikit-multilearn problem-transformation wrappers.
# NOTE(review): these wrappers target multi-label problems, while the cells
# below fit them on `twenty_train.target` — confirm that is intended.

# Binary relevance over multinomial naive Bayes.
ch_mnb_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', BinaryRelevance(MultinomialNB())),
])

# Classifier chain over logistic regression.
ch_lr_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', ClassifierChain(LogisticRegression())),
])

# Label powerset over logistic regression.
lp_lr_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', LabelPowerset(LogisticRegression())),
])

clfs = [ch_mnb_clf, ch_lr_clf, lp_lr_clf]

In [None]:
# Train the classifier-chain pipeline on the training split, then label the
# held-out test documents with it.
ch_lr_clf = ch_lr_clf.fit(
    X=twenty_train.data,
    y=twenty_train.target,
)
predicted_test_target = ch_lr_clf.predict(X=twenty_test.data)


In [None]:
# Accuracy of the classifier-chain pipeline on the test split. The reference
# must be the test *labels* (`.target`), not the raw documents (`.data`),
# and scikit-learn expects the ground truth as the first argument.
accuracy_score(twenty_test.target, predicted_test_target.toarray())

In [None]:
# count =0
# for classfier in clfs:
#     count+=1
#     print("Model : {} ".format(count))
#     calc_accuracy_score(classfier, twenty_train.data, twenty_train.target, twenty_test.data, twenty_test.target, gs=False)
#     print("\n")
    

In [None]:
# One-vs-rest ensemble classifiers on top of TF-IDF features.

# Bagging.
bagging_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(BaggingClassifier(), n_jobs=1)),
])

# Gradient boosting.
boosting_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(GradientBoostingClassifier(), n_jobs=1)),
])

# AdaBoost.
ada_boosting_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(AdaBoostClassifier(), n_jobs=1)),
])

In [None]:
# calc_accuracy_score(boosting_clf, twenty_train.data, twenty_train.target, twenty_test.data, twenty_test.target)



In [None]:
# Fit the bagging pipeline and report its train/test accuracy.
calc_accuracy_score(
    bagging_clf,
    train_data=twenty_train.data,
    train_target=twenty_train.target,
    test_data=twenty_test.data,
    test_target=twenty_test.target,
)



In [None]:
# Fit the AdaBoost pipeline and report its train/test accuracy.
calc_accuracy_score(
    ada_boosting_clf,
    train_data=twenty_train.data,
    train_target=twenty_train.target,
    test_data=twenty_test.data,
    test_target=twenty_test.target,
)



In [None]:
# Standalone TF-IDF features: the vocabulary and IDF weights are learned on
# the training documents only and then re-used to encode the test documents.
vectorizer = TfidfVectorizer()
training_features = vectorizer.fit_transform(raw_documents=twenty_train.data)
test_features = vectorizer.transform(raw_documents=twenty_test.data)



In [None]:
# Classifier chain with a logistic-regression base model on the TF-IDF
# features. ClassifierChain and LogisticRegression are already imported in
# the notebook's import cell, so they are not re-imported here.
#
# require_dense=[False, True] tells the wrapper that the base estimator can
# take sparse feature matrices, so the TF-IDF matrices are passed as they are
# instead of materialising a dense copy of the whole corpus with .toarray().
classifier = ClassifierChain(LogisticRegression(), require_dense=[False, True])

# Train on the training split.
classifier.fit(training_features, twenty_train.target)

# Predict labels for the test split.
predictions = classifier.predict(test_features)

# Accuracy on the test split.
# NOTE(review): the prediction is densified before scoring, mirroring the
# `.toarray()` call on the pipeline's predictions earlier in the notebook;
# confirm its shape lines up with the 1-D `twenty_test.target`.
print("Accuracy = ",accuracy_score(twenty_test.target, predictions.toarray()))
print("\n")


In [None]:
# --- Bag-of-words -> TF-IDF baselines -------------------------------------
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])
text_clf_stop = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# --- One-vs-rest models on TF-IDF features with the NLTK stop words -------
SVC_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
LogReg_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

classfiers = [text_clf, text_clf_svm, text_clf_stop, SVC_pipeline, LogReg_pipeline, NB_pipeline]

# Hyper-parameter grids keyed by pipeline step name; defined here but not
# consumed anywhere in this cell.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# Fit every model in turn and print its train/test accuracy under a
# 1-based "Model : N" heading.
for count, classfier in enumerate(classfiers, start=1):
    print("Model : {} ".format(count))
    calc_accuracy_score(classfier, twenty_train.data, twenty_train.target, twenty_test.data, twenty_test.target)
    print("\n")
    