In [1]:
import os
import pickle

import nltk
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import TextNLP

In [2]:
def runNLP(data):
    """Run every document through the TextNLP cleaning steps.

    Each document is passed, in order, through ``prepareData``, ``slang``,
    ``removeStopWords``, ``stemmAndLemmatization`` and ``removepunctuations``
    of one shared ``TextNLP.TextNLP`` instance.

    Args:
        data: Iterable of raw documents. Presumably strings (the callers in
            this notebook pass the ``.data`` of a 20 Newsgroups bunch); the
            accepted type is whatever ``TextNLP.prepareData`` takes.

    Returns:
        A new list with the processed documents in input order. ``data``
        itself is not modified, so calling this again on the same raw input
        never processes already-processed text.
    """
    nlpText = TextNLP.TextNLP()
    # Order matters: each step receives the previous step's output.
    steps = (
        nlpText.prepareData,
        nlpText.slang,
        nlpText.removeStopWords,
        nlpText.stemmAndLemmatization,
        nlpText.removepunctuations,
    )
    processed = []
    for document in data:
        for step in steps:
            document = step(document)
        processed.append(document)
    return processed

In [3]:
# Load the 20 Newsgroups train and test splits (train first, then test),
# asking scikit-learn to remove headers, footers and quoted text.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)
# ======================= NLP ==========================
# Preprocess the documents of each split with runNLP (train first, then test).
train, test = (runNLP(bunch.data) for bunch in (twenty_train, twenty_test))

In [4]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to a list of stemmed tokens.

        The tokens come from the parent class's analyzer; each one is then
        passed through an English Snowball stemmer created with
        ``ignore_stopwords=True``.
        """
        english_stemmer = SnowballStemmer("english", ignore_stopwords=True)
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [english_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

In [5]:
class techniques:
    """Builders for fitted text-classification pipelines.

    Every public method assembles a three-step scikit-learn ``Pipeline``
    (count vectorizer -> TF-IDF transformer -> classifier), fits it on
    ``data`` / ``target`` and returns the fitted pipeline.
    """

    def _fit_pipeline(self, vectorizer, clf_step_name, classifier, data, target):
        """Assemble vectorizer -> TF-IDF -> classifier, fit it and return it."""
        pipeline = Pipeline([
            ('vect', vectorizer),
            ('tfidf', TfidfTransformer()),
            (clf_step_name, classifier),
        ])
        pipeline.fit(data, target)
        return pipeline

    def naiveBayes(self, data, target):
        """Fit multinomial naive Bayes on stemmed counts with English stop words removed."""
        vectorizer = StemmedCountVectorizer(stop_words='english')
        return self._fit_pipeline(vectorizer, 'clf', MultinomialNB(), data, target)

    def SVM(self, data, target):
        """Fit an ``SGDClassifier`` with hinge loss and L2 penalty on plain counts."""
        classifier = SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            max_iter=5,
            random_state=42,
        )
        return self._fit_pipeline(CountVectorizer(), 'clf-svm', classifier, data, target)

    def LinearSVC(self, data, target):
        """Fit a ``LinearSVC`` with default settings on plain counts."""
        # The bare name LinearSVC below resolves to the module-level import,
        # not to this method: class attributes are not in a method's scope.
        return self._fit_pipeline(CountVectorizer(), 'linear-svm', LinearSVC(), data, target)

    def RandomForest(self, data, target):
        """Fit a 100-tree random forest (max depth 20, fixed seed) on plain counts."""
        classifier = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=20)
        return self._fit_pipeline(CountVectorizer(), 'clf-RF', classifier, data, target)

In [6]:
# ======================= Training =======================
# Fit one pipeline per technique on the preprocessed training documents.
# Fits run in the listed order: naive Bayes, SVM, random forest, LinearSVC.
technique = techniques()
naiveBayesModel, SVMModel, RandomForestModel, LienarSVCModel = (
    fit(train, twenty_train.target)
    for fit in (
        technique.naiveBayes,
        technique.SVM,
        technique.RandomForest,
        technique.LinearSVC,
    )
)

In [13]:
# ======================= Testing =======================
# Accuracy of the LinearSVC pipeline on the test split: the fraction of
# predictions that equal the true labels.
predicted = LienarSVCModel.predict(test)
m = (predicted == twenty_test.target).mean()
print(m)

0.694105151354222


In [9]:
## Saving the models

# open(..., 'wb') does not create missing parent directories, so make sure
# the output directory exists before writing (no-op when it already does).
os.makedirs('savedModels', exist_ok=True)

# (file name, fitted pipeline) pairs, one pickle file each.
# 'LienarSVCModel' keeps its existing spelling because the loading cell
# opens 'savedModels/LienarSVCModel'.
# NOTE: naiveBayesModel contains a StemmedCountVectorizer, a class defined in
# this notebook. pickle stores only a reference to the class, so it must be
# defined (or importable) wherever that file is later loaded.
models_to_save = (
    ('naiveBayesModel', naiveBayesModel),
    ('SVMModel', SVMModel),
    ('RandomForestModel', RandomForestModel),
    ('LienarSVCModel', LienarSVCModel),
)
for model_name, fitted_model in models_to_save:
    with open('savedModels/' + model_name, 'wb') as picklefile:
        pickle.dump(fitted_model, picklefile)

In [11]:
# Round-trip check: read the pickled LinearSVC pipeline back from disk.
# pickle.load can execute arbitrary code, so only load files you trust
# (here: the file written by the saving cell of this notebook).
with open('savedModels/LienarSVCModel', 'rb') as training_model:
    loadedModel = pickle.load(training_model)

# Score the reloaded pipeline on the test split with the same accuracy measure.
predicted = loadedModel.predict(test)
m = (predicted == twenty_test.target).mean()
print(m)

0.694105151354222
