In [68]:
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
import html
import re
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier


In [69]:
#Loading the data set - training data.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers','footers', 'quotes'))
twenty_test  = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers','footers', 'quotes'))


In [70]:
class NLP:

    def prepareData(self,data):
        #Escaping out HTML characters 
        data=html.unescape(data)
    
        # remove hyperlinks 
        data = re.sub(r'https?:\/\/.\S+', "", data) 
        
        # remove hashtags
        data = re.sub(r'#', '', data) 
        
        # remove old style data text "RT" 
        data = re.sub(r'^RT[\s]+', '', data)

        #dictionary consisting of the contraction and the actual value 
        dictionary ={"'s":" is","n't":" not","'m":" am","'ll":" will", "'d":" would","'ve":" have","'re":" are"} 
        
        #replace the contractions 
        for key,value in dictionary.items(): 
            if key in data: 
                data=data.replace(key,value)

        #separate the words 
        data = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)",data) if s]) 
        
        #convert to lower case 
        data=data.lower()

        return data


    def slang(self,data):
        #replace the slang word with meaning
        file=open("slang.txt","r") 
        slang=file.read() 
        
        #seperating each line present in the file 
        slang=slang.split('\n') 
        
        data_tokens=data.split() 
        slang_word=[] 
        meaning=[] 
        
        #store the slang words and meanings in different lists 
        for line in slang: 
            temp=line.split("=") 
            slang_word.append(temp[0]) 
            meaning.append(temp[-1]) 
        
        #replace the slang word with meaning 
        for i,word in enumerate(data_tokens): 
            if word in slang_word: 
                idx=slang_word.index(word) 
                data_tokens[i]=meaning[idx] 
        data=" ".join(data_tokens)
        return data

    def stemmAndLemmatization(self,data):
        stemmer= PorterStemmer()
        lemmatizer=WordNetLemmatizer()

        data=word_tokenize(data)
        data_list=[] 
        for word in data:

            #word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
            data_list.append(word)
        data=" ".join(data_list)

        return data

    def removeStopWords(self,data):
        #import english stopwords list from nltk 
        stopwords_eng = stopwords.words('english')  
        
        data_tokens=data.split() 
        data_list=[] 
        #remove stopwords 
        for word in data_tokens: 
            if word not in stopwords_eng: 
                data_list.append(word) 
        data=" ".join(data_list)

        return data

    def removepunctuations(self,data):
        #remove punctuations
        data_tokens=data.split() 
        data_list=[]  
        
        for word in data_tokens:
            if word not in string.punctuation: 
                data_list.append(word) 
        data=" ".join(data_list)

        return data

In [71]:
def runNLP(data):
    
    for i , _ in enumerate(data):
        data[i] = filter.prepareData(data[i])
        data[i] = filter.slang(data[i])
        data[i] = filter.removeStopWords(data[i])
        data[i] = filter.stemmAndLemmatization(data[i])
        data[i] = filter.removepunctuations(data[i])

    return data

In [72]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [73]:
class techniques:

    def naiveBayes(self,data, target):
        stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
        model = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
        model = model.fit(data, target)
        return model

    def SVM(self, data, target):
        model = Pipeline([('vect',CountVectorizer()), ('tfidf', TfidfTransformer()),('clf-svm', SGDClassifier(loss='hinge',                penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])
        model = model.fit(data, target)
        return model

    def RF(self, data, target):
        model = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf-RF', RandomForestClassifier                       (n_estimators=1000, random_state=0))])
        model.fit(data, target)
        return model
    

In [74]:
# ======================= NLP ========================== 
train = runNLP(twenty_train.data)

In [75]:
# ======================= Training =======================
technique = techniques()
model = technique.SVM(train, twenty_train.target)

In [76]:
# ======================= Testing =======================
test = runNLP(twenty_test.data)

predicted = model.predict(test)
m = np.mean(predicted == twenty_test.target)
print (m)

0.6840148698884758
