In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet as wn
import html
import re
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [122]:
#Loading the data set - training data
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True)

In [123]:
# You can check the target names (categories) and some data files by following commands.
print(twenty_train.target_names,'\n') #prints all the categories
print(twenty_train.data[0])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] 

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [None]:
class filterData:

    def prepareData(self,data):
        #Escaping out HTML characters 
        data=html.unescape(data)
    
        # remove hyperlinks 
        data = re.sub(r'https?:\/\/.\S+', "", data) 
        
        # remove hashtags
        data = re.sub(r'#', '', data) 
        
        # remove old style data text "RT" 
        data = re.sub(r'^RT[\s]+', '', data)

        #dictionary consisting of the contraction and the actual value 
        dictionary ={"'s":" is","n't":" not","'m":" am","'ll":" will", "'d":" would","'ve":" have","'re":" are"} 
        
        #replace the contractions 
        for key,value in dictionary.items(): 
            if key in data: 
                data=data.replace(key,value)

        #separate the words 
        data = " ".join([s for s in re.split("([A-Z][a-z]+[^A-Z]*)",data) if s]) 
        
        #convert to lower case 
        data=data.lower()

        return data


    def slang(self,data):
        #replace the slang word with meaning
        file=open("slang.txt","r") 
        slang=file.read() 
        
        #seperating each line present in the file 
        slang=slang.split('\n') 
        
        data_tokens=data.split() 
        slang_word=[] 
        meaning=[] 
        
        #store the slang words and meanings in different lists 
        for line in slang: 
            temp=line.split("=") 
            slang_word.append(temp[0]) 
            meaning.append(temp[-1]) 
        
        #replace the slang word with meaning 
        for i,word in enumerate(data_tokens): 
            if word in slang_word: 
                idx=slang_word.index(word) 
                data_tokens[i]=meaning[idx] 
        data=" ".join(data_tokens)
        return data

    def stemmAndLemmatization(self,data):
        stemmer= PorterStemmer()
        lemmatizer=WordNetLemmatizer()

        data=word_tokenize(data)
        data_list=[] 
        for word in data:

            #word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
            data_list.append(word)
        data=" ".join(data_list)

        return data

    def removeStopWords(self,data):
        #import english stopwords list from nltk 
        stopwords_eng = stopwords.words('english')  
        
        data_tokens=data.split() 
        data_list=[] 
        #remove stopwords 
        for word in data_tokens: 
            if word not in stopwords_eng: 
                data_list.append(word) 
        data=" ".join(data_list)

        return data

    def removepunctuations(self,data):
        #remove punctuations
        data_tokens=data.split() 
        data_list=[]  
        
        for word in data_tokens:
            if word not in string.punctuation: 
                data_list.append(word) 
        data=" ".join(data_list)

        return data

In [None]:
# Testing
data = "I enjoyed the event which took place , yesteday &amp; I luvd it ! , The link to the show is http://t.co/4ftYom0i It's awsome you'll luv it #HadFun #Enjoyed BFN GN"

data = twenty_train.data[0]
print(data,'\n\n')

filter = filterData()

data = filter.prepareData(data)
data = filter.slang(data)
data = filter.removeStopWords(data)
data = filter.stemmAndLemmatization(data)
data = filter.removepunctuations(data)


print(data)

In [99]:
# Extracting features from text files
#from sklearn.feature_extraction.text import CountVectorizer
#count_vect = CountVectorizer(stop_words='english')
#X_train_counts = count_vect.fit_transform(twenty_train.data)
#print(X_train_counts.shape)
#print(count_vect.get_feature_names())
#print(X_train_counts.toarray()[0])

In [110]:
# TF-IDF
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#X_train_tfidf.shape

In [None]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [132]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [133]:
# Performance of NB Classifier
import numpy as np
predicted = text_clf.predict(twenty_test.data)
print(np.mean(predicted == twenty_test.target))
#print(text_clf.predict(['i love cars']))

0.6062134891131173


In [136]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.6829527349973447

In [24]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [25]:
# Next, we create an instance of the grid search by passing the classifier, parameters 
# and n_jobs=-1 which tells to use multiple cores from user machine.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

In [30]:
# To see the best mean score and the params, run the following code

print(gs_clf.best_score_)
print(gs_clf.best_params_)

# Output for above should be: The accuracy has now increased to ~90.6% for the NB classifier (not so naive anymore! 😄)
# and the corresponding parameters are {‘clf__alpha’: 0.01, ‘tfidf__use_idf’: True, ‘vect__ngram_range’: (1, 2)}.

0.9157684864695698
{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [27]:
# Similarly doing grid search for SVM
from sklearn.model_selection import GridSearchCV
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)


gs_clf_svm.best_score_
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [28]:
# NLTK
# Removing stop words
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

In [29]:
# Stemming Code

import nltk
nltk.download()

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect), ('tfidf', TfidfTransformer()), 
                             ('mnb', MultinomialNB(fit_prior=False))])

text_mnb_stemmed = text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)

np.mean(predicted_mnb_stemmed == twenty_test.target)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


0.8167817312798725

In [75]:
#print(X_train_tfidf[0])
print(twenty_test.data[0:1])
#print(text_clf_svm.predict(twenty_test.data[0:1]))
print('class is : ',twenty_train.target_names[text_clf_svm.predict(twenty_test.data[5:6])[0]])

['From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n']
class is :  sci.med
