# Loading Data ( News ) From the Files

In [1]:
import os
import pandas as pd

Base_Folder = r'D:\Materials\6th Semester\Natural Language Processing\Project\20_newsgroups'

Groups = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc'
]

All_News = []
Targets = []
for Group in Groups:
    CurFolder = Base_Folder + '\\' + Group
    CurListOfFiles = os.listdir(CurFolder)
    for file in CurListOfFiles:
        PATH = CurFolder + '\\' + file
        with open(PATH) as f:
            News = f.read()
            All_News.append(News)
            Targets.append(Group)

mydict = {'News':All_News ,'Group': Targets}

News_DF = pd.DataFrame(mydict)

X = News_DF['News']
y = News_DF['Group']
X

0        Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49...
1        Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
2        Newsgroups: alt.atheism\nPath: cantaloupe.srv....
3        Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
4        Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51...
                               ...                        
19992    Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...
19993    Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54...
19994    Xref: cantaloupe.srv.cs.cmu.edu talk.religion....
19995    Xref: cantaloupe.srv.cs.cmu.edu talk.religion....
19996    Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...
Name: News, Length: 19997, dtype: object

# Stop Words & Punctuation Removal

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# remove stopwords 
stop_words = set(stopwords.words("english"))
predoc_x= []
for doc in X:
    words = word_tokenize(doc)
    predoc = [word for word in words if word.casefold() not in stop_words]
    predoc_x.append(' '.join(predoc))

X = pd.Series(predoc_x)

#remove punctuation and any unnecessary char 
import re
import string

translator = str.maketrans('', '', string.punctuation)
for i in range(len(X)):
    doc = X[i]
    doc = re.sub(r'\d+', '', doc)
    doc = doc.translate(translator)
    X[i] = doc


X

0        Xref  cantaloupesrvcscmuedu altatheism altathe...
1        Xref  cantaloupesrvcscmuedu altatheism altathe...
2        Newsgroups  altatheism Path  cantaloupesrvcscm...
3        Xref  cantaloupesrvcscmuedu altatheism altpoli...
4        Xref  cantaloupesrvcscmuedu altatheism socmots...
                               ...                        
19992    Xref  cantaloupesrvcscmuedu altatheism talkrel...
19993    Xref  cantaloupesrvcscmuedu altatheism talkrel...
19994    Xref  cantaloupesrvcscmuedu talkreligionmisc t...
19995    Xref  cantaloupesrvcscmuedu talkreligionmisc t...
19996    Xref  cantaloupesrvcscmuedu talkabortion altat...
Length: 19997, dtype: object

# Lemmatizing & Stemming

In [3]:
from nltk.stem import WordNetLemmatizer,PorterStemmer

lemma_X= []
lemmatizer = WordNetLemmatizer() 
for doc in X:
    words = word_tokenize(doc)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemma_X.append(' '.join(lemmatized_words))
                
X = pd.Series(lemma_X)

stemmed_X = []
stemmer = PorterStemmer() 
for doc in X:
    words = word_tokenize(doc)
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_X.append(' '.join(stemmed_words))
                
X = pd.Series(stemmed_X)

for i in range(len(X)):
    doc = X[i]
    doc = re.sub(r'\b\w{1,2}\b', '', doc) 
    X[i] = doc

X

0        xref cantaloupesrvcscmuedu altath altatheismmo...
1        xref cantaloupesrvcscmuedu altath altatheismmo...
2        newsgroup altath path cantaloupesrvcscmuedu cr...
3        xref cantaloupesrvcscmuedu altath altpoliticsu...
4        xref cantaloupesrvcscmuedu altath socmotss rec...
                               ...                        
19992    xref cantaloupesrvcscmuedu altath talkreligion...
19993    xref cantaloupesrvcscmuedu altath talkreligion...
19994    xref cantaloupesrvcscmuedu talkreligionmisc ta...
19995    xref cantaloupesrvcscmuedu talkreligionmisc ta...
19996    xref cantaloupesrvcscmuedu talkabort altath ta...
Length: 19997, dtype: object

# Target ( Groups ) Encoding

In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

Encoder = preprocessing.LabelEncoder()
y = Encoder.fit_transform(y)

# Data Splitting

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF

In [6]:
#vectorization using TF_IDF 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
vectorizer.fit(X)
X_train_tfidf =  vectorizer.transform(X_train)
X_test_tfidf =  vectorizer.transform(X_test)

# Models training & Evaluation

In [7]:
def train_model(classifier, X_train_tfidf, y_train, X_test_tfidf, y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train_tfidf, y_train)
    # predict the labels on validation dataset
    predictions = classifier.predict(X_test_tfidf)
    return metrics.accuracy_score(predictions, y_test)

In [8]:
# Naive Bayes
accuracy = train_model(naive_bayes.MultinomialNB(alpha=1.6), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("Naive Bayes Accuracy: ", accuracy)

Naive Bayes Accuracy:  0.90025


In [9]:
# Logistic Regression
accuracy = train_model(linear_model.LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("Logistic Regression Accuracy: ", accuracy)

Logistic Regression Accuracy:  0.94975


In [10]:
# Support Vector Machine
accuracy = train_model(svm.SVC(kernel='linear'), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("SVM Accuracy: ", accuracy)

SVM Accuracy:  0.951
