# Loading Data ( News ) From the Files

In [1]:
import os
import pandas as pd
pd.reset_option("display")
pd.set_option('display.max_colwidth', 150)

Base_Folder = r'D:\Materials\6th Semester\Natural Language Processing\Project\20_newsgroups'

Groups = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc'
]

All_News = []
Targets = []
for Group in Groups:
    CurFolder = Base_Folder + '\\' + Group
    CurListOfFiles = os.listdir(CurFolder)
    for file in CurListOfFiles:
        PATH = CurFolder + '\\' + file
        with open(PATH) as f:
            News = f.read()
            All_News.append(News)
            Targets.append(Group)

mydict = {'News':All_News ,'Group': Targets}

News_DF = pd.DataFrame(mydict)

X = News_DF['News']
y = News_DF['Group']
News_DF

Unnamed: 0,News,Group
0,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49960 alt.atheism.moderated:713 news.answers:7054 alt.answers:126\nPath: cantaloupe.srv.cs.cmu.edu!cra...,alt.atheism
1,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51060 alt.atheism.moderated:727 news.answers:7300 alt.answers:155\nPath: cantaloupe.srv.cs.cmu.edu!cra...,alt.atheism
2,Newsgroups: alt.atheism\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!usc...,alt.atheism
3,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51120 alt.politics.usa.constitution:1934\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7...,alt.atheism
4,"Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51121 soc.motss:139944 rec.scouting:5318\nNewsgroups: alt.atheism,soc.motss,rec.scouting\nPath: cantal...",alt.atheism
...,...,...
19992,"Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54482 talk.religion.misc:84566\nNewsgroups: alt.atheism,talk.religion.misc\nPath: cantaloupe.srv.cs.cm...",talk.religion.misc
19993,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54485 talk.religion.misc:84567 talk.origins:41255\nPath: cantaloupe.srv.cs.cmu.edu!magnesium.club.cc.c...,talk.religion.misc
19994,"Xref: cantaloupe.srv.cs.cmu.edu talk.religion.misc:84568 talk.politics.misc:180112 sci.skeptic:43753\nNewsgroups: talk.religion.misc,talk.politics...",talk.religion.misc
19995,Xref: cantaloupe.srv.cs.cmu.edu talk.religion.misc:84569 talk.religion.newage:19754\nPath: cantaloupe.srv.cs.cmu.edu!magnesium.club.cc.cmu.edu!new...,talk.religion.misc


# Stop Words & Punctuation Removal

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# remove stopwords 
stop_words = set(stopwords.words("english"))
predoc_x= []
for doc in X:
    words = word_tokenize(doc)
    predoc = [word for word in words if word.casefold() not in stop_words]
    predoc_x.append(' '.join(predoc))

X = pd.Series(predoc_x)

#remove punctuation and any unnecessary char 
import re
import string

translator = str.maketrans('', '', string.punctuation)
for i in range(len(X)):
    doc = X[i]
    doc = re.sub(r'\d+', '', doc)
    doc = doc.translate(translator)
    X[i] = doc

News_DF['News'] = X
News_DF

Unnamed: 0,News,Group
0,Xref cantaloupesrvcscmuedu altatheism altatheismmoderated newsanswers altanswers Path cantaloupesrvcscmuedu crabapplesrvcscmuedu bbandrewcmued...,alt.atheism
1,Xref cantaloupesrvcscmuedu altatheism altatheismmoderated newsanswers altanswers Path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu ...,alt.atheism
2,Newsgroups altatheism Path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom howlandrestonansnet usc sddhpcom nig...,alt.atheism
3,Xref cantaloupesrvcscmuedu altatheism altpoliticsusaconstitution Path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdc...,alt.atheism
4,Xref cantaloupesrvcscmuedu altatheism socmotss recscouting Newsgroups altatheism socmotss recscouting Path cantaloupesrvcscmuedu crabapplesr...,alt.atheism
...,...,...
19992,Xref cantaloupesrvcscmuedu altatheism talkreligionmisc Newsgroups altatheism talkreligionmisc Path cantaloupesrvcscmuedu magnesiumclubcccmued...,talk.religion.misc
19993,Xref cantaloupesrvcscmuedu altatheism talkreligionmisc talkorigins Path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostat...,talk.religion.misc
19994,Xref cantaloupesrvcscmuedu talkreligionmisc talkpoliticsmisc sciskeptic Newsgroups talkreligionmisc talkpoliticsmisc sciskeptic Path cantalou...,talk.religion.misc
19995,Xref cantaloupesrvcscmuedu talkreligionmisc talkreligionnewage Path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostateedu...,talk.religion.misc


# Lemmatizing & Stemming

In [3]:
from nltk.stem import WordNetLemmatizer,PorterStemmer

lemma_X= []
lemmatizer = WordNetLemmatizer() 
for doc in X:
    words = word_tokenize(doc)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemma_X.append(' '.join(lemmatized_words))
                
X = pd.Series(lemma_X)

stemmed_X = []
stemmer = PorterStemmer() 
for doc in X:
    words = word_tokenize(doc)
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_X.append(' '.join(stemmed_words))
                
X = pd.Series(stemmed_X)

for i in range(len(X)):
    doc = X[i]
    doc = re.sub(r'\b\w{1,2}\b', '', doc) 
    X[i] = doc

News_DF['News'] = X
News_DF

Unnamed: 0,News,Group
0,xref cantaloupesrvcscmuedu altath altatheismmoder newsansw altansw path cantaloupesrvcscmuedu crabapplesrvcscmuedu bbandrewcmuedu newsseicmuedu ci...,alt.atheism
1,xref cantaloupesrvcscmuedu altath altatheismmoder newsansw altansw path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom ...,alt.atheism
2,newsgroup altath path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom howlandrestonansnet usc sddhpcom nigelmsencom yale...,alt.atheism
3,xref cantaloupesrvcscmuedu altath altpoliticsusaconstitut path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom howlandre...,alt.atheism
4,xref cantaloupesrvcscmuedu altath socmotss recscout newsgroup altath socmotss recscout path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu...,alt.atheism
...,...,...
19992,xref cantaloupesrvcscmuedu altath talkreligionmisc newsgroup altath talkreligionmisc path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmued...,talk.religion.misc
19993,xref cantaloupesrvcscmuedu altath talkreligionmisc talkorigin path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostateedu pacif...,talk.religion.misc
19994,xref cantaloupesrvcscmuedu talkreligionmisc talkpoliticsmisc sciskept newsgroup talkreligionmisc talkpoliticsmisc sciskept path cantaloupesrvcscmu...,talk.religion.misc
19995,xref cantaloupesrvcscmuedu talkreligionmisc talkreligionnewag path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostateedu zapho...,talk.religion.misc


# Target ( Groups ) Encoding

In [4]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

Encoder = preprocessing.LabelEncoder()
y = Encoder.fit_transform(y)

News_DF['Group'] = y
News_DF

Unnamed: 0,News,Group
0,xref cantaloupesrvcscmuedu altath altatheismmoder newsansw altansw path cantaloupesrvcscmuedu crabapplesrvcscmuedu bbandrewcmuedu newsseicmuedu ci...,0
1,xref cantaloupesrvcscmuedu altath altatheismmoder newsansw altansw path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom ...,0
2,newsgroup altath path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom howlandrestonansnet usc sddhpcom nigelmsencom yale...,0
3,xref cantaloupesrvcscmuedu altath altpoliticsusaconstitut path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu europaenggtefsdcom howlandre...,0
4,xref cantaloupesrvcscmuedu altath socmotss recscout newsgroup altath socmotss recscout path cantaloupesrvcscmuedu crabapplesrvcscmuedu fsececmuedu...,0
...,...,...
19992,xref cantaloupesrvcscmuedu altath talkreligionmisc newsgroup altath talkreligionmisc path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmued...,19
19993,xref cantaloupesrvcscmuedu altath talkreligionmisc talkorigin path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostateedu pacif...,19
19994,xref cantaloupesrvcscmuedu talkreligionmisc talkpoliticsmisc sciskept newsgroup talkreligionmisc talkpoliticsmisc sciskept path cantaloupesrvcscmu...,19
19995,xref cantaloupesrvcscmuedu talkreligionmisc talkreligionnewag path cantaloupesrvcscmuedu magnesiumclubcccmuedu newsseicmuedu cisohiostateedu zapho...,19


# Data Splitting

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF

In [6]:
#vectorization using TF_IDF 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word', max_features=4800,stop_words = 'english')
vectorizer.fit(X)
X_train_tfidf =  vectorizer.transform(X_train)
X_test_tfidf =  vectorizer.transform(X_test)

# Models training & Evaluation

In [7]:
def train_model(classifier, X_train_tfidf, y_train, X_test_tfidf, y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train_tfidf, y_train)
    # predict the labels on validation dataset
    predictions = classifier.predict(X_test_tfidf)
    return metrics.accuracy_score(predictions, y_test)

In [8]:
# Naive Bayes
accuracy = train_model(naive_bayes.MultinomialNB(alpha=1.6), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("Naive Bayes Accuracy --> ", accuracy)

Naive Bayes Accuracy -->  0.901


In [9]:
# Logistic Regression
accuracy = train_model(linear_model.LogisticRegression(), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("Logistic Regression Accuracy --> ", accuracy)

Logistic Regression Accuracy -->  0.95


In [10]:
# Support Vector Machine
accuracy = train_model(svm.SVC(kernel='linear'), X_train_tfidf, y_train, X_test_tfidf,y_test)
print ("SVM Accuracy --> ", accuracy)

SVM Accuracy -->  0.9515
