<div style="font-size:20px">#Q1</div>

In [1]:
import pandas as pd
import string
from nltk import corpus
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
news_df = pd.read_csv('news.csv')

# Preprocess
# Report how many values are missing in each column before filtering
# (the recorded run shows 39 rows with no text).
print(news_df.isnull().sum())

# Keep only rows that have text, then keep the first occurrence of every
# (text, label) pair so repeated articles are counted once.
has_text = news_df["text"].notnull()
news_df = news_df.loc[has_text].drop_duplicates(subset=['text', 'label'], keep="first")


## part (A)
# Normalise the raw article text in place, one step per sub-question.

# a) lowercase text
news_df["text"] = news_df["text"].str.lower()

# b) remove digits
# regex=True is mandatory here: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so without it no digit is removed.
# The raw string avoids the invalid "\d" escape in a normal string literal.
news_df["text"] = news_df["text"].str.replace(r'\d+', '', regex=True)

# c) remove punctuation and special characters
# One translation table covers both ASCII punctuation and the typographic
# characters that string.punctuation does not include.
chars_to_strip = string.punctuation + '''’—“”»«›!©…•–'''
news_df["text"] = news_df["text"].str.translate(str.maketrans('', '', chars_to_strip))

# d) remove single characters
# split() also collapses every run of whitespace (spaces, newlines, tabs) into
# single spaces when the words are re-joined.
news_df["text"] = news_df["text"].apply(
    lambda x: ' '.join(word for word in x.split() if len(word) > 1)
)

# remove newline and tab
# Step d has already replaced all whitespace with single spaces, so this is
# only a safeguard; regex=True makes the character class behave as intended.
news_df["text"] = news_df["text"].str.replace(r'[\n\t]', ' ', regex=True)

# e,f) Lemmatization, Stemming and removing stop words
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
cachedStopWords = stopwords.words('english')
# The membership test below runs once per token of every article; a frozenset
# answers it in constant time instead of scanning the whole list each time.
stop_word_set = frozenset(cachedStopWords)


def _lemmatize_and_stem(text):
    """Drop stop words from `text`, then lemmatize and stem each remaining word.

    Parameters
    ----------
    text : str
        Whitespace-separated, already lower-cased article text.

    Returns
    -------
    str
        The surviving words, lemmatized then stemmed, joined by single spaces.
    """
    # NOTE(review): punctuation was stripped in step c, so contractions now look
    # like "dont"; if the NLTK list only holds the apostrophe forms they are not
    # filtered here -- confirm whether that is intended.
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in text.split()
        if word not in stop_word_set
    )


news_df["text"] = news_df["text"].apply(_lemmatize_and_stem)

## part (B)
# Creating the Bag of Words
# NOTE: this rebinds the name `corpus` that the import block took from nltk.
corpus = news_df["text"].tolist()

# Keep the 6000 most frequent unigrams, bigrams and trigrams as features.
cv = CountVectorizer(max_features=6000, ngram_range=(1,3))

# fit_transform returns a SciPy sparse matrix. It is kept sparse on purpose:
# calling .toarray() would allocate one cell per (document, feature) pair
# (about 20k x 6000 in the recorded run) although almost all counts are zero.
# train_test_split and MultinomialNB, used below, both accept sparse input.
x = cv.fit_transform(corpus)
y = news_df["label"]

## part (C)
# Hold out 20% of the samples for evaluation. The rows are shuffled before
# splitting and the seed is fixed so the same partition is produced each run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=2500,
)

## part (D)
# Fit a multinomial Naive Bayes model on the training counts, then label the
# held-out samples.
classifier = MultinomialNB()
classifier.fit(X=x_train, y=y_train)
y_pred = classifier.predict(X=x_test)

## part (E)
# Compare the predictions with the true labels of the held-out samples.

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(f"\nconfusion matrix for the model : \n{cm}")

# Per-class precision / recall / F1 plus the overall averages.
cr = classification_report(y_test, y_pred)
print(f"\nclassification report for the model : \n{cr}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nooshin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nooshin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


id           0
title      558
author    1957
text        39
label        0
dtype: int64
(20761, 5)
(20386, 5)
confusion matrix for the model : 
[[1929  145]
 [ 283 1721]]

classification report for the model : 
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      2074
           1       0.92      0.86      0.89      2004

    accuracy                           0.90      4078
   macro avg       0.90      0.89      0.89      4078
weighted avg       0.90      0.90      0.89      4078



<div style="font-size:20px">#Q2</div>

In [8]:
import pandas as pd
import re
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, FrozenPhrases
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

## part (A)
# Load dataset
# The newsgroups posts are read straight from the hosted JSON file.
NEWSGROUPS_URL = "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
df = pd.read_json(NEWSGROUPS_URL)

# Preview the first ten rows; as the last expression of the cell it is
# rendered as the cell output.
df.head(10)
# Optional sanity checks, left disabled:
#   print(df.shape)
#   print(df.isnull().sum())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nooshin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space
5,From: dfo@vttoulu.tko.vtt.fi (Foxvog Douglas)\...,16,talk.politics.guns
6,From: bmdelane@quads.uchicago.edu (brian manni...,13,sci.med
7,From: bgrubb@dante.nmsu.edu (GRUBB)\nSubject: ...,3,comp.sys.ibm.pc.hardware
8,From: holmes7000@iscsvax.uni.edu\nSubject: WIn...,2,comp.os.ms-windows.misc
9,From: kerr@ux1.cso.uiuc.edu (Stan Kerr)\nSubje...,4,comp.sys.mac.hardware


In [30]:
## part (B)
# Convert to list
data = df.content.values.tolist()

# Header fragments that are replaced by a single space before tokenising.
# Inside a character class "|" is a literal pipe, not alternation, so the
# classes are written [Aa] / [nN] / [tT] / [pP] instead of [A|a] etc.
# All patterns are raw strings (no invalid "\s"/"\d" escapes) and are compiled
# once here rather than being looked up again for every document.
HEADER_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r'From:\s+[^\n]+\n',
        r'(?:[\sA-Za-z0-9\-]+)?[Aa]rchive-name:',
        r'Last-modified:[^\n]+\n',
        r'Version:[^\n]+\n',
        r'Reply-To:[^\n]+\n',
        r'Lines:[^\n]+\n',
        r'N[nN][tT][pP]-Posting-Host:[^\n]+\n',
    )
]
EMAIL_RE = re.compile(r'\S*@\S*\s?')                 # any token containing "@"
WHITESPACE_RE = re.compile(r'\s+')                   # runs of whitespace
SPECIAL_CHARS_RE = re.compile("['~’—“”»«›!©…•–]")    # quotes and typographic marks
DIGITS_RE = re.compile(r'\d+')                       # runs of digits

content = []
# Preprocess
for row in data:
    # remove some patterns (must run before whitespace is collapsed, because
    # most of them rely on the newline that ends the header line)
    for header_re in HEADER_PATTERNS:
        row = header_re.sub(' ', row)

    # remove emails
    row = EMAIL_RE.sub('', row)
    # collapse whitespace (including newlines) into single spaces
    row = WHITESPACE_RE.sub(' ', row)
    # remove single quotes and other special characters
    row = SPECIAL_CHARS_RE.sub('', row)
    # remove digits
    row = DIGITS_RE.sub('', row)

    ## part (C)
    # lowercase content, remove punctuation and drop words shorter than 3
    # characters; the result is a list of tokens
    row = simple_preprocess(str(row), deacc=True, min_len=3)
    content.append(row)

## part (D)
# Build the bigram model
# Phrase detection is learned from the tokenised documents; min_count and
# threshold control how readily a word pair is accepted as a phrase.
phrase_model = Phrases(
    content,
    min_count=15,
    threshold=100,
)

# Work with the frozen export of the trained model from here on (per the
# original note: less RAM, faster processing). phrase_model.freeze() is the
# alternative spelling that was left disabled.
frozen_model = FrozenPhrases(phrase_model)

## part (E)
# extend stopwords of this dataset
# Start from NLTK's English list and add header words and very common verbs /
# fillers that appear across all newsgroups (the duplicate 'even' entry of the
# earlier list is dropped).
cachedStopWords = stopwords.words('english')
cachedStopWords.extend(['from', 'subject', 'organization', 'edu', 'use', 'not', 'would', 'say',
                        'could', 'know', 'good', 'get', 'done', 'try', 'many', 'some', 'nice', 'thank',
                        'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem',
                        'run', 'need', 'even', 'right', 'also', 'may', 'take', 'come'])
# The membership test below runs once per token of every document; a frozenset
# answers it in constant time instead of scanning the list each time.
stop_word_set = frozenset(cachedStopWords)

# remove stopwords
new_content = [
    [word for word in row if word not in stop_word_set]
    for row in content
]

# Build bigrams
# Apply the frozen phrase model to the filtered documents.
bigrams = frozen_model[new_content]

## part (F)
# Create Dictionary
# Built from the bigram-transformed documents.
dct = Dictionary(bigrams)

# Create Corpus: Term Document Frequency
# Every document is converted to its bag-of-words form with the dictionary.
corpus = list(map(dct.doc2bow, bigrams))

## part (G)
# Train the model on the corpus
# LDA training is stochastic; fixing random_state makes the topics identical
# on every run (same seed value as the train/test split used in Q1).
lda = LdaModel(corpus, id2word=dct, num_topics=20, random_state=2500)

# Raw view: every topic as a formatted 'weight*"word" + ...' string.
print(lda.print_topics(num_topics=20, num_words=10))
print("\n\n")

# Readable view: only the ten words of each topic. formatted=False yields
# (topic_id, [(word, weight), ...]) pairs, so the words are read directly
# instead of being parsed back out of the formatted string.
for idx, topic_terms in lda.show_topics(num_topics=20, num_words=10, formatted=False):
    print('Topic: {} \nWords: {}'.format(idx, [word for word, _ in topic_terms]))


[(0, '0.010*"university" + 0.007*"game" + 0.006*"one" + 0.006*"like" + 0.005*"article" + 0.005*"writes" + 0.004*"win" + 0.004*"card" + 0.004*"time" + 0.003*"dont"'), (1, '0.007*"pts" + 0.006*"team" + 0.005*"players" + 0.005*"player" + 0.004*"one" + 0.004*"article" + 0.004*"time" + 0.004*"writes" + 0.004*"like" + 0.004*"aids"'), (2, '0.008*"information" + 0.006*"space" + 0.006*"encryption" + 0.005*"technology" + 0.005*"data" + 0.004*"one" + 0.004*"university" + 0.004*"new" + 0.004*"mail" + 0.004*"writes"'), (3, '0.007*"sale" + 0.007*"new" + 0.006*"one" + 0.006*"article" + 0.006*"like" + 0.006*"writes" + 0.005*"window" + 0.004*"price" + 0.003*"university" + 0.003*"way"'), (4, '0.011*"file" + 0.009*"entry" + 0.006*"program" + 0.005*"section" + 0.004*"entries" + 0.004*"information" + 0.004*"send" + 0.004*"files" + 0.004*"info" + 0.004*"faq"'), (5, '0.012*"people" + 0.010*"writes" + 0.009*"article" + 0.006*"one" + 0.006*"dont" + 0.005*"like" + 0.004*"government" + 0.004*"gun" + 0.004*"well"