In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import textwrap
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import PorterStemmer
import spacy
import gensim
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis
import pickle
import pyLDAvis
import os
from gensim.models import CoherenceModel, TfidfModel
import pyLDAvis.gensim
import re
from bertopic import BERTopic
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, MeanShift, Birch
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from skopt.space import Real, Integer
from sklearn.mixture import GaussianMixture
from gensim.models.nmf import Nmf
import warnings
from gensim.parsing.preprocessing import preprocess_string
from top2vec import Top2Vec
from collections import Counter
from sklearn.metrics import silhouette_score
warnings.filterwarnings("ignore")

# Stop-word list shared by the preprocessing steps: NLTK's English list plus
# extra filler terms. Kept as a list so existing list operations on `stops`
# keep working.
stops = stopwords.words('english')
stops.extend([
    'has', 'been', 're', 'com', 'edu', 'use', 'said', 'would', 'could',
    'told', 'also', 'one', 'two', 'mr', 'new', 'year', 'people',
])


In [106]:
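# Dataset preparation (kept commented out; if re-enabled, the code below fetches the newsgroups subset and writes four_groups.csv)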

# from sklearn.datasets import fetch_20newsgroups

# categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'talk.politics.guns']


# remove = ('headers', 'footers', 'quotes')
# newsgroups_train = fetch_20newsgroups(subset = 'train', categories = categories, remove = remove)

# col_names = [label.split('.')[1] for label in newsgroups_train.target_names]

# df = pd.DataFrame({'text': newsgroups_train['data'], 'labels': [col_names[i] for i in newsgroups_train['target']]})
# df.to_csv('four_groups.csv', index=False)

In [253]:
class Preprocessor:
  """Configurable text-cleaning pipeline for a CSV file with a ``text`` column.

  Calling the instance reads the CSV at ``path`` and applies the enabled steps
  in a fixed order: drop missing rows, length filter, punctuation removal,
  newline removal, non-alphanumeric removal, lowercasing, whitespace
  normalization, stop-word removal, lemmatization, stemming, digit-word
  removal, short-word removal and a final whitespace normalization.

  The ``*_method`` helpers that take a DataFrame modify its ``text`` column in
  place; the helpers that take a string return a new string.
  """

  def __init__(self, path):
    """Store the CSV location; nothing is read until the instance is called."""
    self.path = path
    # spaCy pipeline, loaded lazily on the first lemmatization request.
    self._nlp = None

  def read_file(self):
    """Load the CSV at ``self.path`` into a new DataFrame."""
    return pd.read_csv(self.path)

  def drop_missing_vals(self, df):
    """Drop, in place, every row of ``df`` with a missing value in any column."""
    df.dropna(axis=0, how='any', inplace=True)

  def mark_long_reviews_method(self, df):
    """Return the rows whose text has at least 20 and fewer than 100 words.

    Adds a ``Num_words_text`` column to ``df`` as a side effect. The result is
    an independent copy, so later column assignments do not write to a view.
    """
    df['Num_words_text'] = df['text'].apply(lambda x: len(str(x).split()))
    mask = (df['Num_words_text'] < 100) & (df['Num_words_text'] >= 20)
    return df[mask].copy()

  def remove_punctuation_method(self, text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    return text.translate(str.maketrans('', '', string.punctuation))

  def remove_newline_characters_method(self, df):
    """Replace each newline in ``df['text']`` with a space (in place)."""
    df['text'] = df['text'].str.replace('\n', ' ', regex=False)

  def remove_digits_and_nonalphanumeric_characters_method(self, df):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Despite the method name, digits are kept: the character class allows 0-9.
    ``regex=True`` is required; without it pandas >= 2.0 treats the pattern as
    a literal string and removes nothing.
    """
    df['text'] = df['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

  def lowercase_text_method(self, df):
    """Lowercase ``df['text']`` (in place)."""
    df['text'] = df['text'].str.lower()

  def normalize_whitespace_method(self, text):
    """Collapse every run of whitespace in ``text`` to a single space."""
    return re.sub(r'\s+', ' ', text)

  def remove_stopwords_method(self, text):
    """Drop the whitespace-separated words of ``text`` found in ``stops``."""
    # Build the set once per document so each word lookup is O(1).
    stop_set = set(stops)
    return " ".join(word for word in text.split() if word not in stop_set)

  def _get_nlp(self):
    """Return the spaCy pipeline, loading it only on first use."""
    nlp = getattr(self, '_nlp', None)
    if nlp is None:
      nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
      self._nlp = nlp
    return nlp

  def lemmatization_method(self, text, allowed_postags=('NOUN', 'ADJ')):
    """Return the lemmas of the tokens whose POS tag is in ``allowed_postags``.

    The model is loaded once per instance rather than once per document.
    """
    # ''.join is a no-op for str; it also accepts an iterable of fragments.
    doc = self._get_nlp()(''.join(text))
    return ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)

  def stemming_method(self, text):
    """Return ``text`` with each whitespace-separated word Porter-stemmed."""
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word).strip() for word in text.split())

  def remove_digit_words_method(self, text):
    """Delete every word-like token of ``text`` that contains a digit.

    Only the token is removed; the surrounding whitespace is left in place.
    """
    return re.sub(r'\b\w*\d\w*\b', '', text)

  def remove_short_words_method(self, text):
    """Keep only the words of ``text`` that are longer than two characters."""
    return ' '.join(word for word in text.split() if len(word) > 2)

  def __call__(self,
               drop_missing=True,
               mark_long_reviews=False,
               remove_punctuation=True,
               remove_newline_characters=True,
               remove_digits_and_nonalphanumeric_characters=True,
               lowercase_text=True,
               normalize_whitespace=True,
               remove_stopwords=True,
               lemmatize=True,
               stem=True,
               remove_digit_words=False,
               remove_short_words=False,
               max_rows=20):
    """Read the CSV and return a DataFrame with the cleaned ``text`` column.

    Each boolean flag enables the step of the same name. ``max_rows`` limits
    processing to the first N rows of the file (default 20); pass ``None`` to
    process every row.
    """
    df = self.read_file()
    if max_rows is not None:
      # Copy so the in-place helpers below never write to a view of the file.
      df = df.head(max_rows).copy()

    if drop_missing:
      self.drop_missing_vals(df)
    else:
      # Missing texts would crash the string-based steps; treat them as empty.
      df['text'] = df['text'].fillna('')

    if mark_long_reviews:
      df = self.mark_long_reviews_method(df)

    if remove_punctuation:
      df['text'] = df['text'].apply(self.remove_punctuation_method)

    if remove_newline_characters:
      self.remove_newline_characters_method(df)

    if remove_digits_and_nonalphanumeric_characters:
      self.remove_digits_and_nonalphanumeric_characters_method(df)

    if lowercase_text:
      self.lowercase_text_method(df)

    if normalize_whitespace:
      df['text'] = df['text'].map(self.normalize_whitespace_method)

    if remove_stopwords:
      df['text'] = df['text'].apply(self.remove_stopwords_method)

    if lemmatize:
      df['text'] = df['text'].map(self.lemmatization_method)

    if stem:
      df['text'] = df['text'].map(self.stemming_method)

    if remove_digit_words:
      df['text'] = df['text'].apply(self.remove_digit_words_method)

    if remove_short_words:
      df['text'] = df['text'].map(self.remove_short_words_method)

    # Second pass: the removal steps above can leave runs of spaces behind.
    if normalize_whitespace:
      df['text'] = df['text'].map(self.normalize_whitespace_method)

    return df

In [254]:
# Bind a Preprocessor to the CSV file; nothing is read until the instance is called.
p = Preprocessor('four_groups.csv')

In [255]:
# Run the pipeline with every switch at its default, then show the cleaned text column.
df = p()
df['text']

'first rule humor sorri second rule thing bad joke spite request post list nicknam list plot devic keeper list obviou play last name full advanc idea list ol timer nicknam altath poster good post other start poster bobbi bill second rule humor copeland'

In [272]:
# Reload the raw CSV and discard every row that has a missing value in any column.
df = pd.read_csv('four_groups.csv').dropna(axis=0, how='any')

def remove_digit_words(text):
    """Delete every word-like token of `text` that contains at least one digit.

    Only the token itself is removed: surrounding whitespace and punctuation
    stay, so '16.7' leaves '.' behind.
    """
    return re.sub(r'\b\w*\d\w*\b', '', text)
  
# Inspect the document at index 1 before digit-bearing words are stripped.
df['text'][1]

'Hello,\nI purchased my new 486 with a NoName graphics card installed which is obviously \nSpeedstar 24 compatible. Its name is "VGA 4000 TrueColor".\nIt is accompanied with some drivers and the utilities VMODE, XMODE and\nat least one more MODE, as well as some drivers for Lotus, Windows, etc.\nOnly one of the drivers is told to provide the TrueColor mode, namely\nthe Windows 3.1 driver.\nNowhere else, except in the ad, is any pointer to the TrueColor mode.\nSome articles in this group about the Speedstar 24 and some other facts\nmade me believe that my card is compatible to that one.\n\nDoes anybody out there know how this mode can be adjusted? How can I write\na driver which allows me to have 16.7 millions of colors with a resolution\nof 640 x 480 with 45 Hz interlaced ?'

In [273]:
# Strip digit-bearing words from every document, then re-inspect the same sample.
df['text'] = df['text'].apply(remove_digit_words)
df['text'][1]

'Hello,\nI purchased my new  with a NoName graphics card installed which is obviously \nSpeedstar  compatible. Its name is "VGA  TrueColor".\nIt is accompanied with some drivers and the utilities VMODE, XMODE and\nat least one more MODE, as well as some drivers for Lotus, Windows, etc.\nOnly one of the drivers is told to provide the TrueColor mode, namely\nthe Windows . driver.\nNowhere else, except in the ad, is any pointer to the TrueColor mode.\nSome articles in this group about the Speedstar  and some other facts\nmade me believe that my card is compatible to that one.\n\nDoes anybody out there know how this mode can be adjusted? How can I write\na driver which allows me to have . millions of colors with a resolution\nof  x  with  Hz interlaced ?'

In [266]:
# Same regex as remove_digit_words, applied inline to a single document.
pattern = re.compile(r'\b\w*\d\w*\b')
pattern.sub('', df['text'][1])

'Hello,\nI purchased my new  with a NoName graphics card installed which is obviously \nSpeedstar  compatible. Its name is "VGA  TrueColor".\nIt is accompanied with some drivers and the utilities VMODE, XMODE and\nat least one more MODE, as well as some drivers for Lotus, Windows, etc.\nOnly one of the drivers is told to provide the TrueColor mode, namely\nthe Windows . driver.\nNowhere else, except in the ad, is any pointer to the TrueColor mode.\nSome articles in this group about the Speedstar  and some other facts\nmade me believe that my card is compatible to that one.\n\nDoes anybody out there know how this mode can be adjusted? How can I write\na driver which allows me to have . millions of colors with a resolution\nof  x  with  Hz interlaced ?'

In [None]:
# Inspect the document at index 0.
df['text'][0]

In [42]:
# Build a two-column frame: raw document text plus its short group label.
# NOTE(review): `newsgroups_train` and `col_names` are only assigned in the
# commented-out "Dataset preparation" cell, so this fails on a fresh kernel.
df = pd.DataFrame({'text': newsgroups_train['data'], 'labels': [col_names[i] for i in newsgroups_train['target']]})

In [53]:
# Write the frame to four_groups.csv without the index column.
df.to_csv('four_groups.csv', index=False)

In [54]:
# Read the saved CSV back into `df`.
df = pd.read_csv('four_groups.csv')

In [72]:
# Select the rows whose text is missing. `.isna()` tests element-wise, whereas
# an `is` comparison would test the identity of the whole Series.
df[df['text'].isna()]

In [166]:
# List the rows whose text is missing.
df[df['text'].isna()]

Unnamed: 0,text,labels
133,,space
176,,religion
202,,religion
220,,atheism
268,,politics
332,,religion
359,,religion
366,,politics
393,,graphics
410,,religion


In [71]:
# Spot-check one row by position; the output below shows its text is NaN.
df.iloc[2033]['text']

nan