In [5]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import nltk
import joblib
# Lift pandas' display limits (row count, column width, total width) so that
# frames and long text cells are rendered in full instead of being truncated.
for _display_option in ('display.max_rows', 'display.max_colwidth', 'display.width'):
    pd.set_option(_display_option, None)

In [6]:
# Fetch the NLTK resources used by the cleaning pipeline (no-op when already cached).
# 'punkt' serves older NLTK releases; 'punkt_tab' is the pickle-free tokenizer
# model that NLTK 3.9+ loads for sent_tokenize / word_tokenize.
nltk.download('punkt',quiet=True)
nltk.download('punkt_tab',quiet=True)
nltk.download('stopwords',quiet=True)
nltk.download('wordnet',quiet=True)

True

In [7]:
# Load the news-classification CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='MN-DS-news-classification.csv')

In [8]:
# Shared resources for clean_text: stemmer, compiled regexes and the stop-word list.
default_stemmer = PorterStemmer()
# Raw strings keep the regex escapes (\d, \[, \|) away from Python's own string
# escape handling; the compiled patterns are the same as before.
BAD_SYMBOLS_RE = re.compile(r"[^a-zA-Z,\d]")
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# NLTK's English stop words plus corpus-specific high-frequency words.
default_stopwords = stopwords.words('english')
default_stopwords = default_stopwords + [
    'said', 'would', 'even', 'according', 'could', 'year',
    # 'old' and 'one' used to be fused into the single string 'old,one' by a
    # misplaced quote, so neither word was actually filtered.
    'years', 'also', 'new', 'people', 'old', 'one', 'two', 'time',
    'first', 'last', 'say', 'make', 'best', 'get', 'three',
    # NOTE(review): 'year old' is a two-word entry and cannot equal a single
    # token; kept in case a consumer outside this cell relies on it.
    'year old', 'told', 'made', 'like', 'take', 'many', 'set', 'number',
    'month', 'week', 'well', 'back',
]
# Built once per cell run instead of once per document.
_STOPWORD_SET = frozenset(default_stopwords)  # O(1) membership tests
_LEMMATIZER = WordNetLemmatizer()
# Literal substitutions applied before the regex clean-up (same set as before).
_CHAR_REPLACEMENTS = (
    ('\n', ' '), ('\xa0', ' '), ('-', ' '),
    ('ó', 'o'), ('ğ', 'g'), ('á', 'a'), ("'", ' '),
)


def _strip_noise(text):
    """Remove URLs, IP addresses, digits and non-letter symbols from a string.

    URLs and IP addresses are removed first, while their hyphens, dots and
    digits are still intact. Running these patterns after hyphen replacement
    and digit stripping would leave URL fragments behind and could never
    match an IP address.
    """
    text = re.sub(r'http\S+', '', text)
    text = REPLACE_IP_ADDRESS.sub('', text)
    for old, new in _CHAR_REPLACEMENTS:
        text = text.replace(old, new)
    text = re.sub(r'\d+', '', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    return REPLACE_BY_SPACE_RE.sub(' ', text)


def clean_text(text):
    """Normalise one raw document into a space-separated string of stemmed tokens.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. remove URLs, IP addresses, digits and every non-letter symbol;
      3. tokenise once with NLTK and drop tokens shorter than three characters;
      4. drop stop words, then lemmatise (noun POS) and Porter-stem what is left;
      5. drop results shorter than three characters and join with single spaces.

    A token is discarded when its surface form, its lemma or its stem is a
    stop word. Checking the surface form matters because stemming rewrites
    many stop words (e.g. "according" -> "accord", "people" -> "peopl"), so a
    filter that only sees stems misses them. Checking the stem as well keeps
    inflected forms such as "making" -> "make" filtered.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The cleaned document; an empty string when no token survives.
    """
    normalized = _strip_noise(text.strip().lower())

    cleaned_tokens = []
    # word_tokenize performs sentence splitting internally.
    for token in word_tokenize(normalized):
        if len(token) < 3 or token in _STOPWORD_SET:
            continue
        lemma = _LEMMATIZER.lemmatize(token, pos='n')
        if lemma in _STOPWORD_SET:
            continue
        stem = default_stemmer.stem(lemma)
        # Stemming can shorten a token below the 3-character minimum.
        if len(stem) < 3 or stem in _STOPWORD_SET:
            continue
        cleaned_tokens.append(stem)

    return ' '.join(cleaned_tokens)

In [9]:
# One document per article: "<title> . <content>" (the " . " forces a sentence break).
# Missing values become empty strings so they do not enter the corpus as the
# literal token "nan"; column-wise concatenation avoids a Python call per row.
df['text'] = df['title'].fillna('').astype(str) + ' . ' + df['content'].fillna('').astype(str)

In [10]:
# Clean every document element-wise.
# NOTE: this overwrites df['text'] with its own cleaned form, so re-running the
# cell without first rebuilding the column cleans already-cleaned text.
df['text'] = df['text'].map(clean_text)

In [16]:
# Preview the first five cleaned documents.
df['text'].head(5)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       