In [1]:
import pickle
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the NLTK resources this notebook relies on: 'wordnet' backs the
# WordNetLemmatizer and 'stopwords' supplies the English stop-word list used
# by content_parsing. The value of the last call is what the cell displays.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cheri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def content_parsing(df):
    """Clean the ``Content`` column and attach a numeric category code.

    The cleaning pipeline runs in this order:

    1. replace carriage returns / line breaks with spaces and collapse
       whitespace runs,
    2. drop double quotes and the possessive ``'s``,
    3. lower-case the text,
    4. drop the punctuation characters ``? ! , . : ;``,
    5. lemmatize every space-separated token as a verb (WordNet),
    6. remove English stop words (whole-word matches only),
    7. collapse whitespace runs again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Content``, ``Summary`` and ``Category``.
        The frame itself is left untouched; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        A new frame, indexed like ``df``, with the columns ``Content``,
        ``Summary``, ``Content_parsed``, ``Category`` and ``Category_code``.
        Categories missing from the code table are passed through unchanged
        in ``Category_code``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    text = df['Content']

    # Every pattern states regex=True/False explicitly: the pandas default
    # flipped from True to False in 2.0, which would otherwise turn '\s+'
    # and the stop-word patterns into literal (never matching) strings.
    text = text.str.replace('\r', ' ', regex=False)
    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace(r'\s+', ' ', regex=True)
    text = text.str.replace('"', '', regex=False)
    text = text.str.replace("'s", '', regex=False)

    text = text.str.lower()

    # Punctuation is removed literally; '.' and '?' must not act as regex
    # metacharacters here.
    for sign in "?!,.:;":
        text = text.str.replace(sign, '', regex=False)

    lemmatizer = WordNetLemmatizer()

    def _lemmatize(sentence):
        # Splitting on a single space (not arbitrary whitespace) keeps the
        # spacing intact so the join reproduces the token layout.
        return " ".join(
            lemmatizer.lemmatize(word, pos='v') for word in sentence.split(" ")
        )

    # Series.map keeps the original index, so frames with a filtered or
    # otherwise non-default index are handled correctly.
    text = text.map(_lemmatize)

    # Stop words are removed one pattern at a time, in list order, as
    # whole-word matches; re.escape guards against metacharacters.
    for stop_word in stopwords.words('english'):
        pattern = r"\b" + re.escape(stop_word) + r"\b"
        text = text.str.replace(pattern, '', regex=True)

    # Removing words leaves gaps behind; squeeze them back to single spaces.
    text = text.str.replace(r'\s+', ' ', regex=True)

    cols_list = ['Content', 'Summary', 'Content_parsed', 'Category']
    # assign() builds a new frame (the caller's df is not mutated) and the
    # explicit copy avoids SettingWithCopyWarning on the assignment below.
    result = df.assign(Content_parsed=text)[cols_list].copy()

    category_codes = {
        'business': 0,
        'entertainment': 1,
        'politics': 2,
        'sport': 3,
        'tech': 4
    }

    # Unknown categories fall through unchanged, matching the behaviour of
    # the DataFrame.replace call this supersedes, without relying on
    # implicit dtype downcasting.
    result['Category_code'] = result['Category'].map(
        lambda category: category_codes.get(category, category)
    )

    return result

In [4]:
# Locations are built with pathlib so the notebook runs on any OS; a
# back-slash separated literal only resolves on Windows.
dataset_path = Path('dataset') / 'dataset.csv'
pickle_dir = Path('pickles')

# Load the raw articles and run the text-cleaning pipeline.
df = pd.read_csv(dataset_path)
df = content_parsing(df)

# Hold out 15% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['Content_parsed'],
    df['Category_code'],
    test_size=0.15,
    random_state=12,
)

#vectorizing the text using TF-IDF vectorizer

#setting the parameters

ngram_range = (1,2)     # unigrams and bigrams
min_df = 10             # ignore terms found in fewer than 10 documents
max_df = 1.0            # no upper document-frequency cut-off
max_features = 300      # cap on the vocabulary size

# The text is already lower-cased and stripped of stop words by
# content_parsing, hence lowercase=False and stop_words=None.
tfidf = TfidfVectorizer(encoding = 'utf-8',
                        ngram_range = ngram_range,
                        stop_words = None,
                        lowercase = False,
                        min_df = min_df,
                        max_df = max_df,
                        max_features = max_features,
                        norm = 'l2',
                        sublinear_tf = True)

# Fit the vocabulary on the training split only, then reuse it for the
# test split so no test-set statistics leak into the features.
features_train = tfidf.fit_transform(x_train).toarray()
labels_train = y_train
features_test = tfidf.transform(x_test).toarray()
labels_test = y_test

# Persist every artefact as pickles/<name>.pickle. The directory is created
# on demand so a fresh checkout does not fail with FileNotFoundError.
pickle_dir.mkdir(parents=True, exist_ok=True)

artifacts = {
    'df': df,
    'features_train': features_train,
    'labels_train': labels_train,
    'features_test': features_test,
    'labels_test': labels_test,
    'tfidf': tfidf,
}

for artifact_name, artifact in artifacts.items():
    with open(pickle_dir / (artifact_name + '.pickle'), 'wb') as output:
        pickle.dump(artifact, output)