In [13]:
# import libraries
import nltk
import re
import sys
import sqlite3
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import gc 
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, make_scorer, f1_score, accuracy_score,hamming_loss
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import ClassifierChain
from nltk.corpus import wordnet
import time
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.utils.class_weight import compute_class_weight
from python_functions import *

def load_data(data_filepath):
    
    """
        This function takes in the database path and it reads the data
        
        
        return: The messages which are the predators, the response i.e the labels as a dataframe 
            and an array of all the labels
    """
    df=pd.read_csv(data_filepath, sep=",", header=0)
    new_df=df[df['sent_length']>20].dropna(subset=['clean_headline']).reset_index(drop=True)
    
    return new_df 

def preprocess(data_set,STOPWORDS):
    
    """
        This function takes in text and stopwords, it uses the function WORDCOUNT to count the 
        occurence of each word and add word with only one occurence to the stopwords, then uses function 
        CLEAN_DATA to remove the combined stopwords and also preprocesed the data
        
        return: Text devoid of noise.
    """
    # Count of each tokens in the dataset
    start = time.time()
    print("getting less frequent words in dataset ......")
    wordcount=word_count(data_set)
    new_stopword=wordcount[wordcount['frequency']==0]['Unigram'].values.tolist()
    print('collection of words completed.: {} mins'.format(round((time.time()-start)/60 , 2)))
    ## Adding our own stopwords
    STOPWORDS.extend(new_stopword)

    ## De-noising the dataset and normalisation
    print("starting data preprocessing ......")
    clean_data=clean_text_process(data_set,stopwords=STOPWORDS)
    print('data preprocessing completed.: {} mins'.format(round((time.time()-start)/60 , 2)))

    return clean_data

 
STOPWORDS = stopwords.words('english')   


In [6]:
df=load_data('model_df.csv')

In [7]:
STOPWORDS = stopwords.words('english')
df['new_cln_data']=preprocess(df['clean_headline'],STOPWORDS)

getting less frequent words in dataset ......
collection of words completed.: 0.03 mins
starting data preprocessing ......
data preprocessing completed.: 0.04 mins


In [8]:
df

Unnamed: 0,clean_headline,category,sent_length,new_cln_data
0,threat right wing supreme court anal zing trum...,POLITICS,68,threat right wing supreme court anal zing trum...
1,hillary clinton really wants think tough wall ...,POLITICS,52,hillary clinton really wants think tough wall ...
2,photo barack obama irish immigrant rd great gr...,POLITICS,84,photo barack obama irish immigrant rd great gr...
3,rethinking battlefield,TRAVEL,22,rethinking battlefield
4,scalia utter moral failure exposed,WELLNESS,34,scalia utter moral failure exposed
...,...,...,...,...
21967,spending lot time digital devices,WELLNESS,33,spending lot time digital devices
21968,zone use breath posture passion get flow state,POLITICS,46,zone use breath posture passion get flow state
21969,road term jerry brown ash rolls arnold schwarz...,POLITICS,53,road term jerry brown ash rolls arnold schwarz...
21970,shallow salesmanship carly fi,POLITICS,29,shallow salesmanship carly fi


In [10]:
## Changing label to categorical values
news_labels=df['category'].unique()
news_labels_dict={}
for index in range(len(news_labels)):
    news_labels_dict[news_labels[index]]=index
news_labels_dict 

{'POLITICS': 0, 'TRAVEL': 1, 'WELLNESS': 2, 'ENTERTAINMENT': 3}

In [11]:
labels = df['category'].apply(lambda x: news_labels_dict[x])
labels.head()

0    0
1    0
2    0
3    1
4    2
Name: category, dtype: int64

In [12]:
# Splitting to training and test split.

X_train, X_test, y_train, y_test = train_test_split(df['new_cln_data'],labels, 
                                                               test_size=0.25, 
                                                               random_state=3)

In [14]:
## Calculation of the class weight
weights= compute_class_weight(
           'balanced',
            np.unique(y_train), 
            y_train)

weights_dict = dict(zip( np.unique(y_train),weights))
weights_dict

311      0
14465    1
684      2
12460    3
        ..
6400     0
15288    0
11513    3
1688     0
5994     0
Name: category, Length: 16479, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


{0: 0.587360992301112,
 1: 1.9188402421984163,
 2: 1.0779042386185242,
 3: 1.1784181922196797}

## Training of model with different algorithms

## MultinomialNB

In [16]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',
                        ngram_range=(1, 2),)),
                              
                ('mnb', MultinomialNB())
    ])

    # specify parameters for grid search
parameters = {
    'tfidf__min_df': np.array([10, 20,30,40,50]), 
    'tfidf__sublinear_tf': np.array([True,False]),
    'mnb__alpha': np.linspace(0.1, 1.5, 10),
               
        
    }

    # create grid search object
model =  GridSearchCV(pipeline, param_grid=parameters, scoring='accuracy')
    
 


In [17]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [18]:
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, 
                                    target_names= df['category'].unique()))

				CLASSIFICATIION METRICS

               precision    recall  f1-score   support

     POLITICS       0.43      0.99      0.60      2371
       TRAVEL       0.00      0.00      0.00       681
     WELLNESS       0.23      0.01      0.01      1289
ENTERTAINMENT       0.19      0.00      0.01      1152

     accuracy                           0.43      5493
    macro avg       0.21      0.25      0.16      5493
 weighted avg       0.28      0.43      0.26      5493



  _warn_prf(average, modifier, msg_start, len(result))


## RandomForestClassifier

In [19]:
pipeline2 = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',
                        ngram_range=(1, 2),)),
                              
                ('rfc', RandomForestClassifier())
    ])
    # specify parameters for grid search
parameters2 = {
    'tfidf__min_df': np.array([10, 20,30,40,50]), 
    'tfidf__sublinear_tf': np.array([True,False]),
    'rfc__n_estimators': np.arange(100, 201, 10),
    'rfc__class_weight': (None,weights_dict)
               
        
    }

    # create grid search object
model2 =  GridSearchCV(pipeline2, param_grid=parameters2, scoring='accuracy')
    

In [None]:
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

In [None]:
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred2, 
                                    target_names= df['category'].unique()))

## LinearSVC

In [None]:
pipeline3 = Pipeline([
                ('tfidf', TfidfVectorizer(sublinear_tf=True,norm='l2',
                        ngram_range=(1, 2),)),
                              
                ('Lsvc', LinearSVC())
    ])
    # specify parameters for grid search
parameters3 = {
    'tfidf__min_df': np.array([10, 20,30,40,50]), 
    'tfidf__sublinear_tf': np.array([True,False]),
    'Lsvc__C': np.linspace(0.1, 1, 6),
    'Lsvc__class_weight': (None,weights_dict),
    'Lsvc__loss':np.array(['hinge', 'squared_hinge'])
               
        
    }

    # create grid search object
model3 =  GridSearchCV(pipeline3, param_grid=parameters3, scoring='accuracy')
        

In [None]:
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

In [None]:
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred3, 
                                    target_names= df['category'].unique()))