# Text Preprocessing

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import string
pd.options.mode.chained_assignment = None

full_df = pd.read_csv("DATA.csv")
# print(full_df)

df = full_df

df["Text"] = df["Text"].astype(str)
full_df.head()
full_df.isnull().sum()   #check for null values
df



Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [2]:
#converting into lower case

df["new_text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,ArticleId,Text,Category,new_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex-boss launches defence lawyers defe...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in $168m payout eighteen former e...


In [3]:
#removing punctuations

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df["new_text"] = df["new_text"].apply(lambda text: remove_punctuation(text))
df.head()


Unnamed: 0,ArticleId,Text,Category,new_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens in ...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster bett...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses in 168m payout eighteen former en...


In [4]:
#removing stop words

from nltk.corpus import stopwords
stopwords=set(stopwords.words("english"))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in stopwords])

df["new_text"] = df["new_text"].apply(lambda text: remove_stopwords(text))
df.head()

Unnamed: 0,ArticleId,Text,Category,new_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens maj...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses 168m payout eighteen former enron...


In [5]:
#Lemmatizing

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer=WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

df["new_text"] = df["new_text"].apply(lambda text: lemmatize_words(text))
df.head()



Unnamed: 0,ArticleId,Text,Category,new_text
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launch defence lawyer defend f...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster well fu...
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss 168m payout eighteen former enron d...


In [6]:
#label encoding

from sklearn.preprocessing import LabelEncoder
# news_types=(df["Category"])
labelencoder=LabelEncoder()
df["news_types_cat"]=labelencoder.fit_transform(df["Category"])
df.head(10)
# df.Category.unique()

Unnamed: 0,ArticleId,Text,Category,new_text,news_types_cat
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom exboss launch defence lawyer defend f...,0
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicate economic gloom citizen major...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster well fu...,4
4,917,enron bosses in $168m payout eighteen former e...,business,enron boss 168m payout eighteen former enron d...,0
5,1582,howard truanted to play snooker conservative...,politics,howard truanted play snooker conservative lead...,2
6,651,wales silent on grand slam talk rhys williams ...,sport,wale silent grand slam talk rhys williams say ...,3
7,1797,french honour for director parker british film...,entertainment,french honour director parker british film dir...,1
8,2034,car giant hit by mercedes slump a slump in pro...,business,car giant hit mercedes slump slump profitabili...,0
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,fockers fuel festive film chart comedy meet fo...,1


# Splitting data

In [7]:
!pip install sklearn




You should consider upgrading via the 'c:\users\aditi\anaconda3\python.exe -m pip install --upgrade pip' command.


In [8]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(df['new_text'],df['news_types_cat'],test_size=0.2,random_state=8)

In [9]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer



tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

[[0.08457037 0.         0.10775278 ... 0.         0.10135638 0.0301139 ]
 [0.05568894 0.         0.07095438 ... 0.08308142 0.07701172 0.07046034]
 [0.         0.         0.         ... 0.         0.         0.11940016]
 ...
 [0.11568851 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.0360849 ]]
(298, 300)


# Random Forest

In [11]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

model  = RandomForestClassifier(random_state=1)
model.fit(features_train, labels_train)
model_predictions = model.predict(features_test)
print('Accuracy: ', accuracy_score(labels_test, model_predictions))
print(classification_report(labels_test, model_predictions))

Accuracy:  0.9261744966442953
              precision    recall  f1-score   support

           0       0.92      0.91      0.91        76
           1       0.98      0.91      0.95        47
           2       0.90      0.84      0.87        55
           3       0.96      1.00      0.98        65
           4       0.88      0.96      0.92        55

    accuracy                           0.93       298
   macro avg       0.93      0.92      0.93       298
weighted avg       0.93      0.93      0.93       298



In [12]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [13]:
from sklearn.model_selection import GridSearchCV
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
             min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(model, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
bestF = gridF.fit(features_train, labels_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   44.4s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  6.9min finished


In [14]:
bestF.best_params_

{'max_depth': 25,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1200}

In [19]:
from sklearn.ensemble import RandomForestClassifier
model1  = RandomForestClassifier(random_state=1,max_depth= 25, min_samples_leaf= 1, min_samples_split= 2, n_estimators= 1200)
model1.fit(features_train, labels_train)
model_predictions = model1.predict(features_test)
print('Accuracy: ', accuracy_score(labels_test, model_predictions))
print(classification_report(labels_test, model_predictions))

import joblib
joblib.dump(model1, 'RandomForest_spam_model.pkl')


Accuracy:  0.9328859060402684
              precision    recall  f1-score   support

           0       0.93      0.92      0.93        76
           1       0.98      0.91      0.95        47
           2       0.92      0.87      0.90        55
           3       0.96      1.00      0.98        65
           4       0.88      0.95      0.91        55

    accuracy                           0.93       298
   macro avg       0.93      0.93      0.93       298
weighted avg       0.93      0.93      0.93       298



['RandomForest_spam_model.pkl']

# Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=1)
model.fit(features_train, labels_train)
model_predictions = model.predict(features_test)
print('Accuracy: ', accuracy_score(labels_test, model_predictions))
print(classification_report(labels_test, model_predictions))

Accuracy:  0.8154362416107382
              precision    recall  f1-score   support

           0       0.84      0.78      0.81        76
           1       0.85      0.70      0.77        47
           2       0.76      0.71      0.74        55
           3       0.90      0.98      0.94        65
           4       0.72      0.87      0.79        55

    accuracy                           0.82       298
   macro avg       0.81      0.81      0.81       298
weighted avg       0.82      0.82      0.81       298



# KNeighbors Classifier

In [17]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(features_train, labels_train)
model_predictions = model.predict(features_test)
print('Accuracy: ', accuracy_score(labels_test, model_predictions))
print(classification_report(labels_test, model_predictions))

Accuracy:  0.9228187919463087
              precision    recall  f1-score   support

           0       0.92      0.87      0.89        76
           1       1.00      0.91      0.96        47
           2       0.89      0.91      0.90        55
           3       0.97      0.97      0.97        65
           4       0.85      0.96      0.91        55

    accuracy                           0.92       298
   macro avg       0.93      0.93      0.92       298
weighted avg       0.93      0.92      0.92       298



# Selecting best and implementing

In [20]:
clf_model = open('RandomForest_spam_model.pkl','rb')
clf = joblib.load(clf_model)

In [26]:
from flask import Flask,render_template,url_for,request
 
import pickle

app = Flask(__name__)
@app.route('/')
def home():
	return render_template('home.html')

@app.route('/predict',methods=['POST'])

def predict():
    clf_model = open('RandomForest_spam_model.pkl','rb')
    clf = joblib.load(clf_model)
    if request.method == 'POST':
	    message = request.form['message']
	    data = [message]
	    vect = cv.transform(data).toarray()
	    my_prediction = clf.predict(vect)
        
    return render_template('result.html',prediction = my_prediction)

if __name__ == '__main__':
	app.run(debug=True)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with windowsapi reloader


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
