# Exploring and Cleaning the data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('train_aug_in_kag_org.csv')

In [3]:
data.head(10)

Unnamed: 0,title,text,title_text,label
0,squirrel financial wellbeing,A Squirrel account makes it easy to stick to your budget and save. Start for free today!,squirrel financial wellbeing. A Squirrel account makes it easy to stick to your budget and save. Start for free today!,payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
1,senior insurance marketing,"Welcome to the home page of Senior Insurance Marketing, a full service brokerage specializing in Medicare and senior insurance products. Located in Lincoln, NE.","senior insurance marketing. Welcome to the home page of Senior Insurance Marketing, a full service brokerage specializing in Medicare and senior insurance products. Located in Lincoln, NE.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
2,3i financial group inc.,"A Small Boutique Firm Dedicated to Achieving Our Clients Goals.#sep#inform.invest.insure Get a Free, No Obligation Quote forAllYour Insurance Needs. 3i Financial offers a wide variety of insurance products to help suit your needs. Choose from individual, family, and group. GET A QUOTE TODAY Travel Insurance Get peace of mind with comprehensive travel insurance. Get a free quote today! Health & Dental Get affordable health Read more","3i financial group inc.. A Small Boutique Firm Dedicated to Achieving Our Clients Goals.#sep#inform.invest.insure Get a Free, No Obligation Quote forAllYour Insurance Needs. 3i Financial offers a wide variety of insurance products to help suit your needs. Choose from individual, family, and group. GET A QUOTE TODAY Travel Insurance Get peace of mind with comprehensive travel insurance. Get a free quote today! Health & Dental Get affordable health Read more",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
3,brownstone insurance,"Brownstone Insurance provides master insurance policies for multi-unit buildings throughout Massachusetts, as well as personal insurance products such as renter's insurance and homeowner's insurance.#sep#Brownstone Insurance provides master insurance policies for multi-unit \nbuildings throughout Massachusetts, as well as personal insurance products \nsuch as renter's insurance and homeowner's insurance.","brownstone insurance. Brownstone Insurance provides master insurance policies for multi-unit buildings throughout Massachusetts, as well as personal insurance products such as renter's insurance and homeowner's insurance.#sep#Brownstone Insurance provides master insurance policies for multi-unit \nbuildings throughout Massachusetts, as well as personal insurance products \nsuch as renter's insurance and homeowner's insurance.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
4,pacificwide lending,Learn more about Pacificwide Lending and our commercial & residential loan services in California.,pacificwide lending. Learn more about Pacificwide Lending and our commercial & residential loan services in California.,payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
5,city auto finance,"City Auto Finance is the Southeast's most trusted used-car floor plan financing company with locations in Memphis, TN; Mobile, AL; Huntsville, AL and Murfreesboro, TN. We provide you with competitive rates and the capability to make payments online.","city auto finance. City Auto Finance is the Southeast's most trusted used-car floor plan financing company with locations in Memphis, TN; Mobile, AL; Huntsville, AL and Murfreesboro, TN. We provide you with competitive rates and the capability to make payments online.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
6,"veterans financial, inc.","VA Aid and Attendance benefit can help pay for assisted living, home care or nursing home. Veterans Financial Inc can advise you at no cost about benefits","veterans financial, inc.. VA Aid and Attendance benefit can help pay for assisted living, home care or nursing home. Veterans Financial Inc can advise you at no cost about benefits",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
7,keystart country home loans,"Make your home ownership dream a reality. Keystart's low-deposit, no LMI home loans lower the entry barriers so you can get into a home of your own sooner.","keystart country home loans. Make your home ownership dream a reality. Keystart's low-deposit, no LMI home loans lower the entry barriers so you can get into a home of your own sooner.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
8,united insurance,"United Insurance provides personal insurance, commercial insurance, and group benefits with 16 location in Maine and New Hampshire since 1976.","united insurance. United Insurance provides personal insurance, commercial insurance, and group benefits with 16 location in Maine and New Hampshire since 1976.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
9,"insurance brokers co., inc.","Auto Insurance, Home Insurance, Business Insurance, Life & Health Insurance in Rockville, Gaithersburg, Potomac, Bethesda, Germantown and Silver Spring.","insurance brokers co., inc.. Auto Insurance, Home Insurance, Business Insurance, Life & Health Insurance in Rockville, Gaithersburg, Potomac, Bethesda, Germantown and Silver Spring.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing


In [4]:
data.shape

(34953, 4)

In [5]:
data.isnull().sum()

title         42
text           0
title_text     0
label          0
dtype: int64

In [6]:
data = data.iloc[:, 2:]

In [7]:
data.head()

Unnamed: 0,title_text,label
0,squirrel financial wellbeing. A Squirrel account makes it easy to stick to your budget and save. Start for free today!,payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
1,"senior insurance marketing. Welcome to the home page of Senior Insurance Marketing, a full service brokerage specializing in Medicare and senior insurance products. Located in Lincoln, NE.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
2,"3i financial group inc.. A Small Boutique Firm Dedicated to Achieving Our Clients Goals.#sep#inform.invest.insure Get a Free, No Obligation Quote forAllYour Insurance Needs. 3i Financial offers a wide variety of insurance products to help suit your needs. Choose from individual, family, and group. GET A QUOTE TODAY Travel Insurance Get peace of mind with comprehensive travel insurance. Get a free quote today! Health & Dental Get affordable health Read more",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
3,"brownstone insurance. Brownstone Insurance provides master insurance policies for multi-unit buildings throughout Massachusetts, as well as personal insurance products such as renter's insurance and homeowner's insurance.#sep#Brownstone Insurance provides master insurance policies for multi-unit \nbuildings throughout Massachusetts, as well as personal insurance products \nsuch as renter's insurance and homeowner's insurance.",payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing
4,pacificwide lending. Learn more about Pacificwide Lending and our commercial & residential loan services in California.,payment-credit-card-balance-euro-dollars-credit-debit-receipt-billing


# Preparing our data for modeling 

In [8]:
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# text
text_corpus = []
lm = WordNetLemmatizer()
words = stopwords.words('english')

for i in range(data.shape[0]):
    row = re.sub('[^a-zA-Z\s]', '', data.iloc[i, 0]).lower()
    row = row.split()  
    row = [lm.lemmatize(word) for word in row if word not in words]
    row = ' '.join(row)
    text_corpus.append(row)

In [10]:
len(text_corpus)

34953

In [11]:
text_corpus[0]

'squirrel financial wellbeing squirrel account make easy stick budget save start free today'

_Our Bag of Words_

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
text_cv = CountVectorizer(max_features=1000)   
X_text = text_cv.fit_transform(text_corpus).toarray()

In [14]:
X_text.shape

(34953, 1000)

In [16]:
X = X_text.copy()
y = data.iloc[:, 1].values

In [17]:
X.shape, y.shape

((34953, 1000), (34953,))

In [18]:
# LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

In [19]:
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

# Splitting the data

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [26]:
X_train.shape, X_val.shape

((27962, 1000), (6991, 1000))

# Modeling

In [27]:
from time import time 

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier 

from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

In [28]:
models = [
    ('GradientBoostingClassifier',GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('LightGBM', LGBMClassifier())
]

results = pd.DataFrame(columns = ['Model', 'Accuracy', 'Precision-MacroAVG', 'Precision-WeightedAVG', 
                                  'Recall-MacroAVG', 'Recall-WeightedAVG', 'F1_score-MacroAVG','F1_score-WeightedAVG', 'Time_taken'])
for model_name, model in models:
    start = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    end = time()
    
    record = {}
    record['Model'] = model_name
    record['Accuracy'] = accuracy_score(y_val, y_pred)*100
    record['Precision-MacroAVG'] = precision_score(y_val, y_pred, average='macro')*100
    record['Precision-WeightedAVG'] = precision_score(y_val, y_pred, average='weighted')*100
    record['Recall-MacroAVG'] = recall_score(y_val, y_pred, average='macro')*100
    record['Recall-WeightedAVG'] = recall_score(y_val, y_pred, average='weighted')*100
    record['F1_score-MacroAVG'] = f1_score(y_val, y_pred, average='macro')*100
    record['F1_score-WeightedAVG'] = f1_score(y_val, y_pred, average='weighted')*100
    record['Time_taken'] = end-start
    
    results = results.append(record, ignore_index=True)
    
results = results.sort_values(by='Accuracy', ascending=False)

In [29]:
results

Unnamed: 0,Model,Accuracy,Precision-MacroAVG,Precision-WeightedAVG,Recall-MacroAVG,Recall-WeightedAVG,F1_score-MacroAVG,F1_score-WeightedAVG,Time_taken
2,LightGBM,88.714061,89.446828,88.810175,81.328121,88.714061,84.431744,88.620437,54.182753
0,GradientBoostingClassifier,84.866257,81.477969,84.92767,74.054343,84.866257,76.800851,84.5171,1684.445121
1,XGBoost,83.364326,83.29806,83.337819,67.875409,83.364326,71.192825,82.532714,720.993018


In [30]:
# saving the models
import joblib

joblib.dump(models[0][1], 'GradientBoostingClassifier.sav')
joblib.dump(models[1][1], 'XGBoost.sav')
joblib.dump(models[2][1], 'LightGBM.sav')

['LightGBM.sav']

# Testing on test_data

In [31]:
test_data = pd.read_csv('test_in_kag_org.csv')

In [32]:
test_data.shape

(6501, 4)

In [33]:
test_data.isnull().sum()

title         0
text          0
title_text    0
label         0
dtype: int64

In [34]:
test_data['label'].nunique()

15

In [35]:
test_data = test_data.iloc[:, 2:]

# Preparing our data for modeling 

In [36]:
# text
text_corpus_test = []
lm = WordNetLemmatizer()
words = stopwords.words('english')

for i in range(test_data.shape[0]):
    row = re.sub('[^a-zA-Z\s]', '', test_data.iloc[i, 0]).lower()
    row = row.split()  
    row = [lm.lemmatize(word) for word in row if word not in words]
    row = ' '.join(row)
    text_corpus_test.append(row)

In [37]:
len(text_corpus_test)

6501

In [38]:
text_corpus_test[0]

'centralized hub procore offer ability communicate team get full view project one place assign task stay connected google map google doc'

_Our Bag of Words_

In [39]:
X_text_test = text_cv.transform(text_corpus_test).toarray()

In [40]:
X_text_test.shape

(6501, 1000)

In [41]:
X_test = X_text_test.copy()
y_test = test_data.iloc[:, 1].values

In [42]:
X_test.shape, y_test.shape

((6501, 1000), (6501,))

In [43]:
y_test = le.transform(y_test)

In [44]:
np.unique(y_test)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [45]:
import joblib

GB_model = joblib.load('GradientBoostingClassifier.sav')
XGB_model = joblib.load('XGBoost.sav') 
LGBM_model = joblib.load('LightGBM.sav')

In [46]:
models = [
    ('GradientBoostingClassifier',GB_model),
    ('XGBoost', XGB_model),
    ('LightGBM', LGBM_model)
]

results = pd.DataFrame(columns = ['Model', 'Accuracy', 'Precision-MacroAVG', 'Precision-WeightedAVG', 
                                  'Recall-MacroAVG', 'Recall-WeightedAVG', 'F1_score-MacroAVG','F1_score-WeightedAVG'])
for model_name, model in models:
    y_pred = model.predict(X_test)
    
    record = {}
    record['Model'] = model_name
    record['Accuracy'] = accuracy_score(y_test, y_pred)*100
    record['Precision-MacroAVG'] = precision_score(y_test, y_pred, average='macro')*100
    record['Precision-WeightedAVG'] = precision_score(y_test, y_pred, average='weighted')*100
    record['Recall-MacroAVG'] = recall_score(y_test, y_pred, average='macro')*100
    record['Recall-WeightedAVG'] = recall_score(y_test, y_pred, average='weighted')*100
    record['F1_score-MacroAVG'] = f1_score(y_test, y_pred, average='macro')*100
    record['F1_score-WeightedAVG'] = f1_score(y_test, y_pred, average='weighted')*100
    
    results = results.append(record, ignore_index=True)
    
results = results.sort_values(by='Accuracy', ascending=False)
results

Unnamed: 0,Model,Accuracy,Precision-MacroAVG,Precision-WeightedAVG,Recall-MacroAVG,Recall-WeightedAVG,F1_score-MacroAVG,F1_score-WeightedAVG
2,LightGBM,89.801569,71.104604,89.525986,68.395461,89.801569,68.46377,89.496989
0,GradientBoostingClassifier,88.217197,64.766205,88.042285,65.584976,88.217197,64.121906,87.958457
1,XGBoost,87.878788,68.834459,87.760178,67.403835,87.878788,65.896011,87.500425
