In [1]:
# All Libraries

import numpy as np
import pandas as pd
import random
random.seed(42)
import matplotlib.pyplot as plt


#For spliting data
from sklearn.model_selection import train_test_split

#PRE PROCESSING
import re
from nltk.corpus import stopwords
import nltk
import nltk.stem
from nltk.stem import SnowballStemmer 
from nltk import stem
from nltk.stem.wordnet import WordNetLemmatizer

#ROC-AUC PLOT
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_curve,auc,roc_auc_score
from sklearn.metrics import roc_curve,auc
from scipy import interp
from itertools import cycle

#TRANSFORMATION
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#SCORE
from sklearn import metrics
from sklearn.metrics import accuracy_score 

# MODELS
from xgboost import XGBClassifier                       #XGBOOST
from sklearn.linear_model import LogisticRegression     #LOGISTIC REGRESSION
from sklearn.ensemble import RandomForestClassifier     #RANDOM FOREST
from sklearn.naive_bayes import MultinomialNB           #NB
from sklearn.svm import SVC                             #SVM
from sklearn.tree import DecisionTreeClassifier         #DESICION TREE
from sklearn.neighbors import KNeighborsClassifier      #KNN


In [2]:
train = pd.read_csv("dbpedia_csv/train.csv")
train_text = train['content']
train_label = train['target']

test = pd.read_csv("dbpedia_csv/test.csv")
test_text = test['content']
test_label = test['target']


<h1>DATASET</h1>

In [3]:
train.head()

Unnamed: 0,target,title,content
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [5]:
print(train['target'].unique())
print(len(train['target'].unique()))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14]
14


There are 14 labels for this dataset.

<h1>DATA EXPLORATION</h1>

In [None]:
train['word_count'] = train['content'].apply(lambda x: len(str(x).split(" ")))
plt.plot(train['target'],train['word_count'])
plt.show()

For label 10 - word count is most when compared to any other label

In [None]:
stop = stopwords.words('english')
train['stopwords'] = train['content'].apply(lambda x: len([x for x in x.split() if x in stop]))
plt.plot(train['target'],train['stopwords'])
plt.show()

For label 9 - stop words count is least when compared to any other label

In [None]:
def top(vec,corpus, n=None):
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]


In [None]:
num = 10

vec = CountVectorizer().fit(train['content'])
top_n_words_with_stopwords = top(vec,train['content'], num)

vec = CountVectorizer(stop_words = 'english').fit(train['content'])
top_n_words_without_stopwords = top(vec,train['content'], num)

print("No.\tWith Stop Words\t\tWithout Stop Words\n")
          
for i in range(num):
    print(i+1," - \t"\
          ,top_n_words_with_stopwords[i][0],"-" \
          ,top_n_words_with_stopwords[i][1],"\t\t" \
          ,top_n_words_without_stopwords[i][0],"-" \
          ,top_n_words_without_stopwords[i][1])

#BIGRAMS

vec = CountVectorizer(ngram_range=(2, 2)).fit(train['content'])
top_n_bigrams_with_stopwords = top(vec,train['content'], num)

vec = CountVectorizer(ngram_range=(2, 2),stop_words = 'english').fit(train['content'])
top_n_bigrams_without_stopwords = top(vec,train['content'], num)

print("\nNo.\tWith Stop Words\t\tWithout Stop Words\n")
          
for i in range(num):
    print(i+1," - \t"\
          ,top_n_bigrams_with_stopwords[i][0],"- " \
          ,top_n_bigrams_with_stopwords[i][1],"\t" \
          ,top_n_bigrams_without_stopwords[i][0],"- " \
          ,top_n_bigrams_without_stopwords[i][1])

These are top 10 words and bigrams with and without stopwords included. As we can can see removing stopwords all the top 10 is changed. Removing Stopwords will help us to decrease the corpus of our dataset without losing much information.

<h1>PRE PROCESSING</h1>


In [None]:

space = re.compile('[./(){}\[\]\|@,;:<>?!$%^&*+-]')
stopwords = stopwords.words('english')
words = set(nltk.corpus.words.words())
def lemmatize_stemming(text):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def clean(text):
    text = text.lower()
    text = space.sub("",text)
    text = lemmatize_stemming(text)
    text = " ".join(w for w in nltk.wordpunct_tokenize(text)  if w in words or not w.isalpha())    
    text = " ".join(x for x in text.split() if x not in stopwords)
    return(text)


train['content_clean'] = train['content'].apply(clean)
test['content_clean'] = test['content'].apply(clean)

In [None]:
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

### ROC_AUC_PLOT
def plot_roc(y_test,predict_test):
    n_classes = 14
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(np.array(pd.get_dummies(y_test))[:, i], np.array(pd.get_dummies(predict_test))[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])


    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    lw=2
    plt.figure(figsize=(8,5))
    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.4f})'
                   ''.format(roc_auc["macro"]),
             color='green', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.4f})'
                 ''.format(i+1, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--',color='red', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.annotate('Random Guess',(.5,.48),color='red')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    plt.show()

<h1>TRANSFORMATION - VECTORIZATION</h1>
<h3> Using CountVectorizer</h3>

In [None]:
#FOR TRAIN DATASET
content = train['content_clean'].values
label = train['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    content, label, test_size=0.2, random_state=42)

#FOR TEST DATASET
contenttest = test['content_clean'].values
labeltest = test['target'].values

TEST_content, _, TEST_label, __ = train_test_split(
    contenttest, labeltest, test_size=0.0, random_state=42)

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
vectorizer.fit(X_train)
vectorizer.fit(X_test)
vectorizer.fit(TEST_content)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)
TEST_content = vectorizer.transform(TEST_content)

In [None]:
############# XGBOOST ##############
print ("\nXGBOOST\n")

XG_model = XGBClassifier(subsample=0.20,n_estimators=100, random_state=42)
print ("Compiling...")
XG_model.fit(X_train, y_train)
print ("Predicting...")

# Accuray Score on train dataset
predict_train = XG_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC - AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = XG_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC - AUC Score on test dataset: ', multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
############## Logistic Regression  ###############
print ("\nLogistic Regression \n")


print ("Compiling...")
LR = LogisticRegression()
LR.fit(X_train, y_train)
print ("Predicting...")

# Accuray Score on train dataset
predict_train = LR.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC - AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = LR.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC - AUC Score on test dataset: ', multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
##################### RANDOM FOREST #################
print ("\nRANDOM FOREST\n")

RF_model = RandomForestClassifier(n_estimators=100, random_state=42)
print ("Compiling...")
# fit the model with the training data
RF_model.fit(X_train, y_train)
print ("Predicting...")
# Accuray Score on train dataset
predict_train = RF_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))
plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = RF_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########## nDESICION tree #################
print ("\nDESICION TREE\n")
 
print ("Compiling...")
# training a DescisionTreeClassifier 
 
dtree_model = DecisionTreeClassifier(criterion='entropy',max_depth = 100).fit(X_train, y_train)
print ("Predicting...")

# Accuray Score on train dataset
predict_train = dtree_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = dtree_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########### svm ##########
print ("\nSVM\n")

 
# training a linear SVM classifier 

svm_model = SVC().fit(X_train, y_train) 
print ("Predicting...")

# Accuray Score on train dataset
predict_train = svm_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = svm_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)


In [None]:
########### knn ##########
print ("\nKNN\n")

 
# training a KNN classifier 
knn = KNeighborsClassifier(n_neighbors = 14).fit(X_train, y_train) 
print ("Predicting...")

# Accuray Score on train dataset
predict_train = knn.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = knn.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########### NB ##########
print ("\nNAIVE BAYES\n")



nb = MultinomialNB().fit(X_train, y_train)
print ("Predicting...")

# Accuray Score on train dataset
predict_train = nb.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = nb.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

<h3>Using TF-idf Vectorization</h3>

In [None]:

content = train['content_clean'].values
label = train['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    content, label, test_size=0.2, random_state=42)
tfidf = TfidfVectorizer(max_features=1000,
                         analyzer='word',stop_words= 'english',ngram_range=(1,1),)
tfidf.fit(X_train)
tfidf.fit(X_test)
X_train = tfidf.fit_transform(X_train)
X_test  = tfidf.fit_transform(X_test)


In [None]:
############# XGBOOST ##############
print ("\nXGBOOST\n")


XG_model = XGBClassifier(subsample=0.20,n_estimators=100, random_state=42)
print("Compiling...")
XG_model.fit(X_train, y_train)
print("Predicting...")

# Accuray Score on train dataset
predict_train = XG_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC - AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = XG_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC - AUC Score on test dataset: ', multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
############## Logistic Regression  ###############
print ("\nLogistic Regression \n")



LR = LogisticRegression()
LR.fit(X_train, y_train)

# Accuray Score on train dataset
predict_train = LR.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC - AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = LR.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC - AUC Score on test dataset: ', multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
##################### RANDOM FOREST #################
print ("\nRANDOM FOREST\n")



RF_model = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the model with the training data
RF_model.fit(X_train, y_train)

# Accuray Score on train dataset
predict_train = RF_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))
plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = RF_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########## desicion tree #################
print ("\nDESICION TREE\n")
 
 
# training a DescisionTreeClassifier 
 
dtree_model = DecisionTreeClassifier(criterion='entropy',max_depth = 100).fit(X_train, y_train)

# Accuray Score on train dataset
predict_train = dtree_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = dtree_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########### svm ##########
print ("\nSVM\n")

 
# training a linear SVM classifier 

svm_model = SVC().fit(X_train, y_train) 

# Accuray Score on train dataset
predict_train = svm_model.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = svm_model.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########### knn ##########
print ("\nKNN\n")

 
# training a KNN classifier 

knn = KNeighborsClassifier(n_neighbors = 14).fit(X_train, y_train) 
  
# Accuray Score on train dataset
predict_train = knn.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = knn.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

In [None]:
########### NB ##########
print ("\nNAIVE BAYES\n")


nb = MultinomialNB().fit(X_train, y_train)

# Accuray Score on train dataset
predict_train = nb.predict(X_train)
accuracy_train = accuracy_score(y_train,predict_train)
print('\nAccuracy on train dataset : ', accuracy_train)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_train,predict_train))

plot_roc(y_train,predict_train)

# Accuray Score on test dataset
predict_test = nb.predict(X_test)
accuracy_test = accuracy_score(y_test,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC-AUC Score on train dataset : ',multiclass_roc_auc_score(y_test,predict_test))

plot_roc(y_test,predict_test)

# Results

#### This has to be filled and the best ROC value is chosen for TEST dataset

| MODELS/Results | XGBoost | Logistic Regression | Random Forest | SVM | Descion Tree | Knn | Random Forest |
| --- | --- | --- | --- | --- | --- | --- | --- |
| <b>Count - Vectorization</b> |
| ROC on Train (80% OF TRANING DATA) | 0.9564 | 0.9913 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ROC on Test (20% OF TRANING DATA) | 0.9559 | 0.9818 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| <b>TF-idf Vectorization</b> |
| ROC on Train (80% OF TRANING DATA) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ROC on Test (20% OF TRANING DATA) | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |


## For Example :-

CHOSEN TRANSFORMATION - CountVectorization

CHOSEN MODEL - Logistic Regression

<h1>TEST DATASET</h1>

In [None]:
content = train['content_clean'].values
label = train['target'].values
content_test = test['content_clean'].values
label_test = test['target'].values

X_train, X_test, y_train, y_test = train_test_split(
    content, label, test_size=0.2, random_state=42)
TEST_content, _, TEST_label, __ = train_test_split(
    content_test, label_test, test_size=0.0, random_state=42)

#Chosen Transformation
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))

vectorizer.fit(X_train)
vectorizer.fit(TEST_content)

X_train = vectorizer.transform(X_train)
TEST_content = vectorizer.transform(TEST_content)


#Chosen Model
print("Compliling Model...")
LR = LogisticRegression()
LR.fit(X_train, y_train)

#Chosen Model
print("Compliling Model...")
LR = LogisticRegression()
LR.fit(X_train, y_train)

print("Predicting...")

# Accuray Score on test dataset
predict_test = LR.predict(TEST_content)
accuracy_test = accuracy_score(TEST_label,predict_test)
print('\nAccuracy on test dataset : ', accuracy_test)

print('\nROC - AUC Score on test dataset : ',multiclass_roc_auc_score(TEST_label,predict_test))

plot_roc(TEST_label,predict_test)