# Project - Text Classification Generalised Model

## Import required libraries

In [272]:
# Data Analysis libraries
import pandas as pd
import numpy as np
import re
import random

# Text analtical Libraries
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools 

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Word Cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# For saving the model
import pickle

## Load the dataset

In [274]:
# File Location of the CSV dataset
#FILE_PATH = r'./LnT/train_set.csv'
#FILE_DELIMETER = ','

In [275]:
##dataset = pd.read_csv(FILE_PATH,delimiter=FILE_DELIMETER,engine='python')
dataset = pd.read_excel('./LnT/train_set.xlsx', sheet_name='Sheet1')

## Preprocssing of the data

In [276]:
def preProcessingData(data):
    # drop null values
    data.dropna(inplace=True)
    # remove duplicate rows
    data.drop_duplicates(inplace=True)

    return data

## Line by Line Text Cleaning using NLP libraries

In [277]:
# Lemmatizer initialization
lemmatizer = WordNetLemmatizer()

# Tokenizer initialization
tok = WordPunctTokenizer()

# Expression for @mailaddress.com
exp1 = r'@[A-Za-z0-9_]+'

# Expression for URLs
exp2 = r'https?://[^ ]+'

#Expression for special characters
exp3 = r'[^0-9A-Za-z \t]'

combined_pat = r'|'.join((exp1, exp2,exp3))

#Expression for URL www.
www_pattern = r'www.[^ ]+'

# Dictinory for manuplating text
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

all_words = []

In [278]:
def text_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    
    stripped = re.sub(combined_pat, '', bom_removed)
    
    stripped = re.sub(www_pattern, '', stripped)
    
    lower_case = stripped.lower()
    
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    
    stop_words = set(stopwords.words("english"))
    
    words = [x for x  in tok.tokenize(neg_handled) if len(x) > 1]
    after_removing_stop_words = [word for word in words if word not in (stop_words)]
    
    # Replace abbreviations and some spell correction
    after_lemmatizer = []
    for word in after_removing_stop_words:
        word=lemmatizer.lemmatize(word)
        all_words.append(word)
        after_lemmatizer.append(word)
        
    return(" ".join(after_lemmatizer)).strip()

In [279]:
def wordCloud(all_words):
    all_words_freq = nltk.FreqDist(all_words)
    word_cloud_words = list(filter(lambda x:x[1]>=50,all_words_freq.items()))
    word_cloud = []
    for key,value in word_cloud_words:
        word_cloud.append(key)   

    wordcloud = WordCloud(width = 1000, height = 1000,background_color='black')

    wordcloud.generate(str(word_cloud))
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title('Word Cloud')

In [280]:
###**convert text tu number**

In [292]:
def featurePrepration(data):
    documents = []
    for index,row in data.iterrows():
        documents.append((text_cleaner(row.text),row.category))
    
    random.shuffle(documents)
    
    dataframe = pd.DataFrame(documents,columns=['text','category']) 
    
    
    #wordCloud(all_words)
    
    return dataframe

In [293]:
# confusion matrix
def plot_confusion_matrix(cm,target_names,title):
    cmap=None
    normalize=False
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy
    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))

In [294]:
# Preprocessing the data
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
data = preProcessingData(dataset)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dkm20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [295]:
# Total feature preparation by cleaning each text
data = featurePrepration(data)
# Showing wordCloud below

In [296]:
dataset.head()

Unnamed: 0,text,category
0,WORK CATEGORY : Formwork. APPLICATION : Constr...,Shuttering Work
1,WORK CATEGORY : Formwork. SCOPE OF WORK : Maki...,Shuttering Work
2,WORK CATEGORY : Formwork. APPLICATION : For ge...,Shuttering Work
3,WORK GROUP:-Fixing of shuttering;SCOPE:-Fixing...,Shuttering
20,Construction of concrete structure; TYPE OF ST...,Shuttering Work


## Model Development

In [297]:
# Train - Test Spliting, Test dataset has taken 30% for validation
x = data['text']
y = data['category']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,stratify=y,random_state=1)

In [298]:
# Tf-Idf feature set creation
vectorizer = TfidfVectorizer(stop_words='english')
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [299]:
# Model Training by using Random Forest Algorithm
textClassifierRandomForest = RandomForestClassifier(n_estimators=300)
textClassifierRandomForest.fit(x_train,y_train)

RandomForestClassifier(n_estimators=300)

## Model Validation on testing dataset

In [300]:
predicted_labels_rf = textClassifierRandomForest.predict(x_test)

In [301]:
accuracy = textClassifierRandomForest.score(x_test,y_test)

In [302]:
print('Accuracy of the model - '+str(round(accuracy*100,2))+'%')

Accuracy of the model - 56.0%


In [305]:
# Confusion matrix to understand the results
classes = list(y_test.unique())
c_matrix = confusion_matrix(y_test,predicted_labels_rf,labels=classes)
#plot_confusion_matrix(np.array(c_matrix),classes,"Confusion Matrix")

### Random Forest is giving the accuracy of  test dataset, For using the same model in future we are retraining the model on complete dataset

## Model Retraining on complete dataset

In [306]:
# Complete dataset is already cleaned and processed
data.shape

(248, 2)

In [307]:
data.head()

Unnamed: 0,text,category
0,scope cement flooring work categorization char...,Concreting
1,scope asorted activity,Concreting
2,scope placing concrete using stationary concre...,Concreting
3,scope placing concrete method type concrete re...,Concreting
4,work groupsupporting activitiesscopeerection s...,Shuttering Work\n


In [308]:
x = data['text']
y = data['category']

In [309]:
# Tf-Idf feature set creation
vectorizer_final = TfidfVectorizer(stop_words='english')
x_train_all = vectorizer_final.fit_transform(x)

In [310]:
# Model Training by using Random Forest Algorithm
textClassifierRandomForestModel = RandomForestClassifier(n_estimators=300)
textClassifierRandomForestModel.fit(x_train_all,y)

RandomForestClassifier(n_estimators=300)

In [311]:
# Save the pickle files of model and Tf-idf matrix

In [312]:
# Save the tf-idf matrix for future predictions
filename = open('./LnT/IfIdf_matrix.pkl','wb')
pickle.dump(vectorizer_final,filename)
filename.close()

In [314]:
# Save the trained model for future predictions
filename = open('./LnT/textClassifierRandomForestModel.pkl','wb')
pickle.dump(textClassifierRandomForestModel,filename)
filename.close()

## Model on Unseen Dataset

In [319]:
# Load the unseen dataset
newDatset = pd.read_excel('./LnT/test_set.xlsx', sheet_name='Sheet1')
#newDatset = pd.read_csv('./LnT/test_set.csv',engine='python')

In [320]:
newDatset.head()

Unnamed: 0,text
0,WORK CATEGORY : Formwork. APPLICATION : Constr...
1,Labour charges for Fixing & Removing of shutte...
2,SCOPE :- Assembly of straight slipform; HEIGHT...
3,SCOPE :- Slipform concreting; Work Categorizat...
4,"SCOPE :- Cutting, bending and tying of rebar i..."


In [321]:
# Cleaning the text
documents_unseen = []
for index,row in newDatset.iterrows():
    documents_unseen.append((text_cleaner(row.text)))

In [322]:
unseen_cleaned_data =  pd.DataFrame(documents_unseen,columns=['cleaned_text'])    

In [323]:
unseen_cleaned_data['text'] = newDatset.text

In [324]:
new_y = unseen_cleaned_data['cleaned_text']

In [325]:
# Load the saved pickle of matrix
filename = open('./LnT/IfIdf_matrix.pkl','rb')
vectorizer = pickle.load(filename)
filename.close()

# Tranform the deatures
newDatset_vectorised = vectorizer.transform(new_y)

In [326]:
# Load the saved model
filename = open('./LnT/textClassifierRandomForestModel.pkl','rb')
textClassifierRandomForest = pickle.load(filename)
filename.close()

#Predict the result on unseen data
predicted_newDataset_labels = textClassifierRandomForest.predict(newDatset_vectorised)

In [327]:
# Generate the result in dataframe
result = pd.DataFrame()
result['category'] = predicted_newDataset_labels
result['text'] = unseen_cleaned_data.text

In [328]:
result.shape

(40, 2)

In [329]:
result.head(20).reset_index(drop='index')

Unnamed: 0,category,text
0,Shuttering Work,WORK CATEGORY : Formwork. APPLICATION : Constr...
1,Shuttering Work,Labour charges for Fixing & Removing of shutte...
2,Slipform Shuttering Work,SCOPE :- Assembly of straight slipform; HEIGHT...
3,Slipform Shuttering Work,SCOPE :- Slipform concreting; Work Categorizat...
4,Slipform Shuttering Work,"SCOPE :- Cutting, bending and tying of rebar i..."
5,Slipform Shuttering Work,SCOPE:-Operation of straight slipform;HEIGHT:-...
6,Slipform Shuttering Work,SCOPE:-Erection of stair tower;HEIGHT:-0 to 10...
7,Shuttering\n,SCOPE :- Labour charges for fixing and removin...
8,Shuttering\n,SCOPE :- Labour Charges for shutter Making wit...
9,Shuttering\n,SCOPE :- As described further; Labour Charges ...


In [330]:
# Save the output prediction in csv file
result.to_excel('./LnT/predictedCategories.xlsx',index=False)