<a href="https://colab.research.google.com/github/15nisha/EXperimets/blob/main/tag_prediction_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**How to Solve a Multi-Label Classification Problem**

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

**Importing data**

In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be
# read with an ordinary file path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def impoting_dataset(path):
    """Read the CSV file at ``path`` and return it as a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(path)


In [4]:
 # Load the raw dataset from the mounted Drive into `df`.
 df = impoting_dataset('/content/drive/MyDrive/data.csv')

In [None]:
# Number of rows in the freshly loaded frame.
len(df)

4375

In [5]:
# Remove the 'Unnamed: 0' column (the name pandas assigns to a CSV column
# without a header, typically a saved index).
# `errors='ignore'` makes the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError. The redundant `axis=1` is dropped
# because `columns=` already selects the axis.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Preview the first rows (tags and description columns).
df.head()

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...


**Data Cleaning**

In [7]:
def removing_null(dataset):
    """Drop every entry of ``dataset`` that contains a missing value.

    The removal is done in place, so the object passed in is itself modified;
    that same object is returned for convenience.
    """
    dataset.dropna(axis=0, inplace=True)
    return dataset

def removing_duplicate(dataset_df):
    """Return ``dataset_df`` without repeated entries.

    When an entry occurs more than once only its first occurrence is kept.
    The input object itself is left unmodified; a new object is returned.
    """
    return dataset_df.drop_duplicates(keep='first')



In [8]:
# Call the helper on the whole frame so that ROWS containing a missing value
# are removed. `df.apply(removing_null)` handed the helper one column at a
# time; pandas then re-aligned the shortened columns on the index, which
# reintroduced the gaps as NaN and left the row count unchanged.
# The index is renumbered so that later label-based lookups stay contiguous.
df = removing_null(df).reset_index(drop=True)

In [9]:
# De-duplicate whole ROWS. `df.apply(removing_duplicate)` ran drop_duplicates
# on each column separately, so a tag string that legitimately repeats across
# rows was blanked to NaN after re-alignment instead of duplicate rows being
# removed. The index is renumbered so later label-based lookups stay contiguous.
df = removing_duplicate(df).reset_index(drop=True)

In [None]:
# Row count after the null / duplicate removal cells.
len(df)

4375

In [None]:
# Preview the frame after the cleaning cells.
df.head()

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...


In [10]:
def cleanHtml(sentence):
    """Replace every ``<...>`` markup tag in ``sentence`` with a single space.

    The input is converted with ``str()`` first, so non-string values are
    processed as their string form. Matching is non-greedy and does not cross
    line breaks.
    """
    return re.sub('<.*?>', ' ', str(sentence))
def cleanPunc(sentence):
    """Strip punctuation and special characters from ``sentence``.

    Steps, in order:
      1. delete question marks, exclamation marks, pipes, single quotes,
         double quotes and hash signs;
      2. turn full stops, commas, parentheses and forward slashes into spaces;
      3. trim leading/trailing whitespace;
      4. convert any remaining newline into a space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")
def keepAlpha(sentence):
    """Keep only ASCII letters in ``sentence``.

    The text is split on whitespace; inside each token every run of characters
    that is not an ASCII letter (or space) becomes one space. The tokens are
    then joined with single spaces and the result is trimmed at both ends.
    """
    letters_only = [re.sub('[^a-z A-Z]+', ' ', token) for token in sentence.split()]
    return " ".join(letters_only).strip()


In [11]:
# Build the cleaned text column: lower-case the raw description, then strip
# markup tags, punctuation and finally every non-alphabetic character.
df['comment_text'] = (
    df['description']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    .apply(keepAlpha)
)

In [12]:
# Compare the raw `description` with the cleaned `comment_text` column.
df.head()

Unnamed: 0,tags,description,comment_text
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...,the company employer is a midstream service pr...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...,icr staffing is now accepting resumes for indu...
2,part-time-job,This is a great position for the right person....,this is a great position for the right person ...
3,licence-needed,A large multi-specialty health center is expan...,a large multi specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...,job purpose the account director is responsib...


In [15]:
# Load NLTK's English stop-word list. The corpus is fetched on demand so this
# cell also works on a fresh kernel where it has not been downloaded yet.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Extra filler words (number words and common connectives) to discard as well.
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])

# One case-insensitive pattern covering all stop words.
# - Each word is escaped so any regex metacharacter in it is matched literally.
# - Words are ordered longest-first (then alphabetically) so the pattern is
#   identical on every run instead of depending on set iteration order.
# - The trailing group accepts end-of-string as well as a non-word character,
#   so a stop word that is the very last token is removed too.
re_stop_words = re.compile(
    r"\b("
    + "|".join(re.escape(word) for word in sorted(stop_words, key=lambda w: (-len(w), w)))
    + r")(?:\W|$)",
    re.I,
)


def removeStopWords(sentence):
    """Return ``sentence`` with every stop word replaced by a single space.

    A stop word is matched case-insensitively at a word boundary, together
    with the one non-word character that follows it (if any). Runs of spaces
    left behind are not collapsed here.
    """
    return re_stop_words.sub(" ", sentence)


In [14]:
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` needs it to be
# present on the machine.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
# Strip stop words from every cleaned description.
df['comment_text'] = df['comment_text'].apply(removeStopWords)

In [None]:
# Preview `comment_text` after stop-word removal.
df.head()

Unnamed: 0,tags,description,comment_text
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...,company employer midstream service provider...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...,icr staffing accepting resumes industrial m...
2,part-time-job,This is a great position for the right person....,great position right person healthcaresee...
3,licence-needed,A large multi-specialty health center is expan...,large multi specialty health center expandin...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...,job purpose account director responsible m...


In [17]:
# Shared English Snowball stemmer used by `stemming` below.
stemmer = SnowballStemmer("english")


def stemming(sentence):
    """Reduce every word of ``sentence`` to its stem.

    The text is split on whitespace, each word is passed through the
    module-level Snowball ``stemmer``, and the stems are joined back with
    single spaces (so repeated whitespace in the input is collapsed).

    Parameters
    ----------
    sentence : str
        Text to stem.

    Returns
    -------
    str
        The stemmed words separated by single spaces; ``""`` for empty or
        whitespace-only input.
    """
    # The docstring and body now share one indentation level; the original
    # mixed 2 and 4 spaces, which Python rejects with an IndentationError.
    return " ".join(stemmer.stem(word) for word in sentence.split())


In [18]:
# Stem every word of the cleaned descriptions.
df['comment_text'] = df['comment_text'].apply(stemming)

In [None]:
# Spot-check one fully processed description (index label 18).
df['comment_text'][18]

'join club today club strong grow whether member small busi owner shop product busi head household shop famili aim provid solut save money time take step toward help environ sum work help member live better everi day motiv enthusiast want part uniqu retail experi read career opportun wait sam club local onalaska wi sam club hire opportun includ profession cash offic associ front end cashier cart attend sale wireless sale associ inform sam club appli onlin click appli link specifi interest posit interest locat sam club equal opportun employ'

**Visualisation**

**Train / validation / test split**

In [19]:
# Sequential 60 % / 20 % / 20 % split into train, validation and test parts.
train_end = int(0.6 * len(df))
validation_end = int(0.8 * len(df))

# Positional `.iloc` slicing yields the same three pieces as
# `np.split(series, [train_end, validation_end])`, but without routing a
# pandas object through NumPy, which relies on the deprecated
# `Series.swapaxes`.
# NOTE(review): rows are taken in file order (no shuffling, no seed) — confirm
# the data is not ordered in a way that biases the split.
# NOTE(review): features come from the raw `description` column, not the
# cleaned `comment_text` column built above — confirm this is intended.
X_train = df['description'].iloc[:train_end]
X_validation = df['description'].iloc[train_end:validation_end]
X_test = df['description'].iloc[validation_end:]

y_train = df['tags'].iloc[:train_end]
y_validation = df['tags'].iloc[train_end:validation_end]
y_test = df['tags'].iloc[validation_end:]

**Preprocessing**