In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, hamming_loss, classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train=pd.read_csv('preprocessedTrain.csv')
test=pd.read_csv('preprocessedTest.csv')

In [3]:
train.head()

Unnamed: 0,Title,Body,Tags
0,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,java repeat
1,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,java optional
2,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,javascript image overlay react-native opacity
3,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",swift operators whitespace ternary-operator op...
4,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,android material-design floating-action-button


In [4]:
test.head()

Unnamed: 0,Title,Body,Tags
0,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,sql sql-server
1,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,php mysql sql codeigniter mysqli
2,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,python pandas
3,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",sql-server c#-4.0
4,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,php


In [5]:
def cleaning(text):
    text = text.lower()
    pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    clean = re.compile('<.*?>')
    text = re.sub(clean,'',text)
    text = pattern.sub('', text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)        
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text) 
    text = re.sub(r"\'ll", " will", text)  
    text = re.sub(r"\'ve", " have", text)  
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"did't", "did not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"have't", "have not", text)

    text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
    tokens = word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    text = ' '.join(words)
    return text

train['Title'] = train['Title'].map(cleaning)
print('Train data Title cleaned...')
train['Body'] = train['Body'].map(cleaning)
print('Train data Body cleaned...')
test['Title'] = test['Title'].map(cleaning)
print('Test data Title cleaned...')
test['Body'] = test['Body'].map(cleaning)
print('Test data Body cleaned...')

Train data Title cleaned...
Train data Body cleaned...
Test data Title cleaned...
Test data Body cleaned...


In [6]:
train.head()

Unnamed: 0,Title,Body,Tags
0,java repeat task every random seconds,i am already familiar with repeating tasks eve...,java repeat
1,why are java optionals immutable,i would like to understand why java optionals ...,java optional
2,text overlay image with darkened opacity react...,i am attempting to overlay a title over an ima...,javascript image overlay react-native opacity
3,why ternary operator in swift is so picky,the question is very simple but i just could n...,swift operators whitespace ternary-operator op...
4,hideshow fab with scale animation,i am using custom floatingactionmenu i need to...,android material-design floating-action-button


In [7]:
test.head()

Unnamed: 0,Title,Body,Tags
0,how to get all the child records from differen...,i am having different tables like select from ...,sql sql-server
1,retrieve all except some data of the another t...,i have two table mmaster and tblappointment th...,php mysql sql codeigniter mysqli
2,pandas readhtml,i am trying to extract us states from wiki url...,python pandas
3,reader always gim me null,i am so new to c i wan na make an application ...,sql-server c#-4.0
4,php rearrange array elements based on condition,basically i have this array array array t sub ...,php


In [8]:
from datetime import datetime
start = datetime.now()
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=100000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = vectorizer.fit_transform(train['Title']+train['Body'])
x_test_multilabel = vectorizer.transform(test['Title']+test['Body'])

print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:27.755403


In [9]:
start = datetime.now()
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
y_train_multilabel = vectorizer.fit_transform(train['Tags'])
y_test_multilabel = vectorizer.transform(test['Tags'])
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:00.561576


In [10]:
print("Number of data points of in ytrain: ",y_train_multilabel.shape)
print("Number of data points of in ytest: ",y_test_multilabel.shape)

Number of data points of in ytrain:  (45000, 9336)
Number of data points of in ytest:  (15000, 9336)


In [11]:
print("Number of data points of in xtrain: ",x_train_multilabel.shape)
print("Number of data points of in xtest: ",x_test_multilabel.shape)

Number of data points of in xtrain:  (45000, 164528)
Number of data points of in xtest:  (15000, 164528)


In [13]:
from sklearn.metrics import f1_score
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train_multilabel)
predictions = classifier.predict(x_test_multilabel)

print("accuracy :",accuracy_score(y_test_multilabel,predictions))
print("macro f1 score :",f1_score(y_test_multilabel, predictions, average = 'macro'))
print("micro f1 scoore :",f1_score(y_test_multilabel, predictions, average = 'micro'))
print("hamming loss :",hamming_loss(y_test_multilabel,predictions))
print("Precision recall report :\n",classification_report(y_test_multilabel, predictions))

KeyboardInterrupt: 