In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import os
import warnings
import time
import re
import nltk
import bs4
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import corpus
from string import punctuation
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import make_scorer, confusion_matrix, hamming_loss, accuracy_score, precision_score, recall_score
from sklearn.cluster import KMeans

import logging
from scipy.sparse import hstack

warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
question_df = pd.read_csv("Questions.csv", encoding="ISO-8859-1")
question_df

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...
...,...,...,...,...,...,...,...
1264211,40143210,5610777.0,2016-10-19T23:38:01Z,,0,URL routing in PHP (MVC),<p>I am building a custom MVC project and I ha...
1264212,40143300,3791161.0,2016-10-19T23:48:09Z,,0,Bigquery.Jobs.Insert - Resumable Upload?,<p>The API docs show that you should be able t...
1264213,40143340,7028647.0,2016-10-19T23:52:50Z,,1,Obfuscating code in android studio,<p>Under minifyEnabled I changed from false to...
1264214,40143360,871677.0,2016-10-19T23:55:24Z,,0,How to fire function after v-model change?,<p>I have input which I use to filter my array...


In [3]:
question_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264216 entries, 0 to 1264215
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Id            1264216 non-null  int64  
 1   OwnerUserId   1249762 non-null  float64
 2   CreationDate  1264216 non-null  object 
 3   ClosedDate    55959 non-null    object 
 4   Score         1264216 non-null  int64  
 5   Title         1264216 non-null  object 
 6   Body          1264216 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 67.5+ MB


In [4]:
tag_df = pd.read_csv("Tags.csv", encoding="ISO-8859-1", dtype={'Tag': str})
tag_df

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn
...,...,...
3750989,40143360,javascript
3750990,40143360,vue.js
3750991,40143380,npm
3750992,40143380,mocha


In [5]:
tag_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750994 entries, 0 to 3750993
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 57.2+ MB


In [6]:
tag_df['Tag'] = tag_df['Tag'].astype(str)
grouped_tags = tag_df.groupby("Id")['Tag'].apply(lambda tag_df: ' '.join(tag_df))
grouped_tags

Id
80                                 flex actionscript-3 air
90            svn tortoisesvn branch branching-and-merging
120                                    sql asp.net sitemap
180         algorithm language-agnostic colors color-space
260                c# .net scripting compiler-construction
                                 ...                      
40143210                                     php .htaccess
40143300                                   google-bigquery
40143340                            android android-studio
40143360                                 javascript vue.js
40143380                                   npm mocha babel
Name: Tag, Length: 1264216, dtype: object

In [7]:
grouped_tags.reset_index()

Unnamed: 0,Id,Tag
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
...,...,...
1264211,40143210,php .htaccess
1264212,40143300,google-bigquery
1264213,40143340,android android-studio
1264214,40143360,javascript vue.js


In [8]:
grouped_tags_final = pd.DataFrame({'Id':grouped_tags.index, 'Tags':grouped_tags.values})
grouped_tags_final

Unnamed: 0,Id,Tags
0,80,flex actionscript-3 air
1,90,svn tortoisesvn branch branching-and-merging
2,120,sql asp.net sitemap
3,180,algorithm language-agnostic colors color-space
4,260,c# .net scripting compiler-construction
...,...,...
1264211,40143210,php .htaccess
1264212,40143300,google-bigquery
1264213,40143340,android android-studio
1264214,40143360,javascript vue.js


In [9]:
question_df.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)
question_df = question_df.merge(grouped_tags_final, on='Id')
question_df

Unnamed: 0,Id,Score,Title,Body,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction
...,...,...,...,...,...
1264211,40143210,0,URL routing in PHP (MVC),<p>I am building a custom MVC project and I ha...,php .htaccess
1264212,40143300,0,Bigquery.Jobs.Insert - Resumable Upload?,<p>The API docs show that you should be able t...,google-bigquery
1264213,40143340,1,Obfuscating code in android studio,<p>Under minifyEnabled I changed from false to...,android android-studio
1264214,40143360,0,How to fire function after v-model change?,<p>I have input which I use to filter my array...,javascript vue.js


In [10]:
print(f"Minimum Score: {question_df['Score'].min()}")
print(f"Maximum Score: {question_df['Score'].max()}")
#deleting queries with score less than 5
new_question_df = question_df[question_df['Score'] > 5]

Minimum Score: -73
Maximum Score: 5190


In [11]:
new_question_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72950 entries, 0 to 1264205
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      72950 non-null  int64 
 1   Score   72950 non-null  int64 
 2   Title   72950 non-null  object
 3   Body    72950 non-null  object
 4   Tags    72950 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.3+ MB


In [12]:
new_question_df.drop(columns=['Id', 'Score'], inplace=True)
new_question_df

Unnamed: 0,Title,Body,Tags
0,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex actionscript-3 air
1,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn tortoisesvn branch branching-and-merging
2,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql asp.net sitemap
3,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm language-agnostic colors color-space
4,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c# .net scripting compiler-construction
...,...,...,...
1262915,How to use a dict to subset a DataFrame?,"<p>Say, I have given a DataFrame with most of ...",python pandas dataframe categorical-data
1263065,Is there a way to use itertools in python to c...,<p>Let's say I have the following code:</p>\n\...,python iterator nested-loops itertools
1263399,How can I force file ordering in F# projects u...,<p>I'm trying to work with vscode in my WebSha...,f# vscode
1263454,Why does my result data returned as void* gets...,<p>I am working in a project with a huge legac...,c++


In [13]:
new_question_df['Tags']

0                                    flex actionscript-3 air
1               svn tortoisesvn branch branching-and-merging
2                                        sql asp.net sitemap
3             algorithm language-agnostic colors color-space
4                    c# .net scripting compiler-construction
                                 ...                        
1262915             python pandas dataframe categorical-data
1263065               python iterator nested-loops itertools
1263399                                            f# vscode
1263454                                                  c++
1264205    haskell types functional-programming agda lamb...
Name: Tags, Length: 72950, dtype: object

In [14]:
new_question_df['Tags'] = new_question_df['Tags'].apply(lambda x: str(x).split(', '))
unique_tags = list(set([item for sublist in new_question_df['Tags'].values for item in sublist]))
len(unique_tags)

54992

In [15]:
flat_list = [item for sublist in new_question_df['Tags'].values for item in sublist]
keywords = nltk.FreqDist(flat_list)
keywords = nltk.FreqDist(keywords)
sum(keywords.values())

72950

In [16]:
flat_list = [item for sublist in new_question_df['Tags'].values for item in sublist]
keywords = nltk.FreqDist(flat_list)
frequencies_words = keywords.most_common(10)
tags_features = [word[0] for word in frequencies_words]
tags_features

['android',
 'javascript',
 'python',
 'java',
 'javascript jquery',
 'c#',
 'c++',
 'php',
 'git',
 'html css']

In [17]:
def most_common(tags):
    tags_filtered = []
    for i in range(len(tags)):
        if tags[i] in tags_features:
            tags_filtered.append(tags[i])
    return tags_filtered

In [18]:
new_question_df['Tags'] = new_question_df['Tags'].apply(lambda x: most_common(x))
new_question_df['Tags'] = new_question_df['Tags'].apply(lambda x: x if len(x) > 0 else None)
new_question_df.dropna(subset=['Tags'], inplace=True)
new_question_df

Unnamed: 0,Title,Body,Tags
205,Code to ask yes/no question in javascript,<p>I could only find the function <code>confir...,[javascript]
223,Table cells larger than they are meant to be,<p>I've created a map system for a game that r...,[html css]
377,What is the best quick-read Python book out th...,<p>I am taking a class that requires Python. W...,[python]
379,duplicating jQuery datepicker,<p>The <code>datepicker</code> function only w...,[javascript jquery]
471,Writing/Using C++ Libraries,<p>I am looking for basic examples/tutorials o...,[c++]
...,...,...,...
1247925,Placeholder auto wrap inside a input field,<p>I need to put a long <code>placeholder</cod...,[html css]
1249431,Log.wtf vs. Unhandled Exception,"<p>I just learned about <a href=""https://devel...",[android]
1251020,Optimizing the use of arguments inside a function,"<p>In an interview test, for the following cod...",[c++]
1256924,How does a Java if statement work when it has ...,"<p>Why does this if statement, with an assignm...",[java]


In [19]:
new_question_df['Body'] = new_question_df['Body'].apply(lambda x: BeautifulSoup(x).get_text()) 

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tanaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
def clean_text(text):
    text = text.lower()
    text = text.strip(' ')
    return text

token = ToktokTokenizer()
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
lemma = WordNetLemmatizer()
stop_words = set(corpus.stopwords.words("english"))

def strip_list_noempty(mylist):
    newlist = (item.strip() if hasattr(item, 'strip') else item for item in mylist)
    return [item for item in newlist if item != '']

def clean_punct(text): 
    words = token.tokenize(text)
    punctuation_filtered = []
    regex = re.compile('[%s]' % re.escape(punct))
    remove_punctuation = str.maketrans(' ', ' ', punct)
    for w in words:
        if w in tags_features:
            punctuation_filtered.append(w)
        else:
            punctuation_filtered.append(regex.sub('', w))
    filtered_list = strip_list_noempty(punctuation_filtered)
    return ' '.join(map(str, filtered_list))

def lemmatizeWords(text):
    words = token.tokenize(text)
    listLemma = []
    for w in words:
        x = lemma.lemmatize(w, pos="v")
        listLemma.append(x)
    return ' '.join(map(str, listLemma))

def stopWordsRemove(text):
    stop_words = set(corpus.stopwords.words("english"))
    words = token.tokenize(text)
    filtered = [w for w in words if not w in stop_words]
    return ' '.join(map(str, filtered))


In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tanaj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
new_question_df['Body'] = new_question_df['Body'].apply(lambda x: clean_text(x)) 
new_question_df['Body'] = new_question_df['Body'].apply(lambda x: clean_punct(x)) 
new_question_df['Body'] = new_question_df['Body'].apply(lambda x: lemmatizeWords(x))
new_question_df['Body'] = new_question_df['Body'].apply(lambda x: stopWordsRemove(x))

In [31]:
new_question_df['Title'] = new_question_df['Title'].apply(lambda x: str(x))
new_question_df['Title'] = new_question_df['Title'].apply(lambda x: clean_text(x)) 
new_question_df['Title'] = new_question_df['Title'].apply(lambda x: clean_punct(x)) 
new_question_df['Title'] = new_question_df['Title'].apply(lambda x: lemmatizeWords(x)) 
new_question_df['Title'] = new_question_df['Title'].apply(lambda x: stopWordsRemove(x))

In [32]:
new_question_df

Unnamed: 0,Title,Body,Tags
205,code ask yesno question javascript,could find function confirm give okcancel butt...,[javascript]
223,table cells larger mean,create map system game run principle draw pict...,[html css]
377,best quick-read python book,take class require python review language clas...,[python]
379,duplicate jquery datepicker,datepicker function work first input box creat...,[javascript jquery]
471,writingusing c++ libraries,look basic examplestutorials writecompile libr...,[c++]
...,...,...,...
1247925,placeholder auto wrap inside input field,need put long placeholder text inside input fi...,[html css]
1249431,logwtf vs unhandled exception,learn logwtf terrible failure lol wonder use d...,[android]
1251020,optimize use arguments inside function,interview test follow code void getposition du...,[c++]
1256924,java statement work assignment equality check ...,statement assignment equality check evaluate f...,[java]


In [33]:
X1 = new_question_df['Body']
X2 = new_question_df['Title']
y = new_question_df['Tags']

multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(y)
y_bin

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [34]:
vectorizer_X1 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

vectorizer_X2 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

In [35]:
X1_tfidf = vectorizer_X1.fit_transform(X1)
X2_tfidf = vectorizer_X2.fit_transform(X2)

X_tfidf = hstack([X1_tfidf,X2_tfidf])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size = 0.2, random_state = 0)

In [37]:
classi = SGDClassifier()
clf = OneVsRestClassifier(classi)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
ham = hamming_loss(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
print("Hamming Loss: ", ham)
print('Precision: ', prec)
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))

Hamming Loss:  0.03713163064833006
Precision:  0.8769344875925786
Recall:  0.7269155206286837
