# **StackOverflow Top10 tags**

The aim here is to build a model that predicts the tag associated with a question's text. (We build the model for the 10 most frequent tags only.)

In [1]:
 # Importing data sets 

In [2]:
!pip install -U -q PyDrive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
downloaded= drive.CreateFile({'id': '1h4Xd0WgRd14Nv0PRVypkqOt11hNteCnS'})
downloaded.GetContentFile('stacksample.zip')

In [None]:
!unzip stacksample.zip

In [None]:
#Importing required packages.
import numpy as np
import pandas as pd

In [None]:
questions_data= pd.read_csv('Questions.csv', encoding='iso-8859-1')
tags_data= pd.read_csv('Tags.csv', encoding='iso-8859-1')

In [None]:
# The ten most frequent tags in the whole Tags table, most frequent first.
tag_frequencies = tags_data['Tag'].value_counts()
top_10 = tag_frequencies.index[:10].tolist()

In [None]:
top_10

In [None]:
tags_data= tags_data[tags_data['Tag'].isin(top_10)]

In [None]:
questions= pd.merge(tags_data, questions_data, how='inner', on=['Id'])

# **Data Preprocessing**

In [None]:
#Now we will keep only questions with a score greater than 5, for two reasons:
  #1- it will require less computational resources.
  #2- The posts will probably be with a better quality and will be better tagged since they have lots of upvotes.

In [None]:
questions.shape

In [None]:
questions= questions[questions['Score']>5]

In [None]:
questions.shape

In [None]:
questions.head(3) #Here we see that [Id, OwnerUserId, CreationDate, ClosedDate, Score] will have no impact on predicting the tags;
                  #hence we will not consider those for prediction.

In [None]:
# Keep only the label and the two text fields used for prediction.
# .copy() detaches the result from the filtered frame it was sliced from, so the
# column assignments in the cleaning cells below write to a real frame instead
# of triggering pandas' SettingWithCopyWarning.
questions= questions[['Tag','Title','Body']].copy()

In [None]:
questions.head(3)

In [None]:
# We will check whether we have any null values in the data set
questions.isnull().mean(axis=0) #No null values

In [None]:
#We will check whether we have any duplicate values in the data set
questions.duplicated().sum() #No duplicate values

In [None]:
# #In the next two columns: Body and Title, we will use lots of text processing:
#     Removing html format
#     Lowering text
#     Transforming abbreviations
#     Removing punctuation (but keeping words like c# since it's the most popular tag)
#     Lemmatizing words
#     Removing stop words

In [None]:
#Importing the required package.
import bs4
from bs4 import BeautifulSoup
import re

In [None]:
#Converting html to text in the BODY column.
# The parser is named explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the extracted text could vary by machine.
questions['Body']= questions['Body'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

In [None]:
#Converting html to text in the TITLE column.
# Same explicit parser as for the body so both columns are processed identically.
# get_text() already returns a str, so no separate str() pass is needed afterwards.
questions['Title']= questions['Title'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

In [None]:
#Defining a function to clean the texts.
def clean_text(text):
    """Lower-case ``text``, expand common English contractions and collapse whitespace.

    Args:
        text: the raw string to normalise.

    Returns:
        The lower-cased string with contractions expanded, every run of
        whitespace replaced by a single space, and leading/trailing spaces removed.
    """
    # Rules are applied in order, so specific patterns must precede the generic
    # ones that would otherwise consume them: "'scuse" before "'s",
    # "won't"/"can't" before "n't".
    substitutions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"won't", "will not "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'\n", " "),
        (r"\'\xa0", " "),
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        (r"\s+", " "),
    )
    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip(' ')

In [None]:
# Normalise every question body with clean_text (lower-casing, contractions, whitespace).
questions['Body'] = questions['Body'].apply(clean_text)

In [None]:
# Normalise every question title with clean_text (lower-casing, contractions, whitespace).
questions['Title'] = questions['Title'].apply(clean_text)

In [None]:
questions.head(3)

# **Model Building**

In [None]:
#Importing the required packages.
import nltk
nltk.download('punkt')
from nltk import word_tokenize

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, GRU, concatenate
from keras.models import Model

In [None]:
questions_train, questions_test= train_test_split(questions, test_size=0.2, random_state=100)

In [None]:
# Text inputs: title and body are fed to the model as two separate branches.
x_train_t= questions_train['Title']
x_train_b= questions_train['Body']
# One-hot encode the tag; the column order of y_train defines the order of the
# model's output units.
y_train= pd.get_dummies(questions_train['Tag'])

x_test_t= questions_test['Title']
x_test_b= questions_test['Body']
# Encode the test labels and force them onto exactly the training columns, so a
# tag missing from one split cannot shift or drop a column and misalign the
# targets with the model outputs.
y_test= pd.get_dummies(questions_test['Tag']).reindex(columns=y_train.columns, fill_value=0)

***For Title...***

In [None]:
# Token count of every training title, used to choose a padding length.
sent_lens_t = [len(word_tokenize(title)) for title in questions_train['Title']]

max(sent_lens_t)

In [None]:
np.quantile(sent_lens_t,0.95)

In [None]:
#95% of the training titles are at most 17 tokens long, so we set the maximum title length to 17
max_len_t=17

In [None]:
# Word-level tokenizer for titles. It is fitted on the training titles only and
# then reused, unchanged, to encode the test titles.
# NOTE(review): Tokenizer's default `filters` presumably strip characters such as
# '#' and '+', which would make "c#" and "c++" both map to "c" — confirm.
tok_t = Tokenizer(split=' ', char_level=False)
tok_t.fit_on_texts(x_train_t)
sequences_train_t, sequences_test_t = (
    tok_t.texts_to_sequences(texts) for texts in (x_train_t, x_test_t)
)

In [None]:
# Size of the vocabulary learned by the title tokenizer.
vocab_len_t = len(tok_t.index_word)
vocab_len_t

In [None]:
sequences_matrix_train_t= sequence.pad_sequences(sequences_train_t, maxlen= max_len_t)
sequences_matrix_test_t= sequence.pad_sequences(sequences_test_t, maxlen= max_len_t) #Applying the same for Test data.

In [None]:
sequences_matrix_train_t

In [None]:
sequences_matrix_test_t

***For Body...***

In [None]:
# Token count of every training body, used to choose a padding length.
sent_lens_b = [len(word_tokenize(body)) for body in questions_train['Body']]

max(sent_lens_b)

In [None]:
np.quantile(sent_lens_b, 0.92)

In [None]:
#92% of the training bodies are at most about 495 tokens long, so we round up and set the maximum body length to 500
max_len_b=500

In [None]:
# Word-level tokenizer for bodies. It is fitted on the training bodies only and
# then reused, unchanged, to encode the test bodies.
tok_b = Tokenizer(split=' ', char_level=False)
tok_b.fit_on_texts(x_train_b)
sequences_train_b, sequences_test_b = (
    tok_b.texts_to_sequences(texts) for texts in (x_train_b, x_test_b)
)

In [None]:
# Size of the vocabulary learned by the body tokenizer.
vocab_len_b = len(tok_b.index_word)
vocab_len_b

In [None]:
sequences_matrix_train_b= sequence.pad_sequences(sequences_train_b, maxlen= max_len_b)
sequences_matrix_test_b= sequence.pad_sequences(sequences_test_b, maxlen= max_len_b)     #Applying the same for Test data.

In [None]:
sequences_matrix_train_b

In [None]:
sequences_matrix_test_b

Verifying the shapes of the matrices...

In [None]:
sequences_matrix_train_t.shape, sequences_matrix_train_b.shape, y_train.shape

In [None]:
sequences_matrix_test_t.shape, sequences_matrix_test_b.shape, y_test.shape

In [None]:
print(max_len_t)
print(max_len_b)

In [None]:
#Downloading the Embedding to use the pre-trained weights.. 
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
#unzipping it..
!unzip glove.twitter.27B.zip

In [None]:
# Parse the 200-dimensional GloVe Twitter vectors into a {token: vector} lookup.
embeddings_index= {}
# `with` closes the file even if a line fails to parse; the encoding is given
# explicitly so the result does not depend on the platform's default encoding.
with open('glove.twitter.27B.200d.txt', encoding='utf-8') as glove_file:
  for line in glove_file:
    values= line.split()
    if not values:
      continue  # skip blank lines
    word= values[0]
    coefs= np.asarray(values[1:], dtype='float32')
    # Keep only rows with the expected dimensionality, so a malformed line can
    # never be copied into a 200-wide embedding-matrix row later on.
    if coefs.shape[0] != 200:
      continue
    embeddings_index[word]= coefs

In [None]:
# Row i holds the pre-trained vector of the title word whose tokenizer index is i.
# Words without a pre-trained vector keep their all-zero row.
title_vocab = tok_t.word_index
embedding_matrix_t = np.zeros((len(title_vocab) + 1, 200))
for token, row in title_vocab.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix_t[row] = vector

In [None]:
# Row i holds the pre-trained vector of the body word whose tokenizer index is i.
# Words without a pre-trained vector keep their all-zero row.
body_vocab = tok_b.word_index
embedding_matrix_b = np.zeros((len(body_vocab) + 1, 200))
for token, row in body_vocab.items():
  vector = embeddings_index.get(token)
  if vector is None:
    continue
  embedding_matrix_b[row] = vector

# **Model Creation**

In [None]:
def RNN():
  """Build the two-input (title + body) LSTM tag classifier.

  Reads the notebook-level globals ``max_len_t``/``max_len_b`` (padded sequence
  lengths), ``vocab_len_t``/``vocab_len_b`` (tokenizer vocabulary sizes) and
  ``embedding_matrix_t``/``embedding_matrix_b`` (frozen 200-d embedding weights).

  Returns:
    An uncompiled ``Model`` with inputs ``title_input`` and ``body_input`` and
    outputs ``[main_output, auxiliary_output]``, each a 10-way softmax.
  """
  #Title only...
  title_input= Input(shape=[max_len_t], name='title_input')
  title_Embed= Embedding(vocab_len_t+1, 200, weights=[embedding_matrix_t], trainable=False, input_length= max_len_t, mask_zero=True, name='title_Embed') (title_input)
  lstm_out_t= LSTM(100) (title_Embed)
  #Auxiliary output to tune LSTM weights smoothly...
  # softmax (not sigmoid): the targets are one-hot and the notebook trains with
  # categorical_crossentropy, which expects a probability distribution per sample.
  auxiliary_output= Dense(10, activation= 'softmax', name= 'auxiliary_output') (lstm_out_t)

  #Body only...
  body_input= Input(shape=[max_len_b], name='body_input')
  body_Embed= Embedding(vocab_len_b+1, 200, weights=[embedding_matrix_b], trainable=False, input_length= max_len_b, mask_zero=True, name='body_Embed') (body_input)
  lstm_out_b= LSTM(100) (body_Embed)

  #Combined with LSTM output...
  com= concatenate([lstm_out_t, lstm_out_b])

  #Now combined data is being fed to dense layers...
  dense1= Dense(50, activation='relu') (com)
  dp1= Dropout(0.3) (dense1)
  bn= BatchNormalization() (dp1)
  dense2= Dense(30, activation='relu') (bn)
  # Same reasoning as for the auxiliary head: one class per row, so softmax.
  main_output= Dense(10, activation='softmax', name='main_output') (dense2)

  model= Model(inputs=[title_input, body_input], outputs=[main_output, auxiliary_output])
  return model

In [None]:
model= RNN()
model.summary()

In [None]:
model.compile(optimizer='adam', loss={'main_output':'categorical_crossentropy', 'auxiliary_output': 'categorical_crossentropy'}, metrics=['accuracy'])

**Defining a ModelCheckpoint to save the model weights every 10 epochs...**

In [None]:
from keras.callbacks import ModelCheckpoint
import os

# Folder that receives the periodic weight snapshots.
output_folder= './stackoverflow_output'
# exist_ok=True creates the folder when missing and is a no-op otherwise, which
# removes the separate check-then-create step.
os.makedirs(output_folder, exist_ok=True)

# The file name embeds the epoch number and the main output's training accuracy.
# NOTE(review): the metric key 'main_output_acc' depends on the Keras version
# (newer releases may report it as 'main_output_accuracy') — confirm.
filepath= os.path.join(output_folder, "weights-{epoch:02d}-accuracy-{main_output_acc:.4f}.h5")
checkpoint= ModelCheckpoint(filepath, verbose=1, monitor='main_output_acc',
                            save_best_only=False,
                            save_weights_only=True,
                            mode='auto', period=10) #This will save the weights every 10 epochs

In [None]:
# Train both heads against the same one-hot tag targets.
# validation_data is passed as an (inputs, targets) tuple, the documented form;
# a list is not accepted consistently across Keras versions.
# NOTE(review): the held-out test split is used as validation data here, so
# scores later reported on that split are not from untouched data.
results= model.fit({'title_input': sequences_matrix_train_t, 'body_input': sequences_matrix_train_b},
                   {'main_output': y_train, 'auxiliary_output': y_train},
                   validation_data= ({'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b},
                                     {'main_output': y_test, 'auxiliary_output': y_test}),
                   epochs= 80, batch_size= 1000, callbacks=[checkpoint]
                   )

In [None]:
#Now load the saved weights for prediction..
# The path is built from output_folder, where the checkpoints were written,
# instead of the Colab-only absolute '/content/...' location.
# NOTE(review): the accuracy in the file name comes from one specific training
# run; adjust it to the checkpoint actually produced.
model.load_weights(os.path.join(output_folder, 'weights-30-accuracy-0.8252.h5'))

In [None]:
#Predicting the model..
(predict_main, predict_auxiliary)= model.predict({'title_input': sequences_matrix_test_t, 'body_input': sequences_matrix_test_b}, verbose=1)

In [None]:
#Importing required packages...
from sklearn.metrics import classification_report, f1_score

In [None]:
print(f1_score(y_test, predict_main>0.55, average='weighted'))

In [None]:
print(classification_report(y_test, predict_main>0.55))

In [None]:
predict_main[24].round(decimals=2)

In [None]:
# Tag for each prediction column. The model's outputs follow the column order of
# the one-hot training labels, which is not the frequency order stored in top_10.
list(y_train.columns)

In [None]:
# Preview the first few prediction rows (the original empty subscript `[]` was a SyntaxError).
predict_main[:5].round(decimals=2)

In [None]:
#Saving the model..
model.save('./stackoverflow_tags_model.h5')