In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime as datetime
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from wordcloud import WordCloud

warnings.filterwarnings("ignore")

In [None]:
# Path of the training CSV, relative to the notebook's working directory.
TRAIN_CSV_PATH = "../input/facebook-recruiting-iii-keyword-extraction/Train/Train.csv"

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Performing EDA

In [None]:
# Size of the loaded frame as (rows, columns).
df.shape


In [None]:
# The dataset is large, so only its first 50k rows are kept for training.
df = df.head(50000)
df.shape

#data['Tags'].value_counts() = 38783

In [None]:
# Checking for duplicates.
# `duplicated` flags every occurrence of a title after the first one, so the
# earliest row (in the frame's current order) of each title is the one kept.
# The mask is computed on `df` itself: it is already aligned with `df`'s index,
# and which copy survives does not depend on how a sort orders equal titles.
duplicates = df.duplicated('Title')
print("Total number of duplicate questions : ", duplicates.sum())
df = df[~duplicates]
print("Dataframe shape without duplicates: ", df.shape)

In [None]:
# Counting the number of tags per row (question) in the training data. split() separates the
# string into a list, and we count the length of that list.
def count_tags(x):
    """Return the number of whitespace-separated tags in ``x``.

    Non-string values (for example the float NaN that pandas uses for a
    missing ``Tags`` entry) count as zero tags instead of raising
    ``AttributeError``; this matters because the function is applied before
    rows with missing values are dropped.
    """
    if not isinstance(x, str):
        return 0
    return len(x.split())
# Store the per-question tag count in a new column, then print that column.
df["tag_count"] = df["Tags"].map(count_tags)
print("tag count:", df["tag_count"])

In [None]:
# How many questions carry 1, 2, 3, ... tags
tag_count_distribution = df['tag_count'].value_counts()
print(tag_count_distribution)
# Mean number of tags per question
mean_tags_per_question = df['tag_count'].mean()
print(mean_tags_per_question)

# we can observe on avg there are three tags per question.

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
#vectorizer = CountVectorizer(tokenizer= lambda text : text.split(" "))
#tag_dtm = vectorizer.fit_transform(df["Tags"])

In [None]:
# Term Frequency-Inverse Document Frequency (TF-IDF) model over the tag strings.
def _split_on_single_space(text):
    """Tokenize a tag string by splitting it on single space characters."""
    return text.split(" ")


tv = TfidfVectorizer(tokenizer=_split_on_single_space)
tag_dtm = tv.fit_transform(df["Tags"])

In [None]:
# Vocabulary of the fitted vectorizer, i.e. one entry per distinct tag.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new accessor when it exists and fall back
# to the old one on older installations.
if hasattr(tv, "get_feature_names_out"):
    tags = list(tv.get_feature_names_out())
else:
    tags = tv.get_feature_names()

In [None]:
# Column-wise sum of the document-term matrix, flattened to 1-D: one value per tag.
freqs = np.asarray(tag_dtm.sum(axis=0)).ravel()
# Map each tag to its summed value.
result = {tag: freq for tag, freq in zip(tags, freqs)}

In [None]:
# Two-column table of (tag, value). "Counts" holds the summed TF-IDF weight
# stored in `result`, not a raw occurrence count.
tag_rows = list(result.items())
tag_df = pd.DataFrame(tag_rows, columns=["Tags", "Counts"])
tag_df

In [None]:
# Order tags from the largest "Counts" value to the smallest and keep the
# sorted values as a plain array.
tag_df_sorted = tag_df.sort_values(by='Counts', ascending=False)
tag_counts = tag_df_sorted["Counts"].to_numpy()

In [None]:
# Word cloud in which each tag is sized by its value in `result`.
wordcloud = WordCloud(background_color='black', width=1400, height=800)
wordcloud = wordcloud.generate_from_frequencies(result)

plt.figure(figsize=(30, 20))
plt.imshow(wordcloud)
plt.show()

In [None]:
# Bar chart of the 30 highest-ranked tags, with the tag names as x tick labels.
i = np.arange(30)
top_tag_rows = tag_df_sorted.head(30)
top_tag_rows.plot.bar()
plt.xticks(i, tag_df_sorted['Tags'].head(30))
plt.show()

In [None]:
#Stripping HTML and stop words
def striphtml(data):
    """Return ``str(data)`` with every ``<...>`` tag replaced by one space.

    The pattern is non-greedy and does not match across newlines, so a tag
    that spans several lines is left untouched.
    """
    return re.sub(r'<.*?>', ' ', str(data))

# English stop words as a set, so membership tests are fast.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)
# Snowball stemmer configured for English.
stemmer = SnowballStemmer(language='english')

**Text Preprocessing**

In [None]:
# Patterns used while cleaning each question.
_CODE_BLOCK_RE = re.compile(r'<code>(.*?)</code>', flags=re.MULTILINE | re.DOTALL)
# DOTALL so that a tag spanning several lines is removed as well.
_HTML_TAG_RE = re.compile(r'<.*?>', flags=re.DOTALL)
_NON_LETTERS_RE = re.compile(r'[^A-Za-z]+')


def _preprocess_question(title, body):
    """Return the cleaned, stemmed text for one question.

    Steps: remove ``<code>...</code>`` sections from the body, replace the
    remaining tags in the body with spaces, join title and body, replace every
    run of non-letters with a space, lowercase, tokenize, drop stop words and
    single-character tokens (except ``'c'``), and stem what is left.

    The text is processed as ``str`` throughout. Converting it with
    ``str(text.encode('utf-8'))`` would yield the ``b'...'`` repr, whose
    escaped ``\\n`` / ``\\xNN`` sequences leak stray letters into the tokens.
    """
    body = _CODE_BLOCK_RE.sub('', body)
    body = _HTML_TAG_RE.sub(' ', body)
    text = _NON_LETTERS_RE.sub(' ', title + " " + body)
    tokens = word_tokenize(text.lower())
    return ' '.join(
        str(stemmer.stem(token))
        for token in tokens
        if token not in stop_words and (len(token) != 1 or token == 'c')
    )


question_list = []
questions_with_code = 0
length_preprocessing = 0
length_postprocessing = 0
# Iterate over the two needed columns directly; this avoids building a Series per row.
for title, body in zip(df["Title"], df["Body"]):
    if '<code>' in body:
        questions_with_code += 1
    length_preprocessing += len(title) + len(body)
    question = _preprocess_question(title, body)
    question_list.append(question)
    length_postprocessing += len(question)
df["question"] = question_list
avg_len_before_preprocessing=(length_preprocessing*1.0)/df.shape[0]
avg_len_after_preprocessing=(length_postprocessing*1.0)/df.shape[0]

print( "Avg. length of questions(Title+Body) before Text preprocessing: ", avg_len_before_preprocessing)
print( "Avg. length of questions(Title+Body) post Text processing: ", avg_len_after_preprocessing)
print ("% of questions containing code: ", (questions_with_code*100.0)/df.shape[0])


In [None]:
# Keep only the cleaned question text and its tags for modelling.
preprocessed_df = df.loc[:, ["question", "Tags"]]
print("Shape of preprocessed data :", preprocessed_df.shape)

**ML Modelling**

In [None]:
# Binary tag-indicator matrix: one row per question, one column per distinct tag.
# `binary` must be a real boolean: the string 'true' only worked because it is
# truthy, and scikit-learn versions that validate parameters reject it.
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary=True)
#vectorizer=TfidfVectorizer(tokenizer= lambda text : text.split(" "), binary=True)
y_multilabel = vectorizer.fit_transform(preprocessed_df['Tags'])

In [None]:
# Dimensions of the label matrix as (rows, columns).
y_multilabel.shape

In [None]:
def tags_to_select(n):
    """Return the columns of ``y_multilabel`` for the ``n`` most frequent tags.

    Column totals of the module-level ``y_multilabel`` are ranked from largest
    to smallest (ties keep their original column order) and the first ``n``
    columns of that ranking are returned, in ranked order.
    """
    tag_totals = y_multilabel.sum(axis=0).tolist()[0]
    ranked_columns = sorted(range(len(tag_totals)), key=tag_totals.__getitem__, reverse=True)
    return y_multilabel[:, ranked_columns[:n]]

def questions_considered_fn(n):
    """Return how many rows have none of the ``n`` most frequent tags.

    Despite the name, the result is the number of rows whose sum over the
    selected tag columns is zero, i.e. rows left uncovered by those tags.
    """
    tags_per_row = tags_to_select(n).sum(axis=1)
    return np.count_nonzero(tags_per_row == 0)

In [None]:
total_tags = y_multilabel.shape[1]
total_qs = preprocessed_df.shape[0]

# For 1000, 1100, 1200, ... selected tags: percentage of questions (rounded to
# 3 decimals) that have at least one of the selected tags.
question_explained = [
    np.round(((total_qs - questions_considered_fn(n_selected)) / total_qs) * 100, 3)
    for n_selected in range(1000, total_tags, 100)
]

In [None]:
# x positions are the tag counts at which `question_explained` was evaluated:
# 1000, 1100, 1200, ... (one per entry), so the axis shows real tag counts
# instead of hand-written tick labels.
n_tags_grid = [1000 + 100 * step for step in range(len(question_explained))]

fig, ax = plt.subplots()
ax.plot(n_tags_grid, question_explained)
plt.xlabel("Number of tags")
# The plotted values are percentages of questions, not counts.
plt.ylabel("% of questions covered partially")
plt.grid()
plt.show()
# You can choose any number of tags based on your computing power.
# Entry 0 of `question_explained` is the one computed for 1000 tags.
if question_explained:
    print("with ",1000,"tags we are covering ",question_explained[0],"% of questions")

In [None]:
# Keep the label columns of the 1000 most frequent tags, and report how many
# questions have none of those same 1000 tags.
yx_multilabel = tags_to_select(1000)
print("number of questions that are not covered :", questions_considered_fn(1000),"out of ", total_qs)

In [None]:
# Dimensions of the reduced label matrix as (rows, columns).
yx_multilabel.shape

In [None]:
#splitting data
total_size=preprocessed_df.shape[0]
train_size=int(0.80*total_size)

x_train=preprocessed_df.head(train_size)
x_test=preprocessed_df.tail(total_size - train_size)

y_train = yx_multilabel[0:train_size,:]
y_test = yx_multilabel[train_size:total_size,:]

In [None]:
# Report the shape of the label matrix for each part of the split.
for split_label, split_targets in (
    ("Number of data points in train data :", y_train),
    ("Number of data points in test data :", y_test),
):
    print(split_label, split_targets.shape)

In [None]:
#Featurizing the data
tfidf_vect = TfidfVectorizer(min_df=0.00009,max_features=200000,smooth_idf=True,norm='l2',\
               tokenizer=lambda x : x.split(),sublinear_tf=False, ngram_range=(1,3) )

In [None]:
# Fit the vectorizer on the training questions only, then reuse the fitted
# vectorizer (no refit) to transform the test questions.
x_train_vectors = tfidf_vect.fit_transform(x_train['question'])
x_test_vectors = tfidf_vect.transform(x_test['question'])

In [None]:
#Applying Logistic Regression with OneVsRest Classifier
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_vectors,y_train)

In [None]:
# Predict the label matrix for the test questions.
predictions = classifier.predict(x_test_vectors)

In [None]:
# Evaluate the predictions: accuracy, macro/micro F1 and Hamming loss.
test_accuracy = metrics.accuracy_score(y_test, predictions)
print("accuracy ", test_accuracy)

test_macro_f1 = metrics.f1_score(y_test, predictions, average='macro')
print("macro f1 score ", test_macro_f1)

test_micro_f1 = metrics.f1_score(y_test, predictions, average='micro')
print("micro f1 score ", test_micro_f1)

test_hamming_loss = metrics.hamming_loss(y_test, predictions)
print("hamming loss ", test_hamming_loss)

**OneVsRest Classifier with Linear SVM (Loss-Hinge)**

In [None]:
# `datetime` is bound to the module here (see the import cell), so the class
# has to be reached as `datetime.datetime`.
start = datetime.datetime.now()

# Linear SVM (hinge loss) trained with SGD, one binary classifier per tag,
# fitted on the TF-IDF features built earlier in this notebook.
classifier_5 = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier_5.fit(x_train_vectors, y_train)

# Saving the Classifier
joblib.dump(classifier_5, 'lr_with_more_title_weight_5.pkl')

# Loading the Classifier
# NOTE(review): this cell used to load a pickle from
# '../input/d/elemento/facebook-recruiting-iii-keyword-extraction/'. That file
# is not produced by this notebook, so it cannot be assumed to match the
# features of `tfidf_vect`, and unpickling a file from an untrusted source can
# execute arbitrary code. Reload the file written just above instead.
classifier_5 = joblib.load('lr_with_more_title_weight_5.pkl')
predictions = classifier_5.predict(x_test_vectors)

print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))

precision = precision_score(y_test, predictions, average='micro')
recall = recall_score(y_test, predictions, average='micro')
f1 = f1_score(y_test, predictions, average='micro')

print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = precision_score(y_test, predictions, average='macro')
recall = recall_score(y_test, predictions, average='macro')
f1 = f1_score(y_test, predictions, average='macro')

print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.datetime.now() - start)