In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [91]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [136]:
# Read the category-classification CSV into the working DataFrame.
df = pd.read_csv(
    '../input/coding-project-datazeit/coding_project_category_classification.csv'
)

In [137]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [94]:
# (rows, columns) of the frame currently bound to `df`.
df.shape

In [138]:
# Share of missing values per column, expressed as a percentage of all rows.
row_total = len(df)
Null_percent = df.isna().sum().div(row_total).mul(100)
print("The Percentage of Null Values in the dataset\n", Null_percent)

In [139]:
# Remove exact duplicate rows first, then any row with a missing value.
df = df.drop_duplicates().dropna()
df.shape

In [142]:
import random
print("which category is the Most Occupied\n")

# Keep counts and labels in ONE Series so every wedge is labelled with the
# category it actually represents. Previously the counts were sorted while the
# labels came from `unique()` (order of appearance), so sizes and names did
# not correspond.
category_count = df['category_dz'].value_counts().sort_values()
labels = category_count.index

fig = plt.figure(figsize=[10, 40])
ax = fig.add_subplot(111)

# One colour per category, sampled evenly across the prism colormap.
cmap = plt.cm.prism
colors = cmap(np.linspace(0., 1., len(category_count)))

pie_wedge_collection = ax.pie(category_count, colors=colors, labels=labels, labeldistance=1.05)

# First element of the tuple returned by `ax.pie` is the list of wedge patches.
for pie_wedge in pie_wedge_collection[0]:
    pie_wedge.set_edgecolor('white')

ax.set_title("Figure 3");

In [143]:
import sklearn
from sklearn.preprocessing import LabelEncoder

In [144]:
# Map every category name to an integer id; `le` is kept so predictions can
# be decoded back to names later on.
le = LabelEncoder()
le.fit(df['category_dz'])
df['Target_category'] = le.transform(df['category_dz'])

In [145]:
# Show a preview rather than rendering the entire frame in the notebook output.
df.head()

In [104]:
def cleaning(Inpdata):
    """Normalise a raw description string for tokenisation.

    Steps, in order:
      1. remove URLs (``http(s)://...`` and ``www....``);
      2. remove one- and two-character word tokens;
      3. replace every run of non-letter characters with a single space;
      4. trim leading/trailing spaces and lower-case the result.

    Parameters
    ----------
    Inpdata : str
        Raw text to clean.

    Returns
    -------
    str
        Lower-case text containing only ASCII letters separated by single
        spaces.
    """
    import re

    # URLs go first so their fragments are not left behind as stray words.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', Inpdata)
    # Drop very short tokens (1-2 word characters).
    text = re.sub(r'\b\w{1,2}\b', ' ', text)
    # Replace each run of non-letters (punctuation, digits, whitespace) with
    # ONE space. The earlier version collapsed spaces *before* this step and
    # then deleted double spaces outright, which glued neighbouring words
    # together ("Hello, world" -> "helloworld"). This step also covers the
    # punctuation the old, malformed first pattern was meant to strip.
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    return text.strip().lower()

In [105]:
# Run the text normaliser over every description, element by element.
df['text_clean'] = df['description_en'].map(cleaning)

In [106]:
# Preview the frame now that the `text_clean` column has been added.
df.head()

In [108]:
# Two-column view used for modelling. `df` itself is intentionally NOT
# overwritten: later cells still read `category_dz`, `description_en`,
# `parent_pid` and `domain_tld` from it, and narrowing `df` here made those
# cells fail (or produce all-NaN columns) on a fresh top-to-bottom run.
model_df = df[['text_clean', 'Target_category']]

In [109]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import  DistilBertTokenizerFast,TFDistilBertForSequenceClassification

In [110]:
# Column names consumed by the split / dataset cells.
DATA_COLUMN = 'text_clean'
LABEL_COLUMN = 'Target_category'

# Tokeniser / training hyper-parameters.
MAX_SEQUENCE_LENGTH = 512
# Fine-tuning a pretrained transformer needs a small step size; the previous
# value of 1e-3 is far above the 2e-5 to 5e-5 range recommended for
# BERT-family models and typically wrecks the pretrained weights.
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
NUM_EPOCHS = 25

In [111]:
# Fast tokenizer matching the uncased DistilBERT checkpoint used for the model.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

In [112]:
def tokenize(sentences, max_length=MAX_SEQUENCE_LENGTH, padding='max_length'):
    """Encode text with the module-level `tokenizer` and return TF tensors.

    Parameters
    ----------
    sentences : text input forwarded unchanged to the tokenizer.
    max_length : int, upper bound on sequence length (inputs are truncated).
    padding : padding strategy forwarded to the tokenizer.

    Returns
    -------
    The tokenizer's output, requested as TensorFlow tensors.
    """
    encode_options = dict(
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_tensors="tf",
    )
    return tokenizer(sentences, **encode_options)

In [113]:
# 80/20 shuffled split of texts and labels. `random_state` is pinned so the
# split (and every metric computed from it) is reproducible across runs.
train_data, validation_data, train_label, validation_label = train_test_split(
    df[DATA_COLUMN].tolist(),
    df[LABEL_COLUMN].tolist(),
    test_size=.2,
    shuffle=True,
    random_state=42,
)

In [133]:
# Texts and labels for every row of `df`, as plain Python lists.
full_data, full_target = (
    df[column].tolist() for column in (DATA_COLUMN, LABEL_COLUMN)
)

In [115]:
# Training pipeline: tokenised features + labels, shuffled with a
# 1000-element buffer, batched, and prefetched one batch ahead.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(tokenize(train_data)), train_label))  # BatchEncoding -> plain dict
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .prefetch(1)
)

In [116]:
# Validation pipeline: same encoding as training, but kept in order
# (no shuffle) so predictions line up with `validation_label`.
validation_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(tokenize(validation_data)), validation_label))
    .batch(BATCH_SIZE)
    .prefetch(1)
)

In [134]:
# Pipeline over every row (train + validation), in original row order.
full_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(tokenize(full_data)), full_target))
    .batch(BATCH_SIZE)
    .prefetch(1)
)

In [117]:
# Size the classification head from the fitted label encoder instead of a
# hard-coded 125, so it always matches the number of categories that survived
# de-duplication / NaN removal.
NUM_LABELS = len(le.classes_)
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=NUM_LABELS,
)

In [118]:
# Adam optimiser plus sparse categorical cross-entropy computed on raw logits
# (integer class ids as targets).
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn)

In [120]:
# Train on the pre-batched tf.data pipeline. `y` and `batch_size` are
# deliberately not passed: the dataset already yields (features, label)
# batches, and Keras documents that `batch_size` must not be specified for
# dataset inputs (some TF versions raise on it).
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=NUM_EPOCHS,
)

In [125]:
# Persist the trained model to the working directory.
model.save('./')

# `predict_generator` is deprecated; `predict` accepts a tf.data.Dataset
# directly and returns the same output object.
pred = model.predict(validation_dataset)
# Predicted class id = index of the largest logit per row.
val = np.argmax(pred.logits, axis=1)

In [128]:
# Side-by-side table of true vs. predicted class ids for the validation split.
valid = pd.DataFrame({'Target_category': validation_label})
valid['pred_category'] = val

# 1 where the prediction matches the target, 0 otherwise.
is_hit = valid['Target_category'] == valid['pred_category']
valid['Status'] = np.where(is_hit, 1, 0)

# Percentage of correct rows. Note the message says "Test Data" although the
# rows come from the validation split.
Acc = (valid['Status'].sum() / len(valid)) * 100
print("The Accuracy on Test Data", Acc)

In [132]:
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report, precision_score 
from yellowbrick.datasets import load_occupancy
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.classifier import ClassificationReport

In [150]:
# `predict_generator` is deprecated; `predict` accepts the dataset directly.
full_pred = model.predict(full_dataset)
pred_target = np.argmax(full_pred.logits, axis=1)

# Work on an explicit copy so the prediction columns never leak into `df`.
full_df = df.copy()
full_df['Target_category'] = full_target
full_df['pred_category'] = pred_target
# 1 where the prediction matches the target, 0 otherwise.
full_df['status'] = np.where(full_df['Target_category'] == full_df['pred_category'], 1, 0)

# NOTE(review): this set contains the training rows as well, so the figure is
# not a held-out estimate of generalisation.
Accuracy_on_data = (full_df['status'].sum() / full_df.shape[0]) * 100
print("The Accuracy on Full Dataset", Accuracy_on_data)

In [151]:
# Per-row category names, rebuilt from the encoded target via the fitted
# encoder. This yields the same list as reading `category_dz` directly, but
# does not fail with a KeyError when `df` has been narrowed to the modelling
# columns earlier in the notebook.
classes = le.inverse_transform(df['Target_category']).tolist()

# Cell names of a binary confusion matrix.
group_names = ['True_Neg','False_Pos','False_Neg','True_Pos']

In [152]:
# Support-weighted precision / recall / F1 over the full dataset predictions.
y_true = full_df['Target_category']
y_pred = full_df['pred_category']
for caption, scorer in (
    ("The Precision Score ", precision_score),
    ("The Recall Score", recall_score),
    ("The F1 Score ", f1_score),
):
    print(caption, scorer(y_true, y_pred, average='weighted'))

In [185]:
print("The Classification Report")
fig, ax = plt.subplots(figsize=(10,40))

# Class ids and their names both come from the fitted encoder, so they are
# aligned by construction. The earlier version read the names from
# `output_df`, which is only created in a later cell (NameError on a fresh
# top-to-bottom run).
class_ids = np.arange(len(le.classes_))
class_names = [str(name) for name in le.classes_]
class_report = classification_report(
    full_df['Target_category'],
    full_df['pred_category'],
    labels=class_ids,
    target_names=class_names,
    output_dict=True,
)

# Drop the last row of the report frame ('support') so the colour scale covers
# only the 0-1 metrics; transpose to get one row per class.
sns.heatmap(pd.DataFrame(class_report).iloc[:-1, :].T, annot=True, linewidths=.5, ax=ax);

In [202]:
print("The Confusion Matrix")
fig, ax = plt.subplots(figsize=(40,50))

# Fix the label order explicitly and name rows/columns from the same encoder
# ordering. Previously the column names came from `unique()` (order of
# appearance) while the matrix is ordered by sorted label id, so names and
# counts were misaligned; the names were also read from `output_df`, which is
# defined in a later cell.
class_ids = np.arange(len(le.classes_))
conf_matrix = confusion_matrix(
    full_df['Target_category'],
    full_df['pred_category'],
    labels=class_ids,
)
conf_matrix_df = pd.DataFrame(conf_matrix, index=le.classes_, columns=le.classes_)

# fmt='d' prints integer counts instead of the default '.2g' notation.
sns.heatmap(conf_matrix_df, annot=True, fmt='d', linewidths=.5, ax=ax)
ax.set_xlabel('Predicted category')
ax.set_ylabel('True category');


In [177]:
# Export frame: the identifying/source columns, with the model's predicted
# class id placed at column position 3.
export_columns = ['parent_pid', 'domain_tld', 'category_dz', 'description_en']
output_df = pd.DataFrame(full_df, columns=export_columns)
output_df.insert(loc=3, column='category_model', value=full_df['pred_category'])

In [179]:
# Replace the numeric predictions with their category names, then write the
# result next to the notebook without the index column.
decoded_predictions = le.inverse_transform(full_df['pred_category'])
output_df['category_model'] = decoded_predictions
output_df.to_csv('./output.csv', index=False)