In [1]:
import numpy as np
import pandas as pd
import ssl
import copy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import random
from pprint import pprint

from sklearn.metrics import hamming_loss, roc_auc_score
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split 

from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import wordcloud

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer


import scikitplot as skplt
import matplotlib.pyplot as plt

from keras import optimizers
from keras.losses import binary_crossentropy
from keras.metrics import binary_accuracy
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

from scipy.special import softmax

ssl._create_default_https_context = ssl._create_unverified_context

Using TensorFlow backend.


In [2]:
def clean_statement(statement):
#     x = re.sub('-', ' ', x)
    statement = re.sub('$', ' ', statement)
    statement = re.sub('[^A-Za-z]+', ' ', statement)
    statement = re.sub('[,|.|?|\n]|\t', '', statement)
    statement = re.sub('n\'t', ' ', statement)
    statement = re.sub('submission|submissions|Submission|submission|th ', '', statement)
    statement = re.sub('one|two|given|need', '', statement)
    
    return statement

In [3]:
def process_problem_statement(q_statement):
    
    q_statement = clean_statement(q_statement)
    
#     q_statement = re.sub('[^A-Za-z]+', ' ', q_statement)
    
    tokens = word_tokenize(q_statement)
    
    stoplist = set(stopwords.words('english'))
    
    word_list = [i for i in q_statement.lower().split() if i not in stoplist]
    
    ps = PorterStemmer()
    
#     word_list = [ps.stem(word) for word in word_list]
    
    q_statement = ' '.join(word_list)
    
#     print(q_statement)
    
    return q_statement

In [4]:
def process_problem_solution(solution):
    
#     solution = clean_statement(solution)
    
    tokens = word_tokenize(solution)
    
    stoplist = set(stopwords.words('english'))
    
    word_list = [i for i in solution.lower().split() if i not in stoplist]
    
#     ps = PorterStemmer()
    
#     word_list = [ps.stem(word) for word in word_list]
    
    solution = ' '.join(word_list)
    
#     print(q_statement)
    
    return solution

In [5]:
def process_time_taken(time_col):
#     print(time_col.split())
    return time_col.split()[0]

In [6]:
def process_tags(all_tags_list,tag_col):
    
#     print(tag_col)
    tags_present = list(re.split(',',tag_col))
    
    
    tags_set = set(tags_present)
    tags_diff = tags_set.difference(set(all_tags_list))
    
    new_set = tags_set.difference(tags_diff)
#     print(new_set)
    return list(new_set)
    

In [7]:
def get_all_distinct_tags(tags_col):
    
    tags_list = []
    
    t_sets = set(tags_list)
    
    for row in tags_col:
#         print(row)
        t_list = re.split(',',row)
#         print(t_list)
        t_sets = t_sets.union(set(t_list))
#         print(t_sets)
    tags_list = list(t_sets)
    
    stoplist = set(stopwords.words('english'))
    
    word_list = [i for i in tags_list if i not in stoplist]
    
    return tags_list

In [8]:
# tag list obtained from the dataset
# global tags_list

tags_list = ['dsu', 'trees', 'chinese remainder theorem', 'sortings', 'games', 'implementation', 'bitmasks',
              '*special', 'hashing', 'geometry', 'two pointers', 'combinatorics', 'flows', 'strings',
              'probabilities', 'data structures', 'ternary search', 'greedy', 'math', 'matrices',
              'divide and conquer', 'dfs and similar', 'constructive algorithms', 'brute force', 'dp',
              '2-sat', 'graph matchings', 'binary search', 'number theory', 'graphs', 'fft', 'shortest paths',
              'schedules', 'meet-in-the-middle', 'string suffix structures', 'expression parsing']



In [9]:
def data_preprocessing():
    
    df = pd.read_csv("codeforces_question_v4.csv")
    df = df.drop(['id','name','author'],axis = 1)
    df = df[df.solution != "no code found"]
    
    global distinct_tags
    
    distinct_tags = get_all_distinct_tags(df["tags"])
    
    df["problem statement"] = [process_problem_statement(x) for x in df["problem statement"]]
    df["solution"] = [process_problem_solution(x) for x in df["solution"]]
    df["time_taken"] = [process_time_taken(x) for x in df["time_taken"]]
    
    X = copy.deepcopy(df["solution"])
    Y = [process_tags(distinct_tags,x) for x in df["tags"]]
    
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(Y)
    
    
    return X, Y, mlb

In [117]:
#validation_fraction = 0.2,early_stopping = True,learning_rate = 'adaptive',eta0 = 0.001,verbose = 2

global distinct_tags

X,Y, mlb = data_preprocessing()

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.2, random_state = 0)


classifier = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1,2),binary = True)),
    ('tfidf', TfidfTransformer(norm = 'l2',sublinear_tf = True)),
    ('clf', OneVsRestClassifier(LinearSVC(penalty="l2",loss="squared_hinge",tol=1
            ,random_state=0, max_iter=1000,C = 0.5)))])


classifier.fit(X_train, Y_train)

predicted = classifier.predict(X_train)
y_labels_predicted = mlb.inverse_transform(predicted)
y_labels_actual = mlb.inverse_transform(Y_train)

print("On Train data")
print("hamming_loss: ",hamming_loss(Y_train,predicted))
print("recall_score: ",recall_score(Y_train,predicted,average = 'weighted'))
print("precision_score: ",precision_score(Y_train,predicted,average = 'weighted'))
print("f1_score: ",f1_score(Y_train,predicted,average = 'weighted'))
print("roc_auc_score: ",roc_auc_score(Y_train,predicted,average = 'weighted'))
print("confusion_matrix: ",confusion_matrix(Y_train,predicted))

print()
print()

# print("Actual vs Predicted")

# for item, labels in zip(y_labels_actual, y_labels_predicted):
#         print('{0} => {1}'.format(item, ', '.join(labels)))

# print()
# print()


print("On Validation data")
predicted = classifier.predict(X_validation)
y_labels_predicted = mlb.inverse_transform(predicted)
y_labels_actual = mlb.inverse_transform(Y_validation)
print(predicted)
print("hamming_loss: ",hamming_loss(Y_validation,predicted))
print("recall_score: ",recall_score(Y_validation,predicted,average = 'weighted'))
print("precision_score: ",precision_score(Y_validation,predicted,average = 'weighted'))
print("f1_score: ",f1_score(Y_validation,predicted,average = 'weighted'))
# print("roc_auc_score: ",roc_auc_score(predicted,predicted,average = 'weighted'))
print("confusion_matrix: ",confusion_matrix(Y_validation,predicted))
print()
print()
# print("Actual vs Predicted")

# for item, labels in zip(y_labels_actual, y_labels_predicted):
#         print('{0} => {1}'.format(item, ', '.join(labels)))


        
        
# classifier2 = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', LinearSVC(penalty="l2",loss="squared_hinge",dual=True,tol=0.0000001, C=1.0, 
#             multi_class="ovr",random_state=0, max_iter=10000))])

# print(X_train.shape)

        
        
# print()
# print()

# actual_y = []
# predicted_list = []

# for index in range(len(distinct_tags)-1):
    
#     print('Processing tag: {}'.format(distinct_tags[index]))
#     classifier2.fit(X_train, Y[:,index])
#     predicted = classifier2.predict(X_test)
    
# #     print(predicted)
# #     print("##########################################################")
# #     print(target_names.iloc[:,index])
#     #     print(mlb.fit_transform(target_names)[:,index])
    
# #     actual_y.append(mlb.fit_transform(target_names)[:,index])
# #     predicted_list.append(predicted)
    
#     print('Test accuracy is {}'.format(accuracy_score(mlb.fit_transform(target_names)[:,index], predicted)))
#     print('Test recall_score is {}'.format(recall_score(mlb.fit_transform(target_names)[:,index], predicted)))
#     print('Test precision_score is {}'.format(precision_score(mlb.fit_transform(target_names)[:,index], predicted)))
#     print('Test f1_score is {}'.format(f1_score(mlb.fit_transform(target_names)[:,index], predicted)))
    
#     print()
    
#     y_true = mlb.fit_transform(target_names)[:,index]
#     y_probas = predicted
#     fpr, tpr, thresholds = roc_curve(y_true, y_probas, pos_label=0)

# #     print("$$$$$$$$$$$$$$$$$$$$$$$$")
# #     print(fpr, tpr, thresholds)
# #     print("$$$$$$$$$$$$$$$$$$$$$$$$")
    
#     # Print ROC curve
#     plt.plot(fpr,tpr)
#     plt.show() 

#     # Print AUC
#     auc = np.trapz(tpr,fpr)
#     print('AUC:', auc)

On Train data
hamming_loss:  0.021008136282735825
recall_score:  0.6974502866902526
precision_score:  0.9674742175553301
f1_score:  0.7951027800246083
roc_auc_score:  0.8451667439020218


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


ValueError: multilabel-indicator is not supported

In [10]:
def generate_tag_frequency(Y):
    
    freq = [0]*Y.shape[1]
    
    for col in range(Y.shape[1]):
        
        for row in list(Y[:,col]):
            if row == 1:
                
                freq[col] += 1
        
    return np.array(freq) 

In [12]:
def handle_class_imbalance(y_predicted,tag_freq):
    
    for row_index in range(y_predicted.shape[0]):
        
        for col_index in range(y_predicted.shape[1]):
            
            y_predicted[row_index,col_index] /= tag_freq[col_index]
    
    return y_predicted

In [13]:
X_train,X_test,y_train,y_test = data_preprocessing()

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_train)

tag_freq = generate_tag_frequency(Y)

n_most_common_words = 8000
max_len = 500

tokenizer = Tokenizer(num_words=n_most_common_words, filters=';', lower=False)
tokenizer.fit_on_texts(X_train.values)
sequences = tokenizer.texts_to_sequences(X_train.values)
# print(sequences)
word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len)

# print(X)
# print(X.shape)
# print(Y.shape)

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

epochs = 2
emb_dim = 250
batch_size = 100

print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
# model.add(SpatialDropout1D(0.7))
model.add(LSTM(128, dropout=0.1, recurrent_dropout=0.3))
# model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(36, activation='sigmoid'))


sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(loss=binary_crossentropy, metrics=['binary_accuracy'],optimizer=sgd)

print(model.summary())
callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.01)]
history = model.fit(X_train, y_train[:,0], epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=callbacks)


y_predicted = model.predict(X_test)

accr = model.evaluate(X_test,y_test[:,0])
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

print(y_predicted)

print('Test accuracy is {}'.format(accuracy_score(y_test[:,0], y_predicted)))
print('Test recall_score is {}'.format(recall_score(y_test[:,0], y_predicted)))
print('Test precision_score is {}'.format(precision_score(y_test[:,0], y_predicted)))
print('Test f1_score is {}'.format(f1_score(y_test[:,0], y_predicted)))


# print(y_predicted)

# y_predicted = handle_class_imbalance(y_predicted,tag_freq)

# print(y_predicted)

# threshold = 0.03

# m = []
# for row in y_predicted:
# #     m.append(softmax(row))
#     m.append(row)
    
# final_prediction = []

# print(m)

# for row in m:
#     temp = []
#     val = np.sort(row)[-3]
#     for item in row:
#         if item < val:
#             temp.append(0)
#         else:
#             temp.append(1)
#     final_prediction.append(temp)
    
# print(final_prediction)    
# print(y_test.shape)
# print(np.array(final_prediction).shape)
# print()
# print("hamming_loss: ",hamming_loss(np.array(final_prediction),y_test))
# print()

# all_labels = mlb.inverse_transform(np.array(final_prediction))
# y_labels = mlb.inverse_transform(y_test)

# for item, labels in zip(y_labels, all_labels):
#         print('{0} => {1}'.format(item, ', '.join(labels)))

# acc = history.history['binary_accuracy']
# val_acc = history.history['val_acc']
# loss = history.history['loss']
# val_loss = history.history['val_loss']

# epochs = range(1, len(acc) + 1)

# plt.plot(epochs, acc, 'bo', label='Training acc')
# plt.plot(epochs, val_acc, 'b', label='Validation acc')
# plt.title('Training and validation accuracy')
# plt.legend()

# plt.figure()

# plt.plot(epochs, loss, 'bo', label='Training loss')
# plt.plot(epochs, val_loss, 'b', label='Validation loss')
# plt.title('Training and validation loss')
# plt.legend()

# plt.show()



FileNotFoundError: [Errno 2] File b'codeforces_question_v4.csv' does not exist: b'codeforces_question_v4.csv'

In [None]:
l = np.array([[1,2,3,4],[2,4,6,7],[45,67,99,2]])
print(l)
m = []
for row in l:
    print(row)
    m.append(softmax(row))
    
m = np.array(m)
print(m)

k = [3,2,4,5,-1,-5,6]

np.sort(k)[-2]

In [11]:
def comments():
    
    #     global tags_list
    #     print(set(distinct_tags).difference(set(tags_list)))


    #     print(df["tags"])  
    #     print(df[df['difficulty'] == ''])

    #     np.where(df.applymap(lambda x: x == ''))

    #     nan_rows = df[df['difficulty'].isna()]
    #     print(nan_rows)
    #     print(df["difficulty"].describe())
    #     print(df["solution"].describe())

    #     print(df["time_taken"])


    #     one_hot = pd.get_dummies(df['tags'])
    #     # Drop column B as it is now encoded
    #     df = df.drop('tags',axis = 1)
    #     # Join the encoded df
    #     df = df.join(one_hot)
    #     print(df)


    #     print(distinct_tags)
    #     print(df.describe())
    #     print(df["problem statement"])
    #     print(df["tags"])

    #     cloud = wordcloud.WordCloud(background_color='black', max_font_size=60, relative_scaling=.5).generate(' '.join(df["solution"]))
    #     plt.figure(figsize=(20,10))
    #     plt.axis('off')
    #     plt.imshow(cloud);
    
    
    #############################################################################################
    
    #OneVsRestClassifier(MultinomialNB())
    #OneVsRestClassifier(LinearSVC())
    #OneVsRestClassifier(LogisticRegression(solver='sag'))

    # classifier.fit(X_train, Y)
    # predicted = classifier.predict(X_test)
    # all_labels = mlb.inverse_transform(predicted)
    # predicted = copy.deepcopy(all_labels)
    # print(list(map(list, all_labels)))
    # all_labels = list(map(list, all_labels))
    # print(all_labels)
    # all_labels = all_labels.append(distinct_tags)
    # target_names.append(distinct_tags)

    # print(target_names)
    # print(all_labels)
    
    # print("Accuracy: ",accuracy_score(mlb.fit_transform(target_names),mlb.fit_transform(all_labels)))
    # print()
    # for item, labels in zip(target_names, predicted):
    #     print('{0} => {1}'.format(item, ', '.join(labels)))
    
    pass