In [1]:
import pandas as pd

In [8]:
df = pd.read_csv("train_file.csv")

In [9]:
df=df.drop(["UsageClass", "CheckoutType", "CheckoutYear", "CheckoutMonth"], axis=1)

In [10]:
df.head()

Unnamed: 0,ID,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,MaterialType
0,1,1,Tidal wave,,"Tsunamis, Tsunamis Juvenile literature",,,BOOK
1,2,1,London holiday / Richard Peck.,"Peck, Richard, 1934-",,"Viking,",1998.,BOOK
2,3,3,Cinco de Mayo : celebrating Hispanic pride / C...,"Gnojewski, Carol",Cinco de Mayo Mexican holiday History Juvenile...,"Enslow Publishers,",c2002.,BOOK
3,4,1,Annapolis,,"War stories, Historical fiction, Domestic fict...",,,BOOK
4,5,1,As a man thinketh,,Thought and thinking,,,BOOK


In [20]:
print(df["Creator"].isna().sum())
print(df["Subjects"].isna().sum())
print(df["Publisher"].isna().sum())
print(df["PublicationYear"].isna().sum())
print(df["Title"].isna().sum())

23137
1763
21916
21931
0


In [21]:
# NaN values are more than 70%
df = df.drop(["Creator", "Publisher", "PublicationYear"], axis=1)

In [22]:
df.head()

Unnamed: 0,ID,Checkouts,Title,Subjects,MaterialType
0,1,1,Tidal wave,"Tsunamis, Tsunamis Juvenile literature",BOOK
1,2,1,London holiday / Richard Peck.,,BOOK
2,3,3,Cinco de Mayo : celebrating Hispanic pride / C...,Cinco de Mayo Mexican holiday History Juvenile...,BOOK
3,4,1,Annapolis,"War stories, Historical fiction, Domestic fict...",BOOK
4,5,1,As a man thinketh,Thought and thinking,BOOK


In [33]:
import numpy as np
nan_id =[]
for i in range(len(df)):
    if str(df["Subjects"][i]) == "nan":
        nan_id.append(i)

In [246]:
df_nlp = df["Title"]

In [247]:
# Text processing

from nltk.corpus import stopwords
sw = stopwords.words("english")

import re
from nltk.stem.snowball import SnowballStemmer
import string

def clean_text(text):

    ## Remove puncuation
    text = text.translate(string.punctuation)

    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    text = [w for w in text if not w in sw and len(w) >= 3]

    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    txt = text.split()
    text = [i for i in txt if i.isalpha()]
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)

    return text


In [248]:
df_nlp = df_nlp.apply(clean_text)


from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(min_df=3, max_features=1000,
strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)

tfv.fit(list(df_nlp))
desc = tfv.transform(df_nlp)

# truncate the matrix/ array to 30 column
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50)
svd.fit(desc)
desc = svd.transform(desc)

desc = pd.DataFrame(desc, columns=['nlp_{}'.format(i) for i in range(50)])

final_df = pd.concat([df, desc],axis=1, sort=False)

In [249]:
X = final_df.iloc[:, 5:]

In [250]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

final_df["MaterialType"] = le.fit_transform(final_df["MaterialType"])

In [260]:
Y = final_df["MaterialType"]

In [261]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [265]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=0)

rf.fit(X_train, Y_train)

Y_pred = rf.predict(X_test)


Y_pred = Y_pred.tolist()

In [266]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
from sklearn.metrics import accuracy_score
acc_3 = accuracy_score(Y_test, Y_pred)

In [267]:
acc_3

0.7148091988880465

#### Test Data

In [271]:
test_data = pd.read_csv("test_file.csv")

test_data=test_data.drop(["UsageClass", "CheckoutType", "CheckoutYear", 
                          "CheckoutMonth","Creator", "Publisher", "PublicationYear"], axis=1)

test_data["Title"] = test_data["Title"].apply(clean_text)

desc_test = tfv.transform(test_data["Title"])
desc_test = svd.transform(desc_test)
desc_test = pd.DataFrame(desc_test, columns=['nlp_{}'.format(i) for i in range(50)])

In [243]:
test_Y_pred = clf.predict(desc_test)

test_Y_pred = test_Y_pred.tolist()
final_Y = le.inverse_transform(test_Y_pred)
final = final_Y.tolist()

In [244]:
submit = pd.DataFrame({"MaterialType":final})
submit.insert(loc=0, column='ID', value=test_data["ID"])
#submit.insert(loc=1, column='MaterialType', value=test_data["Essayset"])

#submit.to_csv('submission_2.csv',index=False)

In [245]:
submit.to_csv('submission_LR.csv',index=False)

#### Decision Tree

In [252]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0).fit(X_train, Y_train)

Y_pred = clf.predict(X_test)

acc_DT = accuracy_score(Y_test, Y_pred)

In [253]:
acc_DT

0.65390447308567101

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, Y_train)
Y_pred = clf.predict(X_test)


from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
from sklearn.metrics import accuracy_score
acc_3 = accuracy_score(Y_test, Y_pred)

In [255]:
acc_3

0.74323982815264089

In [57]:
#XGB

In [None]:
from xgboost import XGBClassifier

lr = [0.01,0.1,0.4]
estimator = [100, 300, 500, 700, 100]
for i in lr:
    for j in estimator:
        xgb = XGBClassifier(learning_rate=i,n_estimators=j, max_depth=3,booster='dart')
        xgb.fit(X_train, Y_train)
        Y_pred_xgb = xgb.predict(X_test)
        Y_pred_xgb = Y_pred_xgb.tolist()
        acc_1 = accuracy_score(Y_test, Y_pred_xgb)
        print ("lr and estimator", i, j, acc_1)

In [236]:
#from fine tuning we get to know, learning rate = 0.1 and n_estimator =300 gives max accuracy

In [268]:
xgb = XGBClassifier(learning_rate=0.1,n_estimators=300, max_depth=3,booster='dart')
xgb.fit(X_train, Y_train)
Y_pred_xgb = xgb.predict(X_test)
Y_pred_xgb = Y_pred_xgb.tolist()
acc_1 = accuracy_score(Y_test, Y_pred_xgb)

#Y_pred_xgb = model.predict(X_test)
Y_pred_xgb = xgb.predict(X_test)
Y_pred_xgb = Y_pred_xgb.tolist()

acc_1 = accuracy_score(Y_test, Y_pred_xgb)

In [269]:
acc_1

0.75991913065453631

In [195]:
acc

0.75650745514278495

In [272]:
#test_Y_xgb = model.predict(desc_test)
test_Y_xgb = xgb.predict(desc_test)
#test_Y_cat =  test_Y_xgb.tolist()
#test_Y_xgb = [int(test_Y_cat[i][0]) for i in range(len(test_Y_cat))]

In [273]:

final_Y = le.inverse_transform(test_Y_xgb)
final = final_Y.tolist()
submit = pd.DataFrame({"MaterialType":final})
submit.insert(loc=0, column='ID', value=test_data["ID"])

In [275]:
submit.to_csv('submission_final.csv',index=False)

#### LSTM approach

In [66]:
df["Title"].head()

0                                           Tidal wave
1                       London holiday / Richard Peck.
2    Cinco de Mayo : celebrating Hispanic pride / C...
3                                            Annapolis
4                                    As a man thinketh
Name: Title, dtype: object

In [92]:
df_2 = df["Title"].copy()

In [93]:
df_2 = df_2.apply(clean_text)

In [128]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 150
# This is fixed.
EMBEDDING_DIM = 100

In [129]:
from keras.preprocessing.text import Tokenizer

In [130]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_2.values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 25186 unique tokens.


In [131]:
from keras.preprocessing.sequence import pad_sequences

In [132]:
X = tokenizer.texts_to_sequences(df_2.values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (31653, 150)


In [133]:
Y =df["MaterialType"].copy()

In [134]:
Y = pd.get_dummies(df['MaterialType']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (31653, 8)


In [135]:
X_train_L, X_test_L, Y_train_L, Y_test_L = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train_L.shape,Y_train_L.shape)
print(X_test_L.shape,Y_test_L.shape)

(28487, 150) (28487, 8)
(3166, 150) (3166, 8)


In [136]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
#model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(8, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

In [137]:
history = model.fit(X_train_L, Y_train_L, epochs=epochs, batch_size=batch_size,validation_split=0.1)


Train on 25638 samples, validate on 2849 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [153]:
df["MaterialType"].unique()

array(['BOOK', 'SOUNDDISC', 'VIDEOCASS', 'VIDEODISC', 'SOUNDCASS', 'MUSIC',
       'MIXED', 'CR'], dtype=object)

In [152]:
labels[np.argmax(Y_train_L[7])]

'MUSIC'

In [107]:
accr = model.evaluate(X_test_L,Y_test_L)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.988
  Accuracy: 0.724


In [155]:
test_data = test_data["Title"].apply(clean_text)

In [156]:
tst = tokenizer.texts_to_sequences(test_data.values)
tst = pad_sequences(tst, maxlen=MAX_SEQUENCE_LENGTH)

In [157]:
pred = model.predict(tst)

In [158]:
len(tst)

21102

In [159]:
labels = list(df["MaterialType"].unique())

In [121]:
labels[np.argmax(pred[0])]

'MUSIC'

In [160]:
final = []
for i in range(len(pred)):
    final.append(labels[np.argmax(pred[i])])

In [161]:
test_data = pd.read_csv("test_file.csv")

In [162]:
submit = pd.DataFrame({"MaterialType":final})
submit.insert(loc=0, column='ID', value=test_data["ID"])

In [163]:
submit.to_csv('submission_L2.csv',index=False)

In [170]:
from catboost import CatBoostClassifier

In [213]:
model = CatBoostClassifier(
    iterations=200,
    learning_rate=1,
    loss_function='MultiClass',
    depth = 2
    # loss_function='CrossEntropy'
)
model.fit(
    X_train, Y_train,
    #cat_features=cat_features,
    eval_set=(X_test, Y_test),
    verbose=False
    
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())

Model is fitted: True
Model params:
{'depth': 2, 'iterations': 200, 'loss_function': 'MultiClass', 'learning_rate': 1}


In [214]:
Y_pred = model.predict(data=X_test)

In [215]:
Y_pred = Y_pred.tolist()

In [216]:
acc_3 = accuracy_score(Y_test, Y_pred)

In [217]:
acc_3

0.71973717462724285