In [1]:
import pandas as pd
df_train = pd.read_csv('offcampus_training.csv')

In [2]:
print('The number of rows ',df_train.shape[0])
print('The number of coloumns',df_train.shape[1])
print('The different coloumns are ',df_train.columns)

The number of rows  1399
The number of coloumns 3
The different coloumns are  Index(['id', 'category', 'text'], dtype='object')


In [3]:
y_train = df_train['category']
x_train = df_train.iloc[:,:]
x_train.drop(['category'],axis=1,inplace=True)

In [4]:
from sklearn.model_selection import train_test_split as tts
xtrain,xtest,ytrain,ytest = tts(x_train,y_train,test_size=0.3,random_state = 0)

In [5]:
print(xtrain.shape)
print(xtest.shape)

(979, 2)
(420, 2)


In [6]:
xtrain = xtrain[pd.notnull(xtrain['text'])]
xtest = xtest[pd.notnull(xtest['text'])]
ytrain = ytrain[pd.notnull(ytrain)]
ytest = ytest[pd.notnull(ytest)]

In [7]:
print('X training shape',xtrain.shape)
print('X testing shape',xtest.shape)
print('Y training ',ytrain.shape)
print('Y testing ',ytest.shape)

X training shape (842, 2)
X testing shape (359, 2)
Y training  (842,)
Y testing  (359,)


## Multinomial Naive Bayes

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
#text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB()) ])
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])


In [10]:
text_clf.fit(xtrain.text, ytrain)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
predicted = text_clf.predict(xtest.text)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == ytest)))

Accuracy achieved is 0.8690807799442897


## SVM Classifier

In [12]:
from sklearn.linear_model import SGDClassifier
text_clf_sgd = Pipeline([('vect', TfidfVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, 
                                                   random_state=42))])

#text_sgd = Pipeline([('vect', TfidfVectorizer()),('tfidf', TfidfTransformer()),
#                    ('clf', MultinomialNB())])

In [13]:
text_clf_sgd.fit(xtrain.text, ytrain)



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [14]:
predicted = text_clf_sgd.predict(xtest.text)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == ytest)))

Accuracy achieved is 0.9665738161559888


In [15]:
#print(predicted);
#print(predicted.shape)
#print(xtest.text.shape)
predicted = text_clf_sgd.predict(xtest.text)

In [16]:
df_predict = pd.DataFrame({'id': xtest.id, 'category': list(predicted)}, columns=['id', 'category'])
df_predict["category"]= df_predict["category"].astype(int)
df_predict["id"]= df_predict["id"].astype(int)

In [17]:
df_predict.to_csv("submission_sgd_mine.csv", encoding='utf-8',index=False)

## on Skillenza test dataset

In [18]:
df_final_test = pd.read_csv('offcampus_test.csv')
df_final_test.head()

Unnamed: 0,id,text
0,1,17862 1601 5262 6549 7839 11123 13217 33047 45...
1,2,17567 10153 9393 17574 9237 8269 12648 7839 27...
2,3,15427 4275 10279 31061 7813 17850 33597 3819 3...
3,4,34414 23445 21263 12329 29992 33667 11809 2828...
4,5,34457 8030 30584 34448 33553 27396 1194 33597 ...


In [19]:
predicted = text_clf_sgd.predict(df_final_test.text)
df_predict = pd.DataFrame({'id': df_final_test.id, 'category': list(predicted)}, columns=['id', 'category'])
df_predict["category"]= df_predict["category"].astype(int)
df_predict["id"]= df_predict["id"].astype(int)
df_predict.to_csv("submission_sgd_final.csv", encoding='utf-8',index=False)

## Random Forest Classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
text_clf_random = Pipeline([('vect', TfidfVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-random',RandomForestClassifier(max_depth=50, n_estimators=10, max_features=1) )])

In [21]:
text_clf_random.fit(xtrain.text, ytrain)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [22]:
predicted = text_clf_random.predict(xtest.text)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == ytest)))

Accuracy achieved is 0.5069637883008357


In [23]:
from sklearn.svm import SVC
text_clf_svm = Pipeline([('vect', TfidfVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm',SVC(C=50.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))])

In [24]:
text_clf_svm.fit(xtrain.text, ytrain)


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [25]:
predicted = text_clf_svm.predict(xtest.text)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == ytest)))

Accuracy achieved is 0.2590529247910863


## LSTM network

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

Using TensorFlow backend.


In [12]:
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(xtrain.text)
sequences = tok.texts_to_sequences(xtrain.text)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [19]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,5,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [20]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 5)            5000      
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                17920     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_3 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [21]:
model.fit(sequences_matrix,ytrain,batch_size=10,epochs=20,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Train on 673 samples, validate on 169 samples
Epoch 1/20
Epoch 2/20


<keras.callbacks.History at 0x7f78206063c8>