In [0]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
!pip install emot

Collecting emot
  Downloading https://files.pythonhosted.org/packages/49/07/20001ade19873de611b7b66a4d5e5aabbf190d65abea337d5deeaa2bc3de/emot-2.1-py3-none-any.whl
Installing collected packages: emot
Successfully installed emot-2.1


In [0]:
import re
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

## Reading train and test files

In [0]:
df=pd.read_csv("/content/drive/My Drive/assign5_dataset/train.csv")
# print(df)
test=pd.read_csv("/content/drive/My Drive/assign5_dataset/test.csv")
# print(test)

In [0]:
labels=df['labels']
# print(labels)

In [0]:
print(df['text'])

0       @realDonaldTrump This is one of the worst time...
1       How about the crowd in Oval in today's #AUSvIN...
2       @skroskz @shossy2 @JoeBiden Biden &amp; his so...
3       #etsy shop: Benedict Donald so called presiden...
4       @realDonaldTrump Good build a wall around Arka...
                              ...                        
5261    @ICC should allow ms dhoni to keep glove. It i...
5262    Trump on avoiding movie pirating: 'of course y...
5263    I noticed recently Jamie Oliver's restaurants ...
5264    #TeamIndia geared up is okay. What's on the GL...
5265    Is this the same piece of paper McCarthy used ...
Name: text, Length: 5266, dtype: object


## For breaking hashtags in meaningful words(Using Wordninja)

In [0]:
!pip install wordninja

Collecting wordninja
[?25l  Downloading https://files.pythonhosted.org/packages/30/15/abe4af50f4be92b60c25e43c1c64d08453b51e46c32981d80b3aebec0260/wordninja-2.0.0.tar.gz (541kB)
[K     |▋                               | 10kB 19.2MB/s eta 0:00:01[K     |█▏                              | 20kB 6.5MB/s eta 0:00:01[K     |█▉                              | 30kB 7.5MB/s eta 0:00:01[K     |██▍                             | 40kB 7.8MB/s eta 0:00:01[K     |███                             | 51kB 7.2MB/s eta 0:00:01[K     |███▋                            | 61kB 8.1MB/s eta 0:00:01[K     |████▎                           | 71kB 8.0MB/s eta 0:00:01[K     |████▉                           | 81kB 8.6MB/s eta 0:00:01[K     |█████▌                          | 92kB 8.4MB/s eta 0:00:01[K     |██████                          | 102kB 8.9MB/s eta 0:00:01[K     |██████▋                         | 112kB 8.9MB/s eta 0:00:01[K     |███████▎                        | 122kB 8.9MB/s eta 0:00:01

In [0]:
import wordninja as wn

In [0]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Data Preprocessing

In [0]:
# Converting emojis to text
#https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing

def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return text
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def clean_text(text):
  text=text.lower()
    
  # text=re.sub(r'["|\'|?|,)|(|-]',r'',text)
  text=" ".join(filter(lambda x:x[0]!='@', text.split()))
  text=remove_punctuation(text)
  text=re.sub(r"http\S+", "",text)
  text=lemmatize_words(text)
  text=remove_numbers(text)
  text=stem_words(text)
  
  # text=convert_emojis(text)
  # # # breaking #tags in comments to meaningful words
  
  # text=wn.split(text)

  
  return text

round1= lambda x: clean_text(x)

In [0]:
tempdata=df.text.apply(round1)
clean_data=pd.DataFrame(tempdata)


In [0]:
print(clean_data)

                                                   text
0     thi is one of the worst time to be american be...
1     how about the crowd in oval in today ausvind h...
2     biden amp hi son hunter took advantag of their...
3     etsi shop benedict donald so call presid is a ...
4     good build a wall around arkansa fucktrump fuc...
...                                                 ...
5261  should allow m dhoni to keep glove it is attac...
5262  trump on avoid movi pirat of cours you have to...
5263  i notic recent jami oliv restaur closingi onli...
5264  teamindia gear up is okay what on the glove ar...
5265  is thi the same piec of paper mccarthi use to ...

[5266 rows x 1 columns]


# Different Methods of converting text to vector

## TF-IDF

In [0]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    # analyzer='word',
    # tokenizer=dummy_fun,
    # preprocessor=dummy_fun,
    stop_words='english')
    # token_pattern=None)  

data_cv1=tfidf.fit_transform(clean_data['text'])
dataset=pd.DataFrame(data_cv1.toarray(),columns=tfidf.get_feature_names())

In [0]:
print(dataset)

      aaaaarrrrrggggghhhhh  aadmi  aag  aaj  ...  𝙜𝙧𝙤𝙪𝙣𝙙  𝙨𝙖𝙡𝙪𝙩𝙚  𝙩𝙚𝙖𝙢   𝙩𝙤
0                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
2                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
3                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
4                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
...                    ...    ...  ...  ...  ...     ...     ...   ...  ...
5261                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
5262                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
5263                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
5264                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
5265                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0

[5266 rows x 11026 columns]


In [0]:
Finaltest=pd.read_csv("/content/drive/My Drive/assign5_dataset/f_test.csv")
print(Finaltest.columns)
print(Finaltest['Unnamed: 0'])

Index(['Unnamed: 0', 'text_id', 'text'], dtype='object')
0          0
1          1
2          2
3          3
4          4
        ... 
1148    1148
1149    1149
1150    1150
1151    1151
1152    1152
Name: Unnamed: 0, Length: 1153, dtype: int64


In [0]:
Finaltest=pd.read_csv("/content/drive/My Drive/assign5_dataset/f_test.csv")
clean_test_data=pd.DataFrame(Finaltest.text.apply(round1))
data_cv1=tfidf.transform(clean_test_data.text)
Finaltestdata=pd.DataFrame(data_cv1.toarray(),columns=tfidf.get_feature_names())

In [0]:
print(Finaltestdata)

      aaaaarrrrrggggghhhhh  aadmi  aag  aaj  ...  𝙜𝙧𝙤𝙪𝙣𝙙  𝙨𝙖𝙡𝙪𝙩𝙚  𝙩𝙚𝙖𝙢   𝙩𝙤
0                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
2                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
3                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
4                      0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
...                    ...    ...  ...  ...  ...     ...     ...   ...  ...
1148                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1149                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1150                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1151                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0
1152                   0.0    0.0  0.0  0.0  ...     0.0     0.0   0.0  0.0

[1153 rows x 11040 columns]


In [0]:
# converting testdata using TFIDF
clean_test_data=pd.DataFrame(test.text.apply(round1))
data_cv1=tfidf.transform(clean_test_data.text)
test_dataset=pd.DataFrame(data_cv1.toarray(),columns=tfidf.get_feature_names())

In [0]:
print(test_dataset)

     001   01   02   03   04   05  ...  𝙘𝙧𝙞𝙘𝙠𝙚𝙩  𝙛𝙧𝙤𝙢  𝙜𝙧𝙤𝙪𝙣𝙙  𝙨𝙖𝙡𝙪𝙩𝙚  𝙩𝙚𝙖𝙢   𝙩𝙤
0    0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
1    0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
2    0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
3    0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
4    0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
..   ...  ...  ...  ...  ...  ...  ...      ...   ...     ...     ...   ...  ...
581  0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
582  0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
583  0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
584  0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0
585  0.0  0.0  0.0  0.0  0.0  0.0  ...      0.0   0.0     0.0     0.0   0.0  0.0

[586 rows x 11486 columns]


## Word2vec Method

In [0]:
model=Word2Vec(textData,window=5,size=100,min_count=1)

In [0]:
vectorizer=TfidfVectorizer()
vectors=vectorizer.fit_transform(df['text'])
TF_dataset = vectors.todense()

### Average Word2vec

In [0]:
#avg word2vec
def convToVec(model,textData):
  review_vec=[]
  for line in textData:
    rvec=np.zeros(100)
    cnt=0
    for w in line:
      try:
        vec=model.wv[w]
        rvec+=vec
        cnt+=1
      except:
        pass
    rvec/=cnt
    review_vec.append(rvec)
  print(len(review_vec))
  print(review_vec[0])
  return review_vec
#tfidf word2vec


### TF-IDF Form of Word2vec

In [0]:
features=vectorizer.get_feature_names()
def convTotfVec(model,textData):
  review_vec_tf=[]
  row=0
  for line in textData:
    rvec_tf=np.zeros(100)
    weight_sum=0
    for w in line:
      try:
        vec=model.wv[w]
        tfidf=finaltfidf(row,features.index[w])
        rvec_tf+=(vec*tfidf)
        weight_sum+=tfidf
      except:
        pass
    rvec_tf/=weight_sum
    review_vec_tf.append(rvec_tf)
    row+=1
  print(len(review_vec_tf))
  print(review_vec_tf[0])
  return review_vec_tf
AVG_dataset=np.asarray(convToVec(model,textData))
TFW_dataset=np.asarray(convTotfVec(model,textData))

In [0]:
from sklearn.preprocessing import StandardScaler
dataset=StandardScaler().fit_transform(dataset)
AVG_dataset=np.nan_to_num(AVG_dataset)
TFW_dataset=np.nan_to_num(TFW_dataset)

In [0]:
labels=np.array(labels)

## Training Models using TFIDF Dataform

### Models

#### SVM

## Using GridSearchCV for fine tuning different parameters of SVM

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:
param_grid = {'C': [0.1], 'gamma': [1,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
# param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

## Splitting data

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset,labels, test_size=0.1)

In [0]:
from sklearn.svm import SVC

In [0]:
from sklearn import svm

In [0]:
print(X_train.shape)
print(test_dataset.shape)

## Applying GridSearchCV

In [0]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2,n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.6min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1], 'gamma': [1, 0.001],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [0]:
print(grid.best_estimator_)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)


In [0]:
Svmlabels = grid.predict(X_test)

## SVM Model

In [0]:
SVMmodel=svm.SVC(kernel='rbf')
SVMmodel.fit(X_train,y_train)
SVMlabels = SVMmodel.predict(X_test)

In [0]:
print(SVMlabels)

In [0]:
predlabels = SVMmodel.predict(test_dataset)

In [0]:
finalpredlabels = SVMmodel.predict(Finaltestdata)

In [0]:
print(SVMlabels)

[1 1 1 ... 1 1 1]


In [0]:
np.savetxt('submission.csv', predlabels, delimiter=' ', header='labels')

In [0]:
np.savetxt('submission.csv', finalpredlabels, delimiter=' ', header='labels')

In [0]:
def create_file(predictLabels,testdata):
  smiles=testdata['Unnamed: 0'] 
  list_of_tuples = list(zip(smiles, predictLabels))  
  sub = pd.DataFrame(list_of_tuples, columns = [' ', 'labels'])   
  print(sub)
  sub.to_csv('submission.csv')

In [0]:
create_file(finalpredlabels,Finaltest)

            labels
0        0       1
1        1       1
2        2       1
3        3       1
4        4       0
...    ...     ...
1148  1148       1
1149  1149       1
1150  1150       1
1151  1151       1
1152  1152       1

[1153 rows x 2 columns]


In [0]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,SVMlabels)

0.6698292220113852

In [0]:
from sklearn.metrics import f1_score
f1_score(y_test, SVMlabels)

0.7774936061381074

#### LOGISTIC REGRESSION

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# tunned_param=[{'C':[.000241,.000257,.0004,.0005,.0006,.007]}]
# LRmodel=GridSearchCV(LogisticRegression(max_iter=1000),tunned_param,scoring='accuracy')
LRmodel=LogisticRegression()
LRmodel.fit(X_train,y_train)
LRlabels=LRmodel.predict(X_test)
# print(LRlabels)
# print(LRmodel.best_estimator_)
print(LRmodel.score(X_test,y_test))

0.6907020872865275


In [0]:
from sklearn.metrics import f1_score
f1_score(y_test, LRlabels)

0.788036410923277

#### MLP

In [0]:
import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from tensorflow.keras import optimizers


Using TensorFlow backend.


In [0]:
ytrain = keras.utils.to_categorical(y_train,2)
ytest = keras.utils.to_categorical(y_test,2)

In [0]:
print(dataset.shape)

(5266, 11486)


In [0]:
model = Sequential()
model.add(Dense(units=5,activation="relu",input_shape=(11486,)))
# model.add(Dropout(0.5))
model.add(Dense(units=6,activation="relu"))
# model.add(Dropout(0.5))
model.add(Dense(units=2,activation="softmax"))

In [0]:
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
history=model.fit(X_train,ytrain,batch_size=64,epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [0]:
score=model.evaluate(X_test,ytest,verbose = 2)

17/17 - 0s - loss: 1.7666 - accuracy: 0.6186


In [0]:
print(score[1])

0.6185958385467529


In [0]:
y_pred = model.predict(X_test)
#Converting predictions to label
MLPlabels = list()
for i in range(len(y_pred)):
    MLPlabels.append(np.argmax(y_pred[i]))
#Converting one hot encoded test label to label
test = list()
for i in range(len(ytest)):
    test.append(np.argmax(ytest[i]))

In [0]:
print(MLPlabels)

[1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 

In [0]:
from sklearn.metrics import f1_score
a =f1_score(MLPlabels,test)
print('Accuracy is:', a*100)

Accuracy is: 68.26706676669167


#### LSTM

In [0]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import LSTM
from sklearn.model_selection import train_test_split

In [0]:
t = Tokenizer(nb_words=5500,lower=True,split=' ')
t.fit_on_texts(clean_data['text'].values)
# print(t.word_counts)
# print(t.document_count)
# print(t.word_index)
# print(t.word_docs)
X = t.texts_to_sequences(clean_data['text'])
X = pad_sequences(X)
print(X.shape)



(5266, 58)


In [0]:
print(X[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0   62   50
    1 1408    8 1409    8  148 1036  484  495  350 3040 1302   22  122
  571   28]


In [0]:
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, Dropout, Flatten

In [0]:
embed_dim = 128
batch_size = 32

LSTMmodel = Sequential()
LSTMmodel.add(Embedding(5500, embed_dim,input_length = X.shape[1]))
LSTMmodel.add(Dropout(0.2))
# LSTMmodel.add(LSTM(15,recurrent_dropout=0.2,input_shape=(1,10908)))
LSTMmodel.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
LSTMmodel.add(MaxPooling1D(pool_size=2))
LSTMmodel.add(LSTM(15, dropout=0.2, recurrent_dropout=0.2))
LSTMmodel.add(Dropout(0.5))
# LSTMmodel.add(Dense(40,activation='relu'))
LSTMmodel.add(Dense(64,activation='relu'))
LSTMmodel.add(Dropout(0.3))
LSTMmodel.add(Dense(32,activation='relu'))
LSTMmodel.add(Dense(16,activation='relu'))
LSTMmodel.add(Dropout(0.3))
LSTMmodel.add(Dense(1,activation='sigmoid'))

LSTMmodel.compile(loss = 'binary_crossentropy', optimizer='adadelta',metrics = ['accuracy'])
print(LSTMmodel.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 58, 128)           704000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 58, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 58, 32)            12320     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 29, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 15)                2880      
_________________________________________________________________
dropout_2 (Dropout)          (None, 15)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)               

In [0]:
X_train, X_test, Y_train, Y_valid = train_test_split(X,labels,test_size = 0.20)
# x_train=np.array(X_train)
# trainX= np.reshape(x_train,(x_train.shape[0], 1, x_train.shape[1]))
LSTMmodel.fit(X_train, Y_train, batch_size =batch_size, nb_epoch = 50)

  after removing the cwd from sys.path.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7fd978642400>

In [0]:
x_test=np.array(X_test)
testX= np.reshape(x_test,(x_test.shape[0], 1, x_test.shape[1]))
score=LSTMmodel.evaluate(X_test,Y_valid)



In [0]:
print(score[1])

0.6081593632698059


In [0]:
LSTMlabels=LSTMmodel.predict(testX)

## random forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
RFlabels=clf.predict(X_test)

In [0]:
RFlabels=list(RFlabels)
print(RFlabels)

[1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 

## combining all results using above trained models such as SVM, LR, MLP, LSTM, RF for Boosting method

In [0]:
print(type(LSTMlabels))
print(type(SVMlabels))
print(type(LRlabels))
print(type(MLPlabels))
def convert(predict_labels):
  predicted=[]
  for i in range(0,len(predict_labels)):
    predicted.append(predict_labels[i][0])
  return predicted
LSTMlabel=convert(LSTMlabels)



<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'list'>


In [0]:
# print(type(LSTMlabel))
SVMlabel=list(SVMlabels)
LRlabel=list(LRlabels)
print(type(LSTMlabel))
print(type(SVMlabel))
print(type(LRlabel))
print(type(MLPlabels))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


In [0]:
res_list = [] 
for i in range(0, len(MLPlabels)): 
    res_list.append(int((round((MLPlabels[i] + SVMlabel[i]+LRlabel[i]+LSTMlabel[i]+RFlabels[i])/5))))

In [0]:
print(len(res_list))
print(len(y_test))

1054
1054


## accuracy after applying Boosting method

In [0]:
from sklearn.metrics import accuracy_score,f1_score
a =accuracy_score(res_list,y_test)
b =f1_score(res_list,y_test)
print('score is:', a*100)
print('f1score is:', b*100)

score is: 64.51612903225806
f1score is: 74.9665327978581


In [0]:
SVMtestlabels=list(SVMmodel.predict(test_dataset))
LRtestlabels=list(LRmodel.predict(test_dataset))
MLPtestslabels=model.predict(test_dataset)
MLPtestlabels = list()
for i in range(len(MLPtestslabels)):
   MLPtestlabels.append(np.argmax(MLPtestslabels[i]))
x=np.array(test_dataset)
X= np.reshape(x,(x.shape[0], 1, x.shape[1]))
LSTMtestlabels=LSTMmodel.predict(X)
LSTMtestlabel=convert(LSTMtestlabels)

In [0]:
RFtestslabels=list(clf.predict(test_dataset))

In [0]:
print(RFtestslabels)
print(LSTMtestlabel)
print(SVMtestlabels)
print(LRtestlabels)
print(MLPtestlabels)

[1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 

In [0]:
sublist = [] 
for i in range(0, len(MLPtestlabels)): 
    sublist.append(int((round((MLPtestlabels[i] + SVMtestlabels[i]+LRtestlabels[i]+LSTMtestlabel[i]+RFtestslabels[i])/5))))

In [0]:
np.savetxt('submission.csv', sublist, delimiter=' ', header='labels')

In [0]:
from sklearn.metrics import f1_score
a =f1_score(predict,Y_valid)
print('score is:', a*100)