In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [45]:
# Packages import
import pandas as pd
import warnings
import numpy as np
import re
import emoji

# Packages for preprocessing
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.utils.class_weight import compute_class_weight
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder

# Package for train test split
from sklearn.model_selection import train_test_split

# Deep Learning Package Import
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and pandas copy warnings raised by later cells.
warnings.simplefilter("ignore")

In [2]:
# Load the training CSV into a DataFrame.
TRAIN_CSV_PATH = '../input/feedback-prize-effectiveness/train.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Lower the text
def lowerText(aText):
    """Return ``aText`` converted to lower case."""
    lowered = aText.lower()
    return lowered

# Lower-case every text in place (element-wise).
data['discourse_text'] = data['discourse_text'].map(lowerText)

# Remove punctuation (note: digits are kept, because \w matches them)
def removePunctuation(aText):
    """Delete every character that is neither a regex word character nor whitespace.

    Regex word characters (letters, digits and underscore) and whitespace are
    kept unchanged; all other characters are removed.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', aText)

# Strip punctuation from every text in place (element-wise).
data['discourse_text'] = data['discourse_text'].map(removePunctuation)

# Remove stopwords
# NLTK's English stop-word list contains contractions such as "don't", but
# punctuation has already been stripped from the text by the time stop words
# are removed, so those entries could never match ("don't" is now "dont").
# Keep the original entries and add their punctuation-free forms.
_rawStpWrds = stopwords.words('english')
stpWrds = set(_rawStpWrds) | {re.sub(r'[^\w\s]', '', word) for word in _rawStpWrds}

def removeStpWrds(aText, stop_words=None):
    """Return ``aText`` without stop words, remaining words joined by single spaces.

    Args:
        aText: Text to filter.
        stop_words: Optional container of words to drop. Defaults to the
            module-level ``stpWrds`` set.

    Returns:
        The kept words joined with a single space.
    """
    if stop_words is None:
        stop_words = stpWrds
    # split() with no argument splits on any whitespace run, so a stop word
    # next to a newline, tab or doubled space is still seen as its own token
    # (split(' ') would leave e.g. "cat\nthe" as one unmatched token).
    return ' '.join(word for word in aText.split() if word not in stop_words)

# Drop stop words from every text in place (element-wise).
data['discourse_text'] = data['discourse_text'].map(removeStpWrds)

# Stemming the words
# Single shared stemmer instance, used by stemWords below.
stemmer = PorterStemmer()

def stemWords(aText, stem_fn=None):
    """Stem every whitespace-separated word of ``aText``.

    Args:
        aText: Text whose words should be stemmed.
        stem_fn: Optional callable mapping one word to its stem. Defaults to
            the ``stem`` method of the module-level ``stemmer``.

    Returns:
        The stemmed words joined with a single space.
    """
    if stem_fn is None:
        stem_fn = stemmer.stem
    # split() with no argument splits on any whitespace run, so words separated
    # by newlines or tabs are stemmed individually and no empty tokens are
    # produced (split(' ') would pass "run\nrunning" to the stemmer as one word).
    return ' '.join(stem_fn(word) for word in aText.split())

# Stem the words of every text in place (element-wise).
data['discourse_text'] = data['discourse_text'].map(stemWords)

# Removing all the emoji if present
def remove_emoji(aText):
    """Return ``aText`` with every emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated and then removed in emoji 2.0,
    so ``emoji.replace_emoji`` is used when the installed version provides it.
    The regexp helper is kept only as a fallback for older releases.
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(aText, replace='')
    return emoji.get_emoji_regexp().sub(u'', aText)

# Remove emoji from every text in place (element-wise).
data['discourse_text'] = data['discourse_text'].map(remove_emoji)

In [4]:
# Subsetting the dataset
# Keep only the input text column and the target label column.
df = data.loc[:, ['discourse_text', 'discourse_effectiveness']]

In [5]:
# Turn the string labels into integer codes, then into one-hot rows.
Y = df['discourse_effectiveness']
encoder = LabelEncoder()
# fit + transform in a single call; encoder.classes_ holds the label order
encoded_Y = encoder.fit_transform(Y)
# one-hot encode the integer codes (one column per class)
dummy_y = np_utils.to_categorical(encoded_Y)

In [6]:
# Fit a tokenizer (vocabulary limited via num_words=2000) on the cleaned
# texts and encode each text as one TF-IDF row of the feature matrix.
t = Tokenizer(num_words=2000)
train_texts = df['discourse_text']
t.fit_on_texts(train_texts)
feature = t.texts_to_matrix(train_texts, mode='tfidf')

In [7]:
# Hold out 30% of the rows for validation. A fixed random_state makes the
# split (and therefore the reported validation scores) reproducible across
# runs; stratifying on the integer labels keeps the class proportions the
# same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    dummy_y,
    test_size=0.3,
    random_state=42,
    stratify=encoded_Y,
)

In [46]:
def baseline_model():
    """Build and compile the dense feed-forward classifier.

    The input width is read from the global ``X_train`` (its second
    dimension). The network is a stack of ReLU ``Dense`` layers
    (512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 8), each followed by 30% dropout,
    and ends in a 3-unit softmax layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimiser, categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential()
    model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))

    # Remaining hidden layers, each preceded by dropout.
    for units in (256, 128, 64, 32, 16, 8):
        model.add(Dropout(.3))
        model.add(Dense(units, activation='relu'))

    model.add(Dropout(.3))
    model.add(Dense(3, activation='softmax'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    opt = Adam(learning_rate=1e-5)

    # The last layer already applies softmax, so the loss must treat the
    # outputs as probabilities (from_logits=False). With from_logits=True the
    # loss would apply softmax a second time and distort the gradients.
    model.compile(
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
        optimizer=opt,
        metrics=['accuracy'],
    )

    return model

In [47]:
# Build the compiled network and print its layer-by-layer summary.
model = baseline_model()
model.summary()

In [40]:
# estimator = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=64, verbose=1)

In [48]:
# Train for 10 epochs in batches of 64; the held-out split is passed as
# validation data.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
)

In [51]:
# Load the test CSV into a DataFrame.
TEST_CSV_PATH = '../input/feedback-prize-effectiveness/test.csv'
test = pd.read_csv(TEST_CSV_PATH)

In [52]:
# Take an explicit copy: the column of this frame is overwritten by the
# preprocessing steps below, and assigning into a bare slice of `test`
# triggers pandas' SettingWithCopyWarning.
to_check = test[['discourse_text']].copy()

In [53]:
# Preprocessing the test file
# Apply the same cleaning steps, in the same order, as for the training text.
for step in (lowerText, removePunctuation, removeStpWrds, stemWords, remove_emoji):
    to_check['discourse_text'] = to_check['discourse_text'].apply(step)

In [54]:
# Preview only the first rows instead of dumping the whole frame.
to_check.head()

In [55]:
# Encode the cleaned test texts with the tokenizer fitted on the training
# texts, so the columns line up with the features the model was trained on.
test_texts = to_check['discourse_text']
test_feat = t.texts_to_matrix(test_texts, mode='tfidf')

In [56]:
# Predict on the test features; the model's last layer is a 3-unit softmax,
# so each row of `pred` holds three class scores.
pred = model.predict(test_feat, verbose = 1)

In [59]:
# Load the sample submission; its prediction columns are overwritten below.
SAMPLE_SUBMISSION_PATH = '../input/feedback-prize-effectiveness/sample_submission.csv'
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [61]:
# Column i of `pred` corresponds to encoder.classes_[i]. LabelEncoder sorts
# its classes, so the order is alphabetical (Adequate, Effective,
# Ineffective) and NOT the column order of the submission file. Map by class
# name instead of hard-coding indices, so each probability lands in the
# column of the class it was predicted for.
for class_idx, class_name in enumerate(encoder.classes_):
    sub[class_name] = pred[:, class_idx]

In [63]:
# Write the submission file without the index column, then print a
# confirmation message.
sub.to_csv("submission.csv", index=False)
print('Success')