In [21]:
import pandas as pd
import warnings
import numpy as np
import re
import emoji

# Package for preprocssing
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.utils.class_weight import compute_class_weight

# Package for train test split
from sklearn.model_selection import train_test_split

# Model package for classification
from sklearn.ensemble import RandomForestClassifier

warnings.simplefilter("ignore")

In [5]:
data = pd.read_csv('./Data/train.csv')

In [6]:
# Lower the text
def lowerText(aText):
    return aText.lower()

data['discourse_text'] = data['discourse_text'].apply(lowerText)

# Remove puntuation and numerical values
def removePunctuation(aText):
    return re.sub(r'[^\w\s]', '', aText)

data['discourse_text'] = data['discourse_text'].apply(removePunctuation)

# Remove stopwords
stpWrds = set(stopwords.words('english'))

def removeStpWrds(aText):
    return ' '.join([words for words in aText.split(' ') if words not in stpWrds])

data['discourse_text'] = data['discourse_text'].apply(removeStpWrds)

# Stemming the words
stemmer = PorterStemmer()

def stemWords(aText):
    return ' '.join([stemmer.stem(words) for words in aText.split(' ')])

data['discourse_text'] = data['discourse_text'].apply(stemWords)

# Removing all the emoji if present
def remove_emoji(aText):
    return emoji.get_emoji_regexp().sub(u'', aText)

data['discourse_text'] = data['discourse_text'].apply(remove_emoji)

In [7]:
# Subsettig the dataset
df = data[['discourse_text','discourse_type','discourse_effectiveness']]

### Preprocessing the target

In [8]:
# Encoding the categorical variable
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder() # Encoder for type of discourse_type

fe = LabelEncoder() # Encoder for type of discourse_effectiveness

df['discourse_type'] = le.fit_transform(df['discourse_type'])

df['discourse_effectiveness'] = fe.fit_transform(df['discourse_effectiveness'])

In [9]:
# Keeping the encoding values in a dictionary format
type_mapping = dict(zip(range(len(le.classes_)), le.classes_))
class_mapping = dict(zip(range(len(fe.classes_)), fe.classes_))

In [10]:
# Normalising the weights since we have different weights for three classes
class_weight = compute_class_weight('balanced',classes = np.unique(df['discourse_effectiveness']), y = df['discourse_effectiveness'])

class_weight # Pass this weight to model classification

array([0.58421128, 1.3140682 , 1.89647168])

## Tokenizer

In [11]:
from keras.preprocessing.text import Tokenizer
t = Tokenizer(num_words=1000)

In [12]:
t.fit_on_texts(df.discourse_text)
feature = t.texts_to_matrix(df.discourse_text, mode='tfidf')

In [17]:
y = np.asarray(df.discourse_effectiveness)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(feature,y, test_size=0.3)

In [22]:
rf = RandomForestClassifier(class_weight='balanced',n_estimators=400,bootstrap= True,max_depth= 30,max_features='sqrt',min_samples_leaf=1,min_samples_split=5)

rf.fit(X_train,y_train)

In [23]:
# Prediction
rf_prediction = rf.predict(X_test)

In [24]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

print(f'The confusion matrix for Random Forest before hyper parameter tunning is \n{confusion_matrix(y_test,rf_prediction)}')

print(f'\n Classification report for Random Forest before hyper parameter tunning is:\n{classification_report(y_test,rf_prediction)}')

The confusion matrix for Random Forest before hyper parameter tunning is 
[[3552 1026 1653]
 [ 985 1632  177]
 [ 860  113 1032]]

 Classification report for Random Forest before hyper parameter tunning is:
              precision    recall  f1-score   support

           0       0.66      0.57      0.61      6231
           1       0.59      0.58      0.59      2794
           2       0.36      0.51      0.42      2005

    accuracy                           0.56     11030
   macro avg       0.54      0.56      0.54     11030
weighted avg       0.59      0.56      0.57     11030



In [25]:
test = pd.read_csv('./Data/test.csv')

In [26]:
test_data = test['discourse_text']
test_data = test_data.apply(lowerText)
test_data = test_data.apply(removePunctuation)
test_data = test_data.apply(removeStpWrds)
test_data = test_data.apply(stemWords)
test_data = test_data.apply(remove_emoji)

In [28]:
feat = t.texts_to_matrix(test_data, mode='tfidf')

In [32]:
rf.predict_proba(feat)

array([[0.34216201, 0.57879827, 0.07903972],
       [0.45271629, 0.41772717, 0.12955653],
       [0.39150505, 0.28157212, 0.32692282],
       [0.3796475 , 0.32499964, 0.29535285],
       [0.31973675, 0.42123411, 0.25902914],
       [0.30581966, 0.61462021, 0.07956014],
       [0.27796269, 0.63513985, 0.08689746],
       [0.32457466, 0.45505374, 0.2203716 ],
       [0.29732987, 0.56009527, 0.14257485],
       [0.24861406, 0.68819317, 0.06319277]])