In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc

Using TensorFlow backend.


In [2]:
data=pd.read_csv("hm_train.csv", header=0)
data.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [3]:
data.index

RangeIndex(start=0, stop=60321, step=1)

In [4]:
data=data[['cleaned_hm','predicted_category']]

In [7]:
def process_text(text):
    sent = text.lower()
    word = word_tokenize(sent)
    clwrds = [w for w in word if not w in stop]
    lmtzr = WordNetLemmatizer()
    lmtzrs = [lmtzr.lemmatize(i) for i in clwrds]
    return(lmtzrs)

In [8]:
data['review_new']=data['cleaned_hm'].apply(process_text)

In [9]:
data['review_new'].head()

0    [went, successful, date, someone, felt, sympat...
1          [happy, son, got, 90, %, mark, examination]
2                        [went, gym, morning, yoga, .]
3    [serious, talk, friend, flaky, lately, ., unde...
4    [went, grandchild, butterfly, display, crohn, ...
Name: review_new, dtype: object

In [10]:
data['review_clean'] = [' '.join(map(str, l)) for l in data['review_new']]

In [11]:
data['Review_processed'] = data['review_clean'].str.replace('[^\w\s]','')

In [12]:
review_drop_text=["review_clean","review_new"]

In [13]:
data.drop(review_drop_text,axis=1,inplace=True)

In [14]:
data.head()

Unnamed: 0,cleaned_hm,predicted_category,Review_processed
0,I went on a successful date with someone I fel...,affection,went successful date someone felt sympathy con...
1,I was happy when my son got 90% marks in his e...,affection,happy son got 90 mark examination
2,I went to the gym this morning and did yoga.,exercise,went gym morning yoga
3,We had a serious talk with some friends of our...,bonding,serious talk friend flaky lately understood g...
4,I went with grandchildren to butterfly display...,affection,went grandchild butterfly display crohn conser...


In [15]:
X = data['Review_processed']
y = data['predicted_category']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.4,random_state=200)

In [16]:
vectorizer=TfidfVectorizer(max_features=200)
train_vectors=vectorizer.fit_transform(X_train)
validation_vectors=vectorizer.transform(X_test)
print(train_vectors.shape,validation_vectors.shape)


(36192, 200) (24129, 200)


In [17]:
rf=RandomForestClassifier()

In [18]:
rf.fit(train_vectors, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
pred=rf.predict(train_vectors)

In [67]:
pred_val=rf.predict(validation_vectors)

In [22]:
confusion_matrix(y_test,pred_val)

array([[6969,  477,   49,  356,   19,  215,   36],
       [ 774, 7183,   85,  184,    4,  131,   26],
       [ 220,  169, 2176,   44,    0,   32,    2],
       [1008,  228,   15, 1138,    6,  157,   23],
       [ 178,   24,    4,   25,   11,   41,    8],
       [ 410,  103,    9,  145,    8,  961,   14],
       [ 139,   54,    4,   36,    9,   28,  192]], dtype=int64)

In [70]:
f1_score(y_test, pred_val,average='weighted')

0.7644672583922377