In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk

In [37]:
import pandas as pd
text=pd.read_csv('mood_data.txt', sep=';', header=None, names=['Text','Emotion'])
text

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [38]:
df1 = pd.DataFrame(text)
df1

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [6]:
import string,re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def clean_text(df1):
    df1 = word_tokenize(df1) # Create tokens
    df1= " ".join(df1) # Join tokens
    df1 = [char for char in df1 if char not in string.punctuation] # Remove punctuations
    df1 = ''.join(df1) # Join the leters
    df1 = [word for word in df1.split() if word.lower() not in stopwords.words('english')] # Remove common english words (I, you, we,...)
    return " ".join(df1)

In [7]:
df1['cleaned_text']=df1['Text'].apply(clean_text)

df1['cleaned_text'].head()

0                                didnt feel humiliated
1    go feeling hopeless damned hopeful around some...
2            im grabbing minute post feel greedy wrong
3    ever feeling nostalgic fireplace know still pr...
4                                      feeling grouchy
Name: cleaned_text, dtype: object

In [8]:
features = df1['cleaned_text']
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))
    
    # Remove single characters appearing in the text except the start
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    
    # Remove single characters appearing at the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    
    # Substitute multiple spaces with a single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    
    # Remove prefix 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    
    # Convert to lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [9]:
processed_features[:5]

['didnt feel humiliated',
 'go feeling hopeless damned hopeful around someone cares awake',
 'im grabbing minute post feel greedy wrong',
 'ever feeling nostalgic fireplace know still property',
 'feeling grouchy']

In [10]:
df1['processed_text'] = processed_features
df1

Unnamed: 0,Text,Emotion,cleaned_text,processed_text
0,i didnt feel humiliated,sadness,didnt feel humiliated,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,feeling grouchy,feeling grouchy
...,...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,brief time beanbag said anna feel like beaten,brief time beanbag said anna feel like beaten
15996,i am now turning and i feel pathetic that i am...,sadness,turning feel pathetic still waiting tables sub...,turning feel pathetic still waiting tables sub...
15997,i feel strong and good overall,joy,feel strong good overall,feel strong good overall
15998,i feel like this was such a rude comment and i...,anger,feel like rude comment im glad,feel like rude comment im glad


In [11]:
df1.head()

Unnamed: 0,Text,Emotion,cleaned_text,processed_text
0,i didnt feel humiliated,sadness,didnt feel humiliated,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,go feeling hopeless damned hopeful around some...,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,ever feeling nostalgic fireplace know still pr...,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,anger,feeling grouchy,feeling grouchy


In [12]:
df2 = df1.drop(['Text', 'cleaned_text'], axis=1)
df2

Unnamed: 0,Emotion,processed_text
0,sadness,didnt feel humiliated
1,sadness,go feeling hopeless damned hopeful around some...
2,anger,im grabbing minute post feel greedy wrong
3,love,ever feeling nostalgic fireplace know still pr...
4,anger,feeling grouchy
...,...,...
15995,sadness,brief time beanbag said anna feel like beaten
15996,sadness,turning feel pathetic still waiting tables sub...
15997,joy,feel strong good overall
15998,anger,feel like rude comment im glad


In [39]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [40]:
def tokenize(text): 
    tk = TweetTokenizer()
    return tk.tokenize(text)

vectorizer = CountVectorizer(analyzer = 'word',tokenizer = tokenize,lowercase = True,ngram_range=(1, 1))

In [41]:
count = vectorizer.fit_transform(df2['processed_text'])

In [42]:
count.shape

(16000, 15060)

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [44]:
X = df2['processed_text'].values
y = df2['Emotion'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=100, test_size=0.3)

In [45]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)
X_train_idf.shape

(11200, 1000)

In [46]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(),columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'],ascending = False).head()



Unnamed: 0,idf_weights
blah,7.758809
chest,7.684701
film,7.55117
dad,7.55117
voice,7.490545


In [61]:
m=MultinomialNB()
m.fit(X_train_idf,y_train)

In [62]:
pred_mnb = m.predict(X_test_idf)

# Calculate accuracy of predicted values
acc = accuracy_score(y_test, pred_mnb)


results = pd.DataFrame([['Multinomial Naive Bayes', acc]],
               columns = ['Model', 'Accuracy'])

print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes   0.78875


In [63]:
# Perform Random Forest classification on the processed data and compare the accuracy score of both these models

# Random Forest Classifier with 'gini'

from sklearn.ensemble import RandomForestClassifier
clf_gini = RandomForestClassifier()
clf_gini.fit(X_train_idf, y_train)

# Predict using testing data
y_pred_gini = clf_gini.predict(X_test_idf)

# Calculate accuracy
acc1 = accuracy_score(y_test, y_pred_gini)

model_results = pd.DataFrame([['Random Forest(Gini)', acc1]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.788750
1      Random Forest(Gini)  0.850208


  results = results.append(model_results, ignore_index = True)


In [64]:
from sklearn.ensemble import RandomForestClassifier
clf_ent = RandomForestClassifier(criterion='entropy')
clf_ent.fit(X_train_idf, y_train)

# Predict using testing data
y_pred_ent = clf_ent.predict(X_test_idf)

# Calculate accuracy
acc2 = accuracy_score(y_test, y_pred_ent)

model_results = pd.DataFrame([['Random Forest(Entropy)', acc2]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.788750
1      Random Forest(Gini)  0.850208
2   Random Forest(Entropy)  0.848750


  results = results.append(model_results, ignore_index = True)


In [65]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
clf=SVC()
clf.fit(X_train_idf, y_train)
y_pred = clf.predict(X_test_idf)
acc3 = accuracy_score(y_test, y_pred)
model_results = pd.DataFrame([['SVC by SVM', acc3]],
               columns = ['Model', 'Accuracy'])
results = results.append(model_results, ignore_index = True)
results

  results = results.append(model_results, ignore_index = True)


Unnamed: 0,Model,Accuracy
0,Multinomial Naive Bayes,0.78875
1,Random Forest(Gini),0.850208
2,Random Forest(Entropy),0.84875
3,SVC by SVM,0.839583
