# Importing Libraries


import pandas as pd
import numpy as np
import nltk
import re
#importing stopwords is optional, in this case it decreased accuracy
#from nltk.corpus import stopwords
import itertools
import time
nltk.download('wordnet')


In [2]:
start_time = time.time()

# Load DataSet

In [3]:
data = pd.read_csv('text_emotion.csv')
data = data.iloc[:10,:]

#data

# Lemmatization Of DataSet

In [4]:
#stopset = set(stopwords.words('english'))

from nltk.stem.wordnet import WordNetLemmatizer 
lem = WordNetLemmatizer()

# Pre-Processing of Data

In [5]:

def cleaning(text):
    txt = str(text)
    txt = re.sub(r"http\S+", "", txt)
    if len(txt) == 0:
        return 'no text'
    else:
        txt = txt.split()
        index = 0
        for j in range(len(txt)):
            if txt[j][0] == '@':
                index = j
        txt = np.delete(txt, index)
        if len(txt) == 0:
            return 'no text'
        else:
            words = txt[0]
            for k in range(len(txt)-1):
                words+= " " + txt[k+1]
            txt = words
            txt = re.sub(r'[^\w]', ' ', txt)
            if len(txt) == 0:
                return 'no text'
            else:
                txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
                txt = txt.replace("'", "")
                txt = nltk.tokenize.word_tokenize(txt)
                #data.content[i] = [w for w in data.content[i] if not w in stopset]
                for j in range(len(txt)):
                    txt[j] = lem.lemmatize(txt[j], "v")
                if len(txt) == 0:
                    return 'no text'
                else:
                    return txt

# Construction Of Words Dictionary

In [6]:
data['content'] = data['content'].map(lambda x: cleaning(x))
#data
data = data.reset_index(drop=True)
for i in range(len(data)):
    words = data.content[i][0]
    for j in range(len(data.content[i])-1):
        words+= ' ' + data.content[i][j+1]
    data.content[i] = words

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Training of Data

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data.content, data.sentiment, test_size=0.25, random_state=0)

x_train = x_train.reset_index(drop = True)
x_test = x_test.reset_index(drop = True)

y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

vectorizer = TfidfVectorizer(min_df=3, max_df=0.9)

train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

model = svm.SVC(kernel='linear') 
model.fit(train_vectors, y_train) 
predicted_sentiment = model.predict(test_vectors)

print(classification_report(y_test, predicted_sentiment))
predicted_sentiments = []
for s in range(len(predicted_sentiment)):
    predicted_sentiments.append(predicted_sentiment[s])
    
prediction_df = pd.DataFrame({'Content':x_test, 'Emotion_predicted':predicted_sentiment, 'Emotion_actual': y_test})
prediction_df.to_csv('emotion_recognizer_svm.csv', index = False)

elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         4
     boredom       0.00      0.00      0.00         8
       empty       0.00      0.00      0.00        18
  enthusiasm       0.00      0.00      0.00        16
         fun       1.00      0.05      0.10        20
   happiness       0.00      0.00      0.00        55
        hate       0.27      0.08      0.13        71
        love       0.00      0.00      0.00        35
     neutral       0.27      0.26      0.27       255
      relief       0.00      0.00      0.00        27
     sadness       0.34      0.30      0.32       289
    surprise       0.00      0.00      0.00        65
       worry       0.34      0.63      0.45       387

   micro avg       0.32      0.32      0.32      1250
   macro avg       0.17      0.10      0.10      1250
weighted avg       0.27      0.32      0.27      1250

processing time: 4029.7645978927612 seconds


  'precision', 'predicted', average, warn_for)


# Accuracy 

In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(predicted_sentiment, y_test)
print(cm)

print(accuracy_score(y_test, predicted_sentiment))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   1   0   0   0   6   0   3   0   6   1   5]
 [  0   0   0   0   0   1   0   0   0   0   0   1   1]
 [  1   4   7   3   4  17  12   3  67   6  46  15  64]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   1   4   0   4  13  14  10  40   4  87  11  71]
 [  0   0   0   0   0   0   0   0   0   0   0   0   1]
 [  3   3   6  13  11  24  39  22 145  17 149  37 245]]
0.3248


# Test File

In [9]:
prediction_df

Unnamed: 0,Content,Emotion_predicted,Emotion_actual
0,Ready To Go School,worry,neutral
1,dad s not feel well I want to make him soup Su...,worry,worry
2,cause my eyelids be not deep set thank it woul...,worry,worry
3,she s in LA want sun today but apparently LA i...,neutral,neutral
4,don t think i can take a needle watch a horrib...,neutral,worry
5,your boyfriend didn t even say bye,worry,sadness
6,you didnt send me the text,sadness,worry
7,bore at school and my throat it swell,worry,worry
8,leonardo You be a great mini fiddler crab,worry,fun
9,get finish watch marley and me and I cry like ...,sadness,sadness
