In [1]:
import nltk
nltk.download('punkt')
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn import preprocessing

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


<p style = 'color:#00ff00;font-size:20px'>Function to read and organise the text data into a dictionary</p>

In [3]:
def convertData(filepath):
    with open(filepath,'r') as file:
        tokens = [nltk.word_tokenize(line) for line in file.readlines()]
        train_data = {'data':[],
                     'target':[]}
        for  h ,token in enumerate(tokens):
            temp_x = []
            for i , word in enumerate(token):
                if word == ";":
                    continue
                elif i == len(token)-1:
                    train_data['target'].append(word)
                else:
                    temp_x.append(word)
            train_data['data'].append(' '.join(temp_x))
    
    return train_data

<p>Load data into necessary categories</p>

In [4]:
train_data = convertData('/content/drive/MyDrive/Colab Notebooks/Data science/ML/Emotions/Data/train.txt')
val_data = convertData('/content/drive/MyDrive/Colab Notebooks/Data science/ML/Emotions/Data/val.txt')
test_data = convertData('/content/drive/MyDrive/Colab Notebooks/Data science/ML/Emotions/Data/test.txt')

<p>convert categorial target variables into nemerical</p>

In [5]:
enc = preprocessing.LabelEncoder()
train_data['target'] = enc.fit_transform(train_data['target'])
val_data['target'] = enc.fit_transform(val_data['target'])
test_data['target'] = enc.fit_transform(test_data['target'])
train_data['target']

array([4, 4, 0, ..., 2, 0, 4])

<h2>PART 1 : WITHOUT USING NATURAL LANGUAGE PROCESSING TECHNIQUES</h2>

<p>Function to preprocess the data for model building and evaluation</p>

In [6]:
vectorizer = CountVectorizer()
vectorizer.fit(train_data['data'])
transformer = TfidfTransformer()
def dataPreparation(data_dict):
    count_data = vectorizer.transform(data_dict['data'])
    transformer.fit(count_data)
    tfidf_data = transformer.transform(count_data)
    return tfidf_data

<p>Prepare data for model fitting ,evaluation and testing</p>

In [7]:
x_train = dataPreparation(train_data)
x_val = dataPreparation(val_data)
x_test = dataPreparation(test_data)
print(x_train.shape)
print(x_test.shape)

(16000, 15184)
(2000, 15184)


<p>Build and Fit model<p>

In [8]:
NBModel = MultinomialNB()
NBModel.fit(x_train,train_data['target'])

MultinomialNB()

<p>Evaluate model perfomance on Evaluation data</p>

In [9]:
eval_ = NBModel.predict(x_val)
print('Accuracy :{:.2f}%'.format(accuracy_score(val_data['target'],eval_) * 100))

Accuracy :63.90%


In [10]:
eval_ = NBModel.predict(x_test)
print('Accuracy :{:.2f}%'.format(accuracy_score(test_data['target'],eval_) * 100))

Accuracy :64.70%


<p>Function to build and evaluate perfomance<p>

In [11]:
def emotionModel(x_train,y_train , x_test , y_test):
    NBmodel = MultinomialNB()
    NBmodel.fit(x_train,y_train)
    
    y_pred = NBmodel.predict(x_test)
    print('Accuracy : {:.2f}%'.format(accuracy_score(y_test,y_pred) * 100))

<p>Use the function to for evaluation</p>

In [12]:
emotionModel(x_train,train_data['target'],x_val,val_data['target'])
emotionModel(x_train,train_data['target'],x_test,test_data['target'])

Accuracy : 63.90%
Accuracy : 64.70%


<h2>PART 2 :USING NATURAL LANGUAGE PROCESSING TECHNIQUES TECHNIQUES</h2>

<p>Function to process data using nltk techniques</p>

In [13]:
def dataProcessing(raw_data):
    stop_words = set(stopwords.words('english'))
    clean_data = []
    for row in raw_data:
        tokens = nltk.word_tokenize(row)
        prt = nltk.PorterStemmer()
        stems = []
        for token in tokens:
            if token not in stop_words:
                if token.isalnum() == True:
                    stems.append(token)
        sent = ' '.join(stems)
        clean_data.append(sent)
    return clean_data
    

<p>Process Data using function</p>

In [None]:
train_data['data'] = dataProcessing(train_data['data'])
test_data['data'] = dataProcessing(test_data['data'])


<p>Prepare data for model training and testing</p>

In [None]:
vectorizer = CountVectorizer()
vectorizer.fit(train_data['data'])


x_train = dataPreparation(train_data)
x_val = dataPreparation(val_data)
x_test = dataPreparation(test_data)



<p>Use new data and test perfomance</p>

In [None]:
emotionModel(x_train,train_data['target'],x_val,val_data['target'])
emotionModel(x_train,train_data['target'],x_test,test_data['target'])

Accuracy : 67.85%
Accuracy : 68.75%
