In [150]:
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [151]:
def readCSV(fileName: str):
    """Load the training CSV and split it into features and labels.

    Reads the CSV at ``fileName`` into a DataFrame and returns its
    ``transcription`` column as the dataset and its ``action`` column as the
    labels.

    Args:
        fileName (str): Path to the .csv file.

    Returns:
        A ``(dataset, labels)`` tuple of pandas Series: the ``transcription``
        column and the ``action`` column, in file order.

    Raises:
        OSError: The file does not exist or cannot be opened
            (``FileNotFoundError`` is a subclass).
        pandas.errors.EmptyDataError / pandas.errors.ParserError: The file
            is empty or is not valid CSV.
        KeyError: The file lacks a ``transcription`` or ``action`` column.
    """
    try:
        df = pd.read_csv(fileName)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Keep the short notebook-friendly message, but re-raise so the caller
        # sees the real cause instead of failing later while unpacking None.
        print("Open file error")
        raise

    required_columns = ('transcription', 'action')
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(
            "Missing required column(s) {} in {!r}".format(missing, fileName)
        )

    dataset = df['transcription']
    labels = df['action']
    return dataset, labels

In [152]:
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphabetic tokens.

    Adapted from
    https://www.kaggle.com/toygarr/mixup-text-augmentation-in-emotion-detection-task

    Steps, in order:
      1. Remove HTML tags and character entities.
      2. Replace @mentions and URLs with a space. This must happen before
         punctuation is stripped; once '@' and '://' are gone the patterns
         can no longer match and the mention/URL text would survive as
         garbage tokens.
      3. Delete ASCII digits and punctuation.
      4. Replace any remaining character that is not an ASCII letter, digit,
         space or tab with a space.
      5. Collapse runs of whitespace and lowercase the result.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text; an empty string if nothing survives.
    """
    html_pattern = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    # Raw strings avoid the invalid-escape warnings that "\w" / "\S" raise in
    # ordinary string literals on recent Python versions.
    mention_or_url_pattern = re.compile(r"(@[A-Za-z0-9]+)|(\w+://\S+)")
    leftover_pattern = re.compile(r"[^0-9A-Za-z \t]")
    strip_table = str.maketrans('', '', string.digits + string.punctuation)

    text = html_pattern.sub('', text)
    text = mention_or_url_pattern.sub(' ', text)
    text = text.translate(strip_table)
    text = leftover_pattern.sub(' ', text)
    return ' '.join(text.split()).lower()

In [155]:
# Resolve the training CSV relative to the current working directory.
dirPath = os.path.abspath('.')
print(dirPath)
# os.path.join uses the host OS's separator instead of hard-coding the
# Windows-only "\\", so the notebook also runs on Linux/macOS.
fileName = os.path.join(dirPath, "train_data.csv")
dataset, labels = readCSV(fileName)
X = dataset
# Encode the action labels as integer class ids for the classifier.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
dataset

c:\Users\cheng\Documents\Machine Learning\MLCodes\MLAssignments\Assignment03


0                        Turn on the kitchen lights
1                           Turn up the temperature
2        OK now switch the main language to Chinese
3                Turn down the bathroom temperature
4                               Change the language
                            ...                    
11561                               Kitchen heat up
11562                       Turn the temperature up
11563                                   Bring shoes
11564                                   Volume mute
11565                   Turn off the kitchen lights
Name: transcription, Length: 11566, dtype: object

In [169]:
# Learn the vocabulary (English stop words removed) from every transcription
# in X and build the document-term count matrix; the last line displays its
# (n_documents, n_terms) shape.
c_vectorizer = CountVectorizer(stop_words="english")
text_dt_matrix = c_vectorizer.fit_transform(X)
text_dt_matrix.shape

(11566, 65)

In [None]:
# Learning the vocabulary dictionary and return document-term matrix
# sen_dt_matrix = c_vectorizer.fit_transform(labels['sentiment'])
# sen_dt_matrix

In [159]:
# Mapping of each learned term to its column index in the document-term matrix.
dd = c_vectorizer.vocabulary_
dd

{'turn': 60,
 'kitchen': 22,
 'lights': 28,
 'temperature': 59,
 'ok': 40,
 'switch': 58,
 'main': 33,
 'language': 25,
 'chinese': 6,
 'bathroom': 2,
 'change': 5,
 'sound': 55,
 'decrease': 9,
 'heating': 18,
 'washroom': 64,
 'loud': 29,
 'bedroom': 3,
 'heat': 17,
 'increase': 20,
 'set': 50,
 'need': 38,
 'practice': 45,
 'english': 12,
 'hear': 16,
 'volume': 63,
 'lamp': 24,
 'bring': 4,
 'shoes': 52,
 'newspaper': 39,
 'fetch': 14,
 'socks': 53,
 'make': 34,
 'music': 36,
 'louder': 30,
 'phone': 43,
 'lower': 32,
 'korean': 23,
 'far': 13,
 'max': 35,
 'juice': 21,
 'device': 10,
 'video': 62,
 'low': 31,
 'play': 44,
 'audio': 1,
 'quieter': 47,
 'languages': 26,
 'quiet': 46,
 'resume': 49,
 'use': 61,
 'different': 11,
 'mute': 37,
 'hotter': 19,
 'pause': 42,
 'levels': 27,
 'german': 15,
 'start': 56,
 'settings': 51,
 'stop': 57,
 'softer': 54,
 'reduce': 48,
 'allow': 0,
 'couldn': 8,
 'cooler': 7,
 'open': 41}

In [160]:
# Learned terms ordered by their column index in the document-term matrix.
c_vectorizer.get_feature_names_out()

array(['allow', 'audio', 'bathroom', 'bedroom', 'bring', 'change',
       'chinese', 'cooler', 'couldn', 'decrease', 'device', 'different',
       'english', 'far', 'fetch', 'german', 'hear', 'heat', 'heating',
       'hotter', 'increase', 'juice', 'kitchen', 'korean', 'lamp',
       'language', 'languages', 'levels', 'lights', 'loud', 'louder',
       'low', 'lower', 'main', 'make', 'max', 'music', 'mute', 'need',
       'newspaper', 'ok', 'open', 'pause', 'phone', 'play', 'practice',
       'quiet', 'quieter', 'reduce', 'resume', 'set', 'settings', 'shoes',
       'socks', 'softer', 'sound', 'start', 'stop', 'switch',
       'temperature', 'turn', 'use', 'video', 'volume', 'washroom'],
      dtype=object)

In [161]:
# Hold out 25% of the samples for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=30)

In [162]:
# Preview the training transcriptions (pandas truncates the display).
X_train

10551    Turn up the temperature in the kitchen
6339                           Turn the lamp on
6068                                       Play
9280            Could you increase the heating?
2748                          Kitchen lights on
                          ...                  
4859                            Bedroom heat up
919                   Lights off in the kitchen
500                        Go get the newspaper
4517                Turn on the bathroom lights
5925                          Bedroom lights on
Name: transcription, Length: 8674, dtype: object

In [163]:
# Preview the integer-encoded training labels.
y_train

array([5, 0, 0, ..., 1, 0, 0])

In [177]:
# Build the text-classification pipeline: bag-of-words counts -> multinomial
# naive Bayes.
# The pipeline gets its own unfitted CountVectorizer. Pipeline.fit fits its
# steps in place, so passing the already-fitted `c_vectorizer` instance would
# refit that shared object on the training split and silently change the
# vocabulary shown by the exploratory cells.
mnb_model = MultinomialNB()
v_mnb_model = Pipeline(steps=[('vectorizer', CountVectorizer(stop_words="english")), ('classifier', mnb_model)])
v_mnb_model

Pipeline(steps=[('vectorizer', CountVectorizer(stop_words='english')),
                ('classifier', MultinomialNB())])

In [165]:
# Fit the vectorizer and classifier on the training split, then report the
# mean accuracy on the held-out split.
v_mnb_model.fit(X_train, y_train)
v_mnb_model.score(X_test, y_test)

0.7576071922544951

In [175]:
# Sanity-check the fitted pipeline on a single free-form utterance. The result
# is an encoded class id; label_encoder.inverse_transform maps it back to the
# action name.
# NOTE(review): 'cj', 'ac' and 'os' do not appear in the vocabulary listed
# earlier in this notebook, and 'get' is presumably dropped as an English stop
# word, so this prediction likely reflects the class priors only — confirm.
text = ["Cj get ac os"]
predictions = v_mnb_model.predict(text)
predictions

array([5])