# Machine Learning Intro
* text_classifier

In [47]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Reading data from sources

In [48]:
# Map each review source to the tab-separated file holding its labelled sentences.
file_path = {
    "yelp": "yelp_labelled.txt",
    "amazon": "amazon_cells_labelled.txt",
    "imdb": "imdb_labelled.txt",
}

data_list = []

for source, path in file_path.items():
    # names= is given without header=, so the first line of each file is read
    # as data rather than as a header row.
    # NOTE(review): read_csv's default quoting treats '"' as a quote character;
    # if any raw line contains an unbalanced quote, rows can be merged silently.
    # Verify the per-source row counts and consider quoting=csv.QUOTE_NONE.
    source_frame = pd.read_csv(path, names=["sentences", "label"], sep="\t")
    source_frame["source"] = source  # tag every row with the source it came from
    data_list.append(source_frame)

# ignore_index=True gives the combined frame a single unique 0..n-1 index;
# without it each source keeps its own 0-based labels, so a label such as 0
# would refer to one row per source.
data_frame = pd.concat(data_list, ignore_index=True)
data_frame.head()


Unnamed: 0,sentences,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


## Text has to be "vectorized" (turned into numeric count vectors) before a model can work with it efficiently


In [49]:
# Two toy sentences used to demonstrate how the vectorizer builds a vocabulary.
sentences = [
    'Bashar likes food',
    'Bashar hates chocolate.',
]


In [50]:
# Learn the vocabulary of the toy sentences. lowercase=False keeps the original
# casing of each token; min_df=1 keeps every term that occurs in at least one
# sentence. fit() returns the fitted vectorizer itself, so it can be chained.
vectorizer = CountVectorizer(lowercase=False, min_df=1).fit(sentences)

# Mapping of each token to its column index in the count matrix.
vectorizer.vocabulary_


{'Bashar': 0, 'likes': 4, 'food': 2, 'hates': 3, 'chocolate': 1}

In [51]:
# Encode the sentences as a sparse count matrix: one row per sentence,
# one column per vocabulary term.
vectorizer.transform(sentences)


<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [52]:
# Convert the sparse matrix to a dense array so the individual counts are visible.
vectorizer.transform(sentences).toarray()


array([[1, 0, 1, 0, 1],
       [1, 1, 0, 1, 0]])

In [53]:
def report(vectorizer, sentences):
    """Print, for every sentence, which vocabulary terms it contains.

    The vectorizer is fitted on ``sentences`` (so the object passed in is
    modified), and the sentences are then transformed into a dense count
    array. For each sentence the columns are walked in index order and the
    term of every column is printed as ``found`` (non-zero count) or
    ``not found`` (zero count).

    Parameters
    ----------
    vectorizer
        Object exposing ``fit(sentences)``, ``transform(sentences)`` (whose
        result provides ``toarray()``) and, once fitted, a ``vocabulary_``
        mapping of term -> column index, e.g. a ``CountVectorizer``.
    sentences
        Sequence of text documents. It is iterated more than once, so it
        must not be a one-shot iterator.

    Returns
    -------
    None
        The report is written to standard output.
    """
    vectorizer.fit(sentences)

    # Invert term -> column into column -> terms once, instead of scanning the
    # whole vocabulary again for every column of every sentence.
    terms_by_column = {}
    for term, column in vectorizer.vocabulary_.items():
        terms_by_column.setdefault(column, []).append(term)

    counts = vectorizer.transform(sentences).toarray()

    for sentence, row in zip(sentences, counts):
        print("\nSentence:", sentence, "\n")
        for column, count in enumerate(row):
            status = ": found" if count else ": not found"
            # Columns without a vocabulary entry print nothing.
            for term in terms_by_column.get(column, ()):
                print("\t", term, status)
                        

# Run the report on the toy sentences with a default CountVectorizer
# (default settings lowercase the tokens, as the printed terms show).
report(CountVectorizer(), sentences)


Sentence: Bashar likes food 

	 bashar : found
	 chocolate : not found
	 food : found
	 hates : not found
	 likes : found

Sentence: Bashar hates chocolate. 

	 bashar : found
	 chocolate : found
	 food : not found
	 hates : found
	 likes : not found


## Let's move on to one of the full data sets


In [54]:

# Restrict the combined frame to the Yelp rows only.
df_yelp = data_frame.loc[data_frame['source'] == 'yelp']

# Work with plain arrays: the raw text is the input, the label column the target.
sentences = df_yelp['sentences'].values
y = df_yelp['label'].values

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)


In [55]:
# Number of sentences that ended up in the training split.
len(sentences_train)


750

In [56]:
# Peek at the first four raw training sentences.
sentences_train[:4]


array(['The food was barely lukewarm, so it must have been sitting waiting for the server to bring it out to us.',
       'Sorry, I will not be getting food from here anytime soon :(',
       'Of all the dishes, the salmon was the best, but all were great.',
       'The fries were not hot, and neither was my burger.'], dtype=object)

In [57]:
# Labels for the same first four training rows.
y_train[:4]


array([0, 0, 1, 0])

In [58]:

# Learn the vocabulary from the training sentences only and encode them in the
# same step; the test sentences are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

X_train


<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [59]:
# Fit a logistic-regression model on the training count vectors
# (fit() returns the fitted estimator, so construction and fitting are chained).
classifier = LogisticRegression().fit(X_train, y_train)

# Evaluate on the held-out test split.
score = classifier.score(X_test, y_test)
print("Accuracy:", score)


Accuracy: 0.796


## Let's do all 3 data sets

In [63]:

# Repeat the split / vectorize / fit / score steps once per review source.
for source in data_frame['source'].unique():
    df_source = data_frame.loc[data_frame['source'] == source]
    sentences = df_source['sentences'].values
    y = df_source['label'].values

    # Same 75/25 split and seed for every source.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # The vocabulary is learned from the training sentences only.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))


Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
