In [11]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression


In [20]:
# One tab-separated file per review source; each is read as two columns,
# 'sentence' and 'label'.
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    # Use a distinct name for the per-file frame so it does not shadow the
    # combined `df` built below.
    source_df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    source_df['source'] = source  # remember which file each row came from
    df_list.append(source_df)

# ignore_index=True gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own 0..k-1 row labels.
df = pd.concat(df_list, ignore_index=True)
df.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [12]:
# Two-sentence toy corpus used to demonstrate the bag-of-words vectorizer.
vector = CountVectorizer()
sen = [
    'John likes ice cream',
    'John hates chocolate.',
]

In [13]:
# Learn the vocabulary from the toy sentences.
vector.fit(sen)

CountVectorizer()

In [14]:
# Inspect the learned mapping from each token to its column index.
vector.vocabulary_

{'john': 4, 'likes': 5, 'ice': 3, 'cream': 1, 'hates': 2, 'chocolate': 0}

In [16]:
# Encode each sentence as a row of per-token counts; the dense array is only
# for display.
vector.transform(sen).toarray()

array([[0, 1, 0, 1, 1, 1],
       [1, 0, 1, 0, 1, 0]])

In [23]:
# Rebind `sen` from the toy list to the 'sentence' column of the combined frame.
sen = df['sentence'].values
sen


array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.', ...,
       'In a word, it is embarrassing.  ', 'Exceptionally bad!  ',
       "All in all its an insult to one's intelligence and a huge waste of money.  "],
      dtype=object)

In [31]:
# Labels come from the same frame as `sen`, so positions line up one-to-one.
label = df['label'].values
label

array([1, 0, 0, ..., 0, 0, 0])

In [32]:
# Number of sentences taken from the combined frame.
len(sen)

2748

In [38]:
# Hold out 30% of the sentences for evaluation. The fixed seed makes the split,
# and every score derived from it, repeatable across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    sen, label, test_size=0.3, random_state=1000)

# Only the last expression of a cell is displayed, so show the held-out split.
x_test


array(["Point your finger at any item on the menu, order it and you won't be disappointed.",
       'Buttons are too small.',
       "I'm still trying to get over how bad it was.  ",
       'I was sitting in my vehicle, with the cradle on my belt, and the headset lost signal.',
       'Appetite instantly gone.',
       'You learn a lot about the real inside emotions of people in this movie, and a lot about the movie business itself.  ',
       "Lot of holes in the plot: there's nothing about how he became the emperor; nothing about where he spend 20 years between his childhood and mature age.  ",
       'The deal included 5 tastings and 2 drinks, and Jeff went above and beyond what we expected.',
       'I really enjoyed Crema Café before they expanded; I even told friends they had the BEST breakfast.',
       'Motorola finally got the voice quality of a bluetooth headset right.',
       'Who in their right mind is gonna buy this battery?.',
       'I agree with Jessica, this movie is 

In [40]:
# Refit the vectorizer on the training sentences only (fit returns the
# vectorizer itself), then show the resulting token -> index vocabulary.
vector.fit(x_train).vocabulary_

{'the': 3536,
 'plot': 2646,
 'such': 3421,
 'as': 216,
 'it': 1896,
 'is': 1891,
 'so': 3258,
 'derivative': 939,
 'and': 160,
 'predictable': 2695,
 'that': 3534,
 'ending': 1189,
 'like': 2047,
 'mercy': 2220,
 'killing': 1958,
 'they': 3549,
 'work': 3974,
 'about': 50,
 'weeks': 3905,
 'then': 3545,
 'break': 438,
 'couldn': 803,
 'take': 3479,
 'them': 3542,
 'seriously': 3122,
 'great': 1584,
 'food': 1423,
 'service': 3129,
 'in': 1813,
 'clean': 649,
 'friendly': 1462,
 'setting': 3133,
 'accents': 57,
 'are': 203,
 'absolutely': 54,
 'abysmal': 56,
 'really': 2842,
 'this': 3558,
 'product': 2735,
 'over': 2483,
 'motorola': 2295,
 'because': 325,
 'allot': 132,
 'clearer': 651,
 'on': 2433,
 'ear': 1119,
 'piece': 2606,
 'mic': 2233,
 'waitress': 3850,
 'was': 3871,
 'happy': 1643,
 'to': 3598,
 'accomodate': 70,
 'for': 1427,
 'vegan': 3797,
 'veggie': 3801,
 'options': 2450,
 'big': 361,
 'fan': 1317,
 'of': 2416,
 'series': 3120,
 'mostly': 2290,
 'due': 1102,
 'anne': 17

In [57]:
# Encode both splits with the vocabulary already fitted on the vectorizer.
X_train = vector.transform(x_train)
X_test = vector.transform(x_test)

# A bare expression in the middle of a cell is discarded, so only the final
# one is kept for display.
X_test

<825x4027 sparse matrix of type '<class 'numpy.int64'>'
	with 8532 stored elements in Compressed Sparse Row format>

In [69]:
# Train logistic regression on the count features (fit returns the estimator),
# then report its score on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.8278787878787879

In [72]:
# Vectorize the held-out sentences and predict a label for each one.
# NOTE: despite its name, `predicted` holds the input features; the name is
# kept so any cell that already refers to it keeps working.
predicted = vector.transform(x_test)

# predict() takes the sparse matrix directly, so no dense conversion is needed.
model.predict(predicted)


array([1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,

In [73]:
# Fit and score a separate bag-of-words + logistic-regression model per source.
# NOTE: every pass rebinds the notebook-level names X_train, X_test, y_train
# and y_test.
for source in df['source'].unique():
    df_source = df.loc[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    # Fixed seed keeps the 75/25 split repeatable.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # The vocabulary is learned from this source's training sentences only.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
