In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [19]:
# Map each review source to its tab-separated file; every row is read as
# "<sentence>\t<label>" (see the `names`/`sep` arguments below).
filepath_dict = {'yelp':   'data/yelp_labelled.txt',
                 'amazon': 'data/amazon_cells_labelled.txt',
                 'imdb':   'data/imdb_labelled.txt'}

# Load each file into its own frame and tag its rows with the source name so
# the combined frame can be filtered per source later.
# NOTE(review): read_csv's default quoting treats '"' specially; if the raw
# sentences contain unbalanced double quotes, rows may be merged -- confirm
# the per-source row counts against the files.
df_list = []
for source, filepath in filepath_dict.items():
    source_df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    source_df['source'] = source  # Add another column filled with the source name
    df_list.append(source_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file's own 0-based labels are repeated, so label-based lookups such as
# df.loc[0] return one row per file instead of a single row.
df = pd.concat(df_list, ignore_index=True)

In [23]:
# Build a case-sensitive (lowercase=False) bag-of-words vocabulary over every
# sentence in the combined frame.
# min_df=1 keeps every token, exactly as min_df=0 did: any token that makes it
# into the vocabulary occurs in at least one document. An integer min_df below
# 1 is rejected by recent scikit-learn parameter validation, so 1 is used.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(df.sentence)
# Mapping of token -> column index in the document-term matrix.
vectorizer.vocabulary_

{'Wow': 1532,
 'Loved': 853,
 'this': 5371,
 'place': 4309,
 'Crust': 372,
 'is': 3569,
 'not': 4058,
 'good': 3204,
 'Not': 976,
 'tasty': 5305,
 'and': 1671,
 'the': 5344,
 'texture': 5339,
 'was': 5725,
 'just': 3603,
 'nasty': 4004,
 'Stopped': 1291,
 'by': 2020,
 'during': 2721,
 'late': 3655,
 'May': 900,
 'bank': 1811,
 'holiday': 3374,
 'off': 4090,
 'Rick': 1146,
 'Steve': 1287,
 'recommendation': 4579,
 'loved': 3772,
 'it': 3573,
 'The': 1347,
 'selection': 4843,
 'on': 4108,
 'menu': 3881,
 'great': 3227,
 'so': 5027,
 'were': 5765,
 'prices': 4420,
 'Now': 979,
 'am': 1654,
 'getting': 3170,
 'angry': 1675,
 'want': 5711,
 'my': 3994,
 'damn': 2457,
 'pho': 4282,
 'Honeslty': 684,
 'didn': 2571,
 'taste': 5301,
 'THAT': 1317,
 'fresh': 3102,
 'potatoes': 4385,
 'like': 3712,
 'rubber': 4733,
 'you': 5873,
 'could': 2367,
 'tell': 5323,
 'they': 5361,
 'had': 3262,
 'been': 1846,
 'made': 3790,
 'up': 5603,
 'ahead': 1626,
 'of': 4089,
 'time': 5400,
 'being': 1858,
 'kept'

In [27]:
# Fit and score an independent bag-of-words classifier for each review source.
for source in df['source'].unique():
    # Restrict to this source's rows and pull out the raw text and labels.
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    # Fixed random_state keeps the 75/25 train/test split reproducible.
    split = train_test_split(sentences, y, test_size=0.25, random_state=1000)
    sentences_train, sentences_test, y_train, y_test = split

    # The vocabulary is learned from the training sentences only and then
    # applied to both splits, so test-only tokens are not counted.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train, X_test = (vectorizer.transform(part)
                       for part in (sentences_train, sentences_test))

    # Train on the count features and report held-out accuracy.
    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
